• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2014 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sub license, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
15  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
16  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
17  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
18  * USE OR OTHER DEALINGS IN THE SOFTWARE.
19  *
20  * The above copyright notice and this permission notice (including the
21  * next paragraph) shall be included in all copies or substantial portions
22  * of the Software.
23  *
24  */
25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
26 #include "ac_llvm_build.h"
27 
28 #include "ac_nir.h"
29 #include "ac_llvm_util.h"
30 #include "ac_shader_util.h"
31 #include "c11/threads.h"
32 #include "shader_enums.h"
33 #include "sid.h"
34 #include "util/bitscan.h"
35 #include "util/macros.h"
36 #include "util/u_atomic.h"
37 #include "util/u_math.h"
38 #include <llvm-c/Core.h>
39 #include <llvm/Config/llvm-config.h>
40 
41 #include <assert.h>
42 #include <stdio.h>
43 
/* Initial capacity of the control-flow stack (grown on demand). */
#define AC_LLVM_INITIAL_CF_DEPTH 4

/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
   /* Loop exit or next part of if/else/endif. */
   LLVMBasicBlockRef next_block;
   /* Block at the top of a loop; presumably the back-edge target for
    * endloop — only set for loop constructs (NOTE(review): confirm against
    * the flow-stack push/pop code outside this chunk). */
   LLVMBasicBlockRef loop_entry_block;
};
53 
/* Initialize module-independent parts of the context.
 *
 * NOTE(review): the historical comment said the caller is responsible for
 * initializing ctx::module and ctx::builder, but both are in fact created
 * below (from compiler->tm and float_mode).
 */
void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
                          enum amd_gfx_level gfx_level, enum radeon_family family,
                          bool has_3d_cube_border_color_mipmap,
                          enum ac_float_mode float_mode, unsigned wave_size,
                          unsigned ballot_mask_bits)
{
   ctx->context = LLVMContextCreate();
   #if LLVM_VERSION_MAJOR >= 15
   /* Keep typed pointers; this code has not been converted to LLVM's
    * opaque-pointer mode. */
   LLVMContextSetOpaquePointers(ctx->context, false);
   #endif

   /* Target and wave configuration. */
   ctx->gfx_level = gfx_level;
   ctx->family = family;
   ctx->has_3d_cube_border_color_mipmap = has_3d_cube_border_color_mipmap;
   ctx->wave_size = wave_size;
   ctx->ballot_mask_bits = ballot_mask_bits;
   ctx->float_mode = float_mode;
   ctx->module = ac_create_module(compiler->tm, ctx->context);
   ctx->builder = ac_create_builder(ctx->context, float_mode);

   /* Cache frequently used LLVM types. */
   ctx->voidt = LLVMVoidTypeInContext(ctx->context);
   ctx->i1 = LLVMInt1TypeInContext(ctx->context);
   ctx->i8 = LLVMInt8TypeInContext(ctx->context);
   ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
   ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
   ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
   ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
   ctx->intptr = ctx->i32;
   ctx->f16 = LLVMHalfTypeInContext(ctx->context);
   ctx->f32 = LLVMFloatTypeInContext(ctx->context);
   ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
   ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
   ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
   ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
   ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
   ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
   ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
   ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
   ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
   ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
   ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
   ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
   /* Integer types wide enough to hold one bit per lane. */
   ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
   ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);

   /* Cache frequently used constants. */
   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
   ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
   ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
   ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
   ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
   ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
   ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
   ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
   ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
   ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
   ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
   ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
   ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);

   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);

   /* Metadata kind IDs used when annotating loads/ranges later. */
   ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);

   ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);

   ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);

   ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
   /* Control-flow stack; freed in ac_llvm_context_dispose(). */
   ctx->flow = calloc(1, sizeof(*ctx->flow));
}
132 
/* Release the control-flow stack allocated in ac_llvm_context_init().
 * This only frees ctx->flow; the LLVM context/module/builder are not
 * released here.
 */
void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
{
   free(ctx->flow->stack);
   free(ctx->flow);
   ctx->flow = NULL;
}
139 
/* Return the number of components of a value: the vector size for vectors,
 * 1 for scalars.
 */
int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);

   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      return LLVMGetVectorSize(type);

   return 1;
}
147 
/* Extract component "index" of a value; scalars are treated as
 * single-component vectors (only index 0 is valid).
 */
LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   /* Scalars have exactly one component; extracting it is the identity. */
   if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
      assert(index == 0);
      return value;
   }

   LLVMValueRef idx = LLVMConstInt(ac->i32, index, false);
   return LLVMBuildExtractElement(ac->builder, value, idx, "");
}
157 
/* Return the bit width of a scalar type or of a vector's element type.
 * LDS pointers count as 32 bits; other pointer address spaces fall through
 * to the unreachable below.
 */
int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   LLVMTypeRef scalar = type;

   /* For vectors, inspect the element type. */
   if (LLVMGetTypeKind(scalar) == LLVMVectorTypeKind)
      scalar = LLVMGetElementType(scalar);

   switch (LLVMGetTypeKind(scalar)) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(scalar);
   case LLVMPointerTypeKind:
      if (LLVMGetPointerAddressSpace(scalar) == AC_ADDR_SPACE_LDS)
         return 32;
      break;
   default:
      break;
   }

   if (scalar == ctx->f16)
      return 16;
   if (scalar == ctx->f32)
      return 32;
   if (scalar == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}
180 
/* Return the size of a type in bytes. Vectors and arrays are computed
 * recursively from their element type.
 */
unsigned ac_get_type_size(LLVMTypeRef type)
{
   switch (LLVMGetTypeKind(type)) {
   case LLVMHalfTypeKind:
      return 2;
   case LLVMFloatTypeKind:
      return 4;
   case LLVMDoubleTypeKind:
      return 8;
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(type) / 8;
   case LLVMPointerTypeKind:
      /* 32-bit const pointers are 4 bytes; all other pointers are 64-bit. */
      return LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT ? 4 : 8;
   case LLVMVectorTypeKind:
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   case LLVMArrayTypeKind:
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));
   default:
      assert(0);
      return 0;
   }
}
207 
/* Map a scalar type to the integer type of the same bit width. */
static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   /* Integer types map to themselves. */
   if (t == ctx->i1 || t == ctx->i8 || t == ctx->i16 || t == ctx->i32 || t == ctx->i64)
      return t;
   if (t == ctx->f16)
      return ctx->i16;
   if (t == ctx->f32)
      return ctx->i32;
   if (t == ctx->f64)
      return ctx->i64;
   unreachable("Unhandled integer size");
}
223 
/* Return the integer type with the same layout as t: element-wise for
 * vectors, the pointer-sized integer for pointers, otherwise the
 * same-width scalar integer.
 */
LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   LLVMTypeKind kind = LLVMGetTypeKind(t);

   if (kind == LLVMVectorTypeKind) {
      LLVMTypeRef int_elem = to_integer_type_scalar(ctx, LLVMGetElementType(t));
      return LLVMVectorType(int_elem, LLVMGetVectorSize(t));
   }

   if (kind == LLVMPointerTypeKind) {
      unsigned addr_space = LLVMGetPointerAddressSpace(t);
      if (addr_space == AC_ADDR_SPACE_GLOBAL)
         return ctx->i64;
      if (addr_space == AC_ADDR_SPACE_CONST_32BIT || addr_space == AC_ADDR_SPACE_LDS)
         return ctx->i32;
      unreachable("unhandled address space");
   }

   return to_integer_type_scalar(ctx, t);
}
243 
/* Reinterpret a value as the matching integer type: ptrtoint for pointers,
 * bitcast for everything else.
 */
LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef src_type = LLVMTypeOf(v);
   LLVMTypeRef int_type = ac_to_integer_type(ctx, src_type);

   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildPtrToInt(ctx->builder, v, int_type, "");

   return LLVMBuildBitCast(ctx->builder, v, int_type, "");
}
252 
/* Like ac_to_integer, but pointers pass through unchanged. */
LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   bool is_pointer = LLVMGetTypeKind(LLVMTypeOf(v)) == LLVMPointerTypeKind;
   return is_pointer ? v : ac_to_integer(ctx, v);
}
260 
/* Map a scalar type to the float type of the same bit width.
 * i8 has no float counterpart and is returned unchanged.
 */
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return ctx->i8;
   if (t == ctx->i16 || t == ctx->f16)
      return ctx->f16;
   if (t == ctx->i32 || t == ctx->f32)
      return ctx->f32;
   if (t == ctx->i64 || t == ctx->f64)
      return ctx->f64;
   unreachable("Unhandled float size");
}
274 
/* Return the float type with the same layout as t (element-wise for
 * vectors).
 */
LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) != LLVMVectorTypeKind)
      return to_float_type_scalar(ctx, t);

   return LLVMVectorType(to_float_type_scalar(ctx, LLVMGetElementType(t)),
                         LLVMGetVectorSize(t));
}
283 
/* Bitcast a value to the matching float type. */
LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef float_type = ac_to_float_type(ctx, LLVMTypeOf(v));
   return LLVMBuildBitCast(ctx->builder, v, float_type, "");
}
289 
/* Emit a call to the named intrinsic, declaring it in the module on first
 * use. The function type is derived from return_type and the types of
 * params.
 *
 * attrib_mask is a mask of AC_FUNC_ATTR_* flags; unless AC_FUNC_ATTR_LEGACY
 * is set, the attributes are attached to the call site rather than to the
 * function declaration.
 */
LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
                                unsigned attrib_mask)
{
   LLVMValueRef call;
   bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);

   LLVMTypeRef param_types[32];
   assert(param_count <= 32);
   for (unsigned i = 0; i < param_count; ++i) {
      assert(params[i]);
      param_types[i] = LLVMTypeOf(params[i]);
   }

   LLVMTypeRef function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
   LLVMValueRef function = LLVMGetNamedFunction(ctx->module, name);

   if (!function) {
      /* First use of this intrinsic: declare it with external linkage. */
      function = LLVMAddFunction(ctx->module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      LLVMSetLinkage(function, LLVMExternalLinkage);

      if (!set_callsite_attrs)
         ac_add_func_attributes(ctx->context, function, attrib_mask);
   }

   call = LLVMBuildCall2(ctx->builder, function_type, function, params, param_count, "");
   if (set_callsite_attrs)
      ac_add_func_attributes(ctx->context, call, attrib_mask);
   return call;
}
322 
323 /**
324  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
325  * intrinsic names).
326  */
ac_build_type_name_for_intr(LLVMTypeRef type,char * buf,unsigned bufsize)327 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
328 {
329    LLVMTypeRef elem_type = type;
330 
331    if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {
332       unsigned count = LLVMCountStructElementTypes(type);
333       int ret = snprintf(buf, bufsize, "sl_");
334       buf += ret;
335       bufsize -= ret;
336 
337       LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));
338       LLVMGetStructElementTypes(type, elems);
339 
340       for (unsigned i = 0; i < count; i++) {
341          ac_build_type_name_for_intr(elems[i], buf, bufsize);
342          ret = strlen(buf);
343          buf += ret;
344          bufsize -= ret;
345       }
346 
347       snprintf(buf, bufsize, "s");
348       return;
349    }
350 
351    assert(bufsize >= 8);
352    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
353       int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
354       if (ret < 0) {
355          char *type_name = LLVMPrintTypeToString(type);
356          fprintf(stderr, "Error building type name for: %s\n", type_name);
357          LLVMDisposeMessage(type_name);
358          return;
359       }
360       elem_type = LLVMGetElementType(type);
361       buf += ret;
362       bufsize -= ret;
363    }
364    switch (LLVMGetTypeKind(elem_type)) {
365    default:
366       break;
367    case LLVMIntegerTypeKind:
368       snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
369       break;
370    case LLVMHalfTypeKind:
371       snprintf(buf, bufsize, "f16");
372       break;
373    case LLVMFloatTypeKind:
374       snprintf(buf, bufsize, "f32");
375       break;
376    case LLVMDoubleTypeKind:
377       snprintf(buf, bufsize, "f64");
378       break;
379    }
380 }
381 
382 /**
383  * Helper function that builds an LLVM IR PHI node and immediately adds
384  * incoming edges.
385  */
ac_build_phi(struct ac_llvm_context * ctx,LLVMTypeRef type,unsigned count_incoming,LLVMValueRef * values,LLVMBasicBlockRef * blocks)386 LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
387                           LLVMValueRef *values, LLVMBasicBlockRef *blocks)
388 {
389    LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
390    LLVMAddIncoming(phi, values, blocks, count_incoming);
391    return phi;
392 }
393 
/* Emit a workgroup barrier (s_barrier), except where it is known to be
 * unnecessary.
 */
void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage)
{
   /* GFX6 only: s_barrier isn't needed in TCS because an entire patch always
    * fits into a single wave due to a bug workaround disallowing multi-wave
    * HS workgroups.
    */
   bool skip = ctx->gfx_level == GFX6 && stage == MESA_SHADER_TESS_CTRL;

   if (!skip) {
      ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0,
                         AC_FUNC_ATTR_CONVERGENT);
   }
}
404 
/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 */
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)
{
   /* Monotonic counter making each emitted asm string unique — presumably so
    * LLVM cannot merge/CSE separate barriers; NOTE(review): confirm. */
   static int counter = 0;

   LLVMBuilderRef builder = ctx->builder;
   char code[16];
   /* "=s,0"/"=v,0": output tied to input 0, constrained to an SGPR or VGPR. */
   const char *constraint = sgpr ? "=s,0" : "=v,0";

   snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));

   if (!pgpr) {
      /* No value to route through: emit a void side-effecting asm call. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
      LLVMBuildCall2(builder, ftype, inlineasm, NULL, 0, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i32) {
      /* Simple version for i32 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i16) {
      /* Simple version for i16 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i16, &ctx->i16, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else if (LLVMGetTypeKind(LLVMTypeOf(*pgpr)) == LLVMPointerTypeKind) {
      /* Pointers are routed through the asm as-is. */
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else {
      /* General case: zero-extend sub-32-bit values to i32, view the value
       * as a vector of i32, route only element 0 through the asm, then put
       * it back and restore the original type.
       */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      unsigned bitsize = ac_get_elem_bits(ctx, type);
      LLVMValueRef vgpr = *pgpr;
      LLVMTypeRef vgpr_type;
      unsigned vgpr_size;
      LLVMValueRef vgpr0;

      if (bitsize < 32)
         vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");

      vgpr_type = LLVMTypeOf(vgpr);
      vgpr_size = ac_get_type_size(vgpr_type);

      /* The bitcast below needs a whole number of i32 lanes. */
      assert(vgpr_size % 4 == 0);

      vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
      vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
      vgpr0 = LLVMBuildCall2(builder, ftype, inlineasm, &vgpr0, 1, "");
      vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
      vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

      if (bitsize < 32)
         vgpr = LLVMBuildTrunc(builder, vgpr, type, "");

      *pgpr = vgpr;
   }
}
476 
ac_build_shader_clock(struct ac_llvm_context * ctx,nir_scope scope)477 LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)
478 {
479    const char *subgroup = "llvm.readcyclecounter";
480    const char *name = scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;
481 
482    LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
483    return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
484 }
485 
/* Return a wave-sized bitmask with one bit per lane, set where "value" is
 * non-zero in that lane (implemented as icmp-ne against 0).
 */
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   if (LLVMTypeOf(value) == ctx->i1)
      value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");

   const char *name =
      ctx->wave_size == 64 ? "llvm.amdgcn.icmp.i64.i32" : "llvm.amdgcn.icmp.i32.i32";

   LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   /* We currently have no other way to prevent LLVM from lifting the icmp
    * calls to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &args[0], false);

   args[0] = ac_to_integer(ctx, args[0]);

   return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3,
                             AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE |
                                AC_FUNC_ATTR_CONVERGENT);
}
511 
/* Convert a per-lane i1 into a wave-sized SGPR bitmask via icmp-ne against
 * false.
 */
LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name =
      ctx->wave_size == 64 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i32.i1";

   LLVMValueRef args[3] = {
      value,
      ctx->i1false,
      LLVMConstInt(ctx->i32, LLVMIntNE, 0),
   };

   return ac_build_intrinsic(ctx, name, ctx->iN_wavemask, args, 3,
                             AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE |
                                AC_FUNC_ATTR_CONVERGENT);
}
531 
/* True iff "value" is true in every active lane: the ballot of the value
 * must equal the ballot of all active lanes.
 */
LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef voted = ac_build_ballot(ctx, value);
   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, voted, active, "");
}
538 
/* True iff "value" is true in at least one active lane: the ballot mask is
 * non-zero.
 */
LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef voted = ac_build_ballot(ctx, value);
   LLVMValueRef zero = LLVMConstInt(ctx->iN_wavemask, 0, 0);
   return LLVMBuildICmp(ctx->builder, LLVMIntNE, voted, zero, "");
}
545 
/* True iff all active lanes agree on "value": either every active lane
 * voted true, or none did.
 */
LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef voted = ac_build_ballot(ctx, value);
   LLVMValueRef zero = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ, voted, active, "");
   LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ, voted, zero, "");
   return LLVMBuildOr(ctx->builder, all, none, "");
}
556 
/* Gather values[component .. component + value_count) into a vector.
 * A single value is returned as-is (scalar).
 */
LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   if (!value_count)
      unreachable("value_count is 0");
   if (value_count == 1)
      return values[component];

   LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[component]), value_count);
   LLVMValueRef vec = LLVMGetUndef(vec_type);

   for (unsigned i = 0; i < value_count; i++) {
      LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
      vec = LLVMBuildInsertElement(ctx->builder, vec, values[component + i], index, "");
   }
   return vec;
}
577 
/* Gather value_count values (strided by value_stride) into a vector.
 * A single value stays scalar unless always_vector is set.
 */
LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride,
                                             bool always_vector)
{
   if (!value_count)
      unreachable("value_count is 0");
   if (value_count == 1 && !always_vector)
      return values[0];

   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count);
   LLVMValueRef vec = LLVMGetUndef(vec_type);

   for (unsigned i = 0; i < value_count; i++) {
      LLVMValueRef elem = values[i * value_stride];
      LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
      vec = LLVMBuildInsertElement(builder, vec, elem, index, "");
   }
   return vec;
}
601 
/* Gather contiguous values into a vector (stride 1; a single value stays
 * scalar).
 */
LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   return ac_build_gather_values_extended(ctx, values, value_count, 1, false);
}
607 
/* Concatenate the components of a and b into one vector (a's components
 * first).
 */
LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   unsigned num_a = ac_get_llvm_num_components(a);
   unsigned num_b = ac_get_llvm_num_components(b);
   unsigned total = num_a + num_b;
   LLVMValueRef *elems = alloca(total * sizeof(LLVMValueRef));

   for (unsigned i = 0; i < total; i++) {
      elems[i] = i < num_a ? ac_llvm_extract_elem(ctx, a, i)
                           : ac_llvm_extract_elem(ctx, b, i - num_a);
   }

   return ac_build_gather_values(ctx, elems, total);
}
621 
622 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
623  * channels with undef. Extract at most src_channels components from the input.
624  */
ac_build_expand(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned src_channels,unsigned dst_channels)625 LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
626                              unsigned src_channels, unsigned dst_channels)
627 {
628    LLVMTypeRef elemtype;
629    LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));
630 
631    if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
632       unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
633 
634       if (src_channels == dst_channels && vec_size == dst_channels)
635          return value;
636 
637       src_channels = MIN2(src_channels, vec_size);
638 
639       for (unsigned i = 0; i < src_channels; i++)
640          chan[i] = ac_llvm_extract_elem(ctx, value, i);
641 
642       elemtype = LLVMGetElementType(LLVMTypeOf(value));
643    } else {
644       if (src_channels) {
645          assert(src_channels == 1);
646          chan[0] = value;
647       }
648       elemtype = LLVMTypeOf(value);
649    }
650 
651    for (unsigned i = src_channels; i < dst_channels; i++)
652       chan[i] = LLVMGetUndef(elemtype);
653 
654    return ac_build_gather_values(ctx, chan, dst_channels);
655 }
656 
657 /* Extract components [start, start + channels) from a vector.
658  */
ac_extract_components(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned start,unsigned channels)659 LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
660                                    unsigned channels)
661 {
662    LLVMValueRef *const chan = alloca(channels * sizeof(LLVMValueRef));
663 
664    for (unsigned i = 0; i < channels; i++)
665       chan[i] = ac_llvm_extract_elem(ctx, value, i + start);
666 
667    return ac_build_gather_values(ctx, chan, channels);
668 }
669 
670 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
671  * with undef. Extract at most num_channels components from the input.
672  */
ac_build_expand_to_vec4(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned num_channels)673 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
674                                      unsigned num_channels)
675 {
676    return ac_build_expand(ctx, value, num_channels, 4);
677 }
678 
/* Round to the nearest integer using llvm.rint, picking the overload that
 * matches the operand width.
 */
LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);
   const char *name;

   switch (ac_get_type_size(type)) {
   case 2:
      name = "llvm.rint.f16";
      break;
   case 4:
      name = "llvm.rint.f32";
      break;
   default:
      name = "llvm.rint.f64";
      break;
   }

   return ac_build_intrinsic(ctx, name, type, &value, 1, AC_FUNC_ATTR_READNONE);
}
693 
/* Build num / den, using num * rcp(den) where lowered precision is
 * acceptable.
 */
LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
{
   unsigned type_size = ac_get_type_size(LLVMTypeOf(den));

   /* For doubles, we need precise division to pass GLCTS. */
   if (type_size == 8 && ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL)
      return LLVMBuildFDiv(ctx->builder, num, den, "");

   const char *name;
   switch (type_size) {
   case 2:
      name = "llvm.amdgcn.rcp.f16";
      break;
   case 4:
      name = "llvm.amdgcn.rcp.f32";
      break;
   default:
      name = "llvm.amdgcn.rcp.f64";
      break;
   }

   /* num / den == num * (1 / den) */
   LLVMValueRef rcp =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, AC_FUNC_ATTR_READNONE);
   return LLVMBuildFMul(ctx->builder, num, rcp, "");
}
715 
/* See fast_idiv_by_const.h. */
/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
                                LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   /* Computes ((num >> pre_shift) * multiplier + increment) >> 32
    * >> post_shift, with the multiply-add widened to 64 bits so the high
    * half of the product is available.
    */
   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
   /* Keep the high 32 bits of the 64-bit result. */
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}
732 
/* See fast_idiv_by_const.h. */
/* If num != UINT_MAX, this more efficient version can be used. */
/* Set: increment = util_fast_udiv_info::increment; */
LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
                                    LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                    LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   /* Like ac_build_fast_udiv, but the increment is added in 32 bits before
    * the widening multiply; NUW encodes that (num >> pre_shift) + increment
    * cannot wrap (requires num != UINT_MAX).
    */
   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildNUWAdd(builder, num, increment, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   /* Keep the high 32 bits of the 64-bit product. */
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}
750 
751 /* See fast_idiv_by_const.h. */
752 /* Both operands must fit in 31 bits and the divisor must not be 1. */
ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context * ctx,LLVMValueRef num,LLVMValueRef multiplier,LLVMValueRef post_shift)753 LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
754                                               LLVMValueRef multiplier, LLVMValueRef post_shift)
755 {
756    LLVMBuilderRef builder = ctx->builder;
757 
758    num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
759                       LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
760    num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
761    num = LLVMBuildTrunc(builder, num, ctx->i32, "");
762    return LLVMBuildLShr(builder, num, post_shift, "");
763 }
764 
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
   LLVMValueRef stc[2]; /* sc (face s coordinate), tc (face t coordinate) */
   LLVMValueRef ma;     /* major axis * 2 (signed) */
   LLVMValueRef id;     /* cube face number as a float */
};
774 
/* Compute all four cube-map selection values (tc, sc, ma, id) for the
 * input direction vector `in` using the amdgcn cube intrinsics.
 */
static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3],
                                 struct cube_selection_coords *out)
{
   const struct {
      const char *name;
      LLVMValueRef *dst;
   } ops[] = {
      {"llvm.amdgcn.cubetc", &out->stc[1]},
      {"llvm.amdgcn.cubesc", &out->stc[0]},
      {"llvm.amdgcn.cubema", &out->ma},
      {"llvm.amdgcn.cubeid", &out->id},
   };

   for (unsigned i = 0; i < 4; i++)
      *ops[i].dst = ac_build_intrinsic(ctx, ops[i].name, ctx->f32, in, 3, AC_FUNC_ATTR_READNONE);
}
785 
/**
 * Build a manual selection sequence for cube face sc/tc coordinates and
 * major axis vector (multiplied by 2 for consistency) for the given
 * vec3 \p coords, for the face implied by \p selcoords.
 *
 * For the major axis, we always adjust the sign to be in the direction of
 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
 * the selcoords major axis.
 */
static void build_cube_select(struct ac_llvm_context *ctx,
                              const struct cube_selection_coords *selcoords,
                              const LLVMValueRef *coords, LLVMValueRef *out_st,
                              LLVMValueRef *out_ma)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
   LLVMValueRef is_ma_positive;
   LLVMValueRef sgn_ma;
   LLVMValueRef is_ma_z, is_not_ma_z;
   LLVMValueRef is_ma_y;
   LLVMValueRef is_ma_x;
   LLVMValueRef sgn;
   LLVMValueRef tmp;

   /* Sign of the major axis (+1.0 / -1.0). Unordered >= is used, so NaN
    * takes the positive branch. */
   is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), "");
   sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0),
                            LLVMConstReal(f32, -1.0), "");

   /* Decode the major axis from the face id: id >= 4 -> Z axis,
    * 2 <= id < 4 -> Y axis, otherwise X axis. */
   is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
   is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
   is_ma_y = LLVMBuildAnd(
      builder, is_not_ma_z,
      LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
   is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");

   /* Select sc */
   tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
   sgn = LLVMBuildSelect(
      builder, is_ma_y, LLVMConstReal(f32, 1.0),
      LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
   out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select tc */
   tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
   sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), "");
   out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select ma: pick the coordinate on the major axis, take its absolute
    * value and multiply by 2 to match the cubema convention. */
   tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
                         LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
   tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
   *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
}
839 
/**
 * Convert cube map coordinates into the (face s, face t, face id) layout
 * expected by the image sampling intrinsics, handling cube arrays,
 * explicit derivatives, and a GFX8 layer-clamp workaround.
 *
 * \param is_deriv   coords come with explicit derivatives in \p derivs_arg
 * \param is_array   cube array: coords_arg[3] is the array layer
 * \param is_lod     explicit-LOD fetch; skips the layer rounding/clamp
 * \param coords_arg in: direction vector (+ layer); out: face coords + id
 * \param derivs_arg in: 2x3 cube derivatives; out: 2x2 face derivatives
 */
void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,
                            LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)
{

   LLVMBuilderRef builder = ctx->builder;
   struct cube_selection_coords selcoords;
   LLVMValueRef coords[3];
   LLVMValueRef invma;

   if (is_array && !is_lod) {
      LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);

      /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
       *
       *    "For Array forms, the array layer used will be
       *
       *       max(0, min(d−1, floor(layer+0.5)))
       *
       *     where d is the depth of the texture array and layer
       *     comes from the component indicated in the tables below."
       *
       * This also works around an issue where the layer is taken from a
       * helper invocation which happens to fall on a different layer due
       * to extrapolation.
       *
       * GFX8 and earlier attempt to implement this in hardware by
       * clamping the value of coords[2] = (8 * layer) + face.
       * Unfortunately, this means that we end up with the wrong
       * face when clamping occurs.
       *
       * Clamp the layer earlier to work around the issue.
       */
      if (ctx->gfx_level <= GFX8) {
         LLVMValueRef ge0;
         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
      }

      coords_arg[3] = tmp;
   }

   build_cube_intrinsic(ctx, coords_arg, &selcoords);

   /* invma = 1 / |ma|; used to project sc/tc (and derivatives) onto the
    * selected face plane. */
   invma =
      ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
   invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);

   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");

   coords[2] = selcoords.id;

   if (is_deriv && derivs_arg) {
      LLVMValueRef derivs[4];
      int axis;

      /* Convert cube derivatives to 2D derivatives. */
      for (axis = 0; axis < 2; axis++) {
         LLVMValueRef deriv_st[2];
         LLVMValueRef deriv_ma;

         /* Transform the derivative alongside the texture
          * coordinate. Mathematically, the correct formula is
          * as follows. Assume we're projecting onto the +Z face
          * and denote by dx/dh the derivative of the (original)
          * X texture coordinate with respect to horizontal
          * window coordinates. The projection onto the +Z face
          * plane is:
          *
          *   f(x,z) = x/z
          *
          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
          *
          * This motivates the implementation below.
          *
          * Whether this actually gives the expected results for
          * apps that might feed in derivatives obtained via
          * finite differences is anyone's guess. The OpenGL spec
          * seems awfully quiet about how textureGrad for cube
          * maps should be handled.
          */
         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);

         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");

         for (int i = 0; i < 2; ++i)
            derivs[axis * 2 + i] =
               LLVMBuildFSub(builder, LLVMBuildFMul(builder, deriv_st[i], invma, ""),
                             LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
      }

      memcpy(derivs_arg, derivs, sizeof(derivs));
   }

   /* Shift the texture coordinate. This must be applied after the
    * derivative calculation.
    */
   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");

   if (is_array) {
      /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
      /* coords_arg.w component - array_index for cube arrays */
      coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
   }

   memcpy(coords_arg, coords, sizeof(coords));
}
948 
/**
 * Interpolate an FS input attribute channel at barycentric (i, j).
 *
 * On GFX11+ the per-vertex parameters are first loaded from LDS
 * (llvm.amdgcn.lds.param.load), then interpolated with the two-step
 * interp.inreg.p10 / interp.inreg.p2 intrinsics. On older chips the
 * legacy interp.p1 / interp.p2 intrinsics are used.
 *
 * \param llvm_chan   attribute channel
 * \param attr_number attribute index
 * \param params      interpolation state passed through to the intrinsics
 *                    (presumably the prim mask — confirm with callers)
 * \param i,j         barycentric coordinates
 */
LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   LLVMValueRef args[5];

   if (ctx->gfx_level >= GFX11) {
      LLVMValueRef p;
      LLVMValueRef p10;

      /* Load the packed parameter for this channel/attribute from LDS. */
      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);

      /* First interpolation step with the i coordinate. */
      args[0] = p;
      args[1] = i;
      args[2] = p;

      p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10",
                               ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);

      /* Second step with the j coordinate, accumulating into p10. */
      args[0] = p;
      args[1] = j;
      args[2] = p10;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2",
                                ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);

   } else {
      LLVMValueRef p1;

      /* Legacy path: hardware reads the parameters itself. */
      args[0] = i;
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = params;

      p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
                              ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);

      args[0] = p1;
      args[1] = j;
      args[2] = llvm_chan;
      args[3] = attr_number;
      args[4] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
                                ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
   }
}
1001 
/**
 * Interpolate a 16-bit FS input attribute channel at barycentric (i, j).
 *
 * Same structure as ac_build_fs_interp(), but using the .f16 intrinsic
 * variants, which take an extra i1 operand selecting which half of the
 * packed 32-bit attribute to interpolate (\p high_16bits). The final
 * result is f16.
 */
LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j, bool high_16bits)
{
   LLVMValueRef args[6];

   if (ctx->gfx_level >= GFX11) {
      LLVMValueRef p;
      LLVMValueRef p10;

      /* Load the packed parameter for this channel/attribute from LDS. */
      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);

      /* First interpolation step with i; i1 flag selects the 16-bit half. */
      args[0] = p;
      args[1] = i;
      args[2] = p;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;

      p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10.f16",
                               ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);

      /* Second step with j; result is f16. */
      args[0] = p;
      args[1] = j;
      args[2] = p10;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2.f16",
                                ctx->f16, args, 4, AC_FUNC_ATTR_READNONE);

   } else {
      LLVMValueRef p1;

      /* Legacy path with the .f16 interp intrinsics. */
      args[0] = i;
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = high_16bits ? ctx->i1true : ctx->i1false;
      args[4] = params;

      p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, args, 5,
                              AC_FUNC_ATTR_READNONE);

      args[0] = p1;
      args[1] = j;
      args[2] = llvm_chan;
      args[3] = attr_number;
      args[4] = high_16bits ? ctx->i1true : ctx->i1false;
      args[5] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, args, 6,
                                AC_FUNC_ATTR_READNONE);
   }
}
1058 
/**
 * Read an FS input attribute channel without interpolation (flat/constant
 * read of a single vertex's parameter, selected by \p parameter on the
 * legacy path).
 *
 * On GFX11+ the parameter is loaded from LDS, broadcast from lane 0 of
 * each quad via a (0,0,0,0) quad swizzle, and wrapped in WQM so helper
 * lanes get a valid value too.
 */
LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   LLVMValueRef args[4];

   if (ctx->gfx_level >= GFX11) {
      LLVMValueRef p;

      args[0] = llvm_chan;
      args[1] = attr_number;
      args[2] = params;

      p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                             ctx->f32, args, 3, AC_FUNC_ATTR_READNONE);
      /* Broadcast lane 0 within each quad, then enable whole-quad mode. */
      p = ac_build_quad_swizzle(ctx, p, 0, 0, 0 ,0);
      return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &p, 1, AC_FUNC_ATTR_READNONE);
   } else {
      args[0] = parameter;
      args[1] = llvm_chan;
      args[2] = attr_number;
      args[3] = params;

      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, args, 4,
                                AC_FUNC_ATTR_READNONE);
   }
}
1086 
/* Return &base_ptr[index] as a single-index GEP. */
LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                              LLVMValueRef index)
{
   LLVMValueRef gep_index[1] = {index};
   return LLVMBuildGEP(ctx->builder, base_ptr, gep_index, 1, "");
}
1092 
/* Return &base_ptr[0][index]: the leading 0 steps through the pointer,
 * the second index selects the element of the pointed-to aggregate. */
LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   LLVMValueRef gep_indices[] = {ctx->i32_0, index};
   return LLVMBuildGEP(ctx->builder, base_ptr, gep_indices, 2, "");
}
1101 
/* Advance a pointer by `index` elements and cast the result back to the
 * original pointer type. */
LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index)
{
   LLVMTypeRef orig_ptr_type = LLVMTypeOf(ptr);
   LLVMValueRef advanced = LLVMBuildGEP(ctx->builder, ptr, &index, 1, "");
   return LLVMBuildPointerCast(ctx->builder, advanced, orig_ptr_type, "");
}
1107 
/* Store `value` to base_ptr[index] (GEP0 addressing). */
void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMValueRef elem_ptr = ac_build_gep0(ctx, base_ptr, index);
   LLVMBuildStore(ctx->builder, value, elem_ptr);
}
1113 
/**
 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
 * It's equivalent to doing a load from &base_ptr[index].
 *
 * \param base_ptr  Where the array starts.
 * \param index     The element index into the array.
 * \param uniform   Whether the base_ptr and index can be assumed to be
 *                  dynamically uniform (i.e. load to an SGPR)
 * \param invariant Whether the load is invariant (no other opcodes affect it)
 * \param no_unsigned_wraparound
 *    For all possible re-associations and re-distributions of an expression
 *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
 *    without inbounds in base_ptr), this parameter is true if "addr + offset"
 *    does not result in an unsigned integer wraparound. This is used for
 *    optimal code generation of 32-bit pointer arithmetic.
 *
 *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
 *    integer wraparound can't be an imm offset in s_load_dword, because
 *    the instruction performs "addr + offset" in 64 bits.
 *
 *    Expected usage for bindless textures by chaining GEPs:
 *      // possible unsigned wraparound, don't use InBounds:
 *      ptr1 = LLVMBuildGEP(base_ptr, index);
 *      image = load(ptr1); // becomes "s_load ptr1, 0"
 *
 *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
 *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
 */
static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                         LLVMValueRef index, bool uniform, bool invariant,
                                         bool no_unsigned_wraparound)
{
   LLVMValueRef pointer, result;

   /* InBounds is only emitted for 32-bit const address space when the
    * caller guarantees no unsigned wraparound (see doc comment above). */
   if (no_unsigned_wraparound &&
       LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT)
      pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "");
   else
      pointer = LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");

   /* Metadata hints for the backend: uniform -> scalar (SGPR) load,
    * invariant -> load may be freely rematerialized/hoisted. */
   if (uniform)
      LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
   result = LLVMBuildLoad(ctx->builder, pointer, "");
   if (invariant)
      LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
   /* All loads built here are dword-aligned. */
   LLVMSetAlignment(result, 4);
   return result;
}
1162 
/* Plain indexed load: divergent, not invariant, wraparound possible. */
LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, /*uniform=*/false,
                               /*invariant=*/false, /*no_unsigned_wraparound=*/false);
}
1167 
/* Indexed load marked invariant (nothing else writes the memory). */
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                     LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, /*uniform=*/false,
                               /*invariant=*/true, /*no_unsigned_wraparound=*/false);
}
1173 
1174 /* This assumes that there is no unsigned integer wraparound during the address
1175  * computation, excluding all GEPs within base_ptr. */
ac_build_load_to_sgpr(struct ac_llvm_context * ctx,LLVMValueRef base_ptr,LLVMValueRef index)1176 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
1177                                    LLVMValueRef index)
1178 {
1179    return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
1180 }
1181 
1182 /* See ac_build_load_custom() documentation. */
ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context * ctx,LLVMValueRef base_ptr,LLVMValueRef index)1183 LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
1184                                                    LLVMValueRef base_ptr, LLVMValueRef index)
1185 {
1186    return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
1187 }
1188 
get_load_cache_policy(struct ac_llvm_context * ctx,unsigned cache_policy)1189 static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
1190 {
1191    return cache_policy |
1192           (ctx->gfx_level >= GFX10 && ctx->gfx_level < GFX11 && cache_policy & ac_glc ? ac_dlc : 0);
1193 }
1194 
get_store_cache_policy(struct ac_llvm_context * ctx,unsigned cache_policy)1195 static unsigned get_store_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
1196 {
1197    if (ctx->gfx_level >= GFX11)
1198       cache_policy &= ~ac_glc; /* GLC has no effect on stores */
1199    return cache_policy;
1200 }
1201 
/**
 * Common helper for the amdgcn buffer store intrinsics.
 *
 * Emits llvm.amdgcn.{raw,struct}.buffer.store[.format].<type>. The
 * structured ("struct") variant is selected whenever \p vindex is non-NULL.
 *
 * \param rsrc         buffer resource descriptor (bitcast to v4i32)
 * \param data         value to store; its type picks the intrinsic suffix
 * \param vindex       buffer element index, or NULL for raw addressing
 * \param voffset      VGPR byte offset, or NULL for 0
 * \param soffset      SGPR byte offset, or NULL for 0
 * \param cache_policy ac_glc/ac_slc/... bits (filtered by
 *                     get_store_cache_policy for the current chip)
 * \param use_format   emit the typed ".format" store variant
 */
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         unsigned cache_policy, bool use_format)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = data;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (vindex)
      args[idx++] = vindex; /* only present for the struct variant */
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_store_cache_policy(ctx, cache_policy), 0);
   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256], type_name[8];

   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
1230 
/* Typed (format) buffer store; no SGPR offset. */
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
{
   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, /*soffset=*/NULL,
                                cache_policy, /*use_format=*/true);
}
1236 
/* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                 unsigned cache_policy)
{
   unsigned num_channels = ac_get_llvm_num_components(vdata);

   /* Split 3 channel stores if unsupported. */
   if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false)) {
      LLVMValueRef v[3], v01, voffset2;

      /* Store channels 0-1 as a vec2 at the original offset and channel 2
       * as a scalar 8 bytes (two dwords) further. */
      for (int i = 0; i < 3; i++) {
         v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
      }
      v01 = ac_build_gather_values(ctx, v, 2);

      voffset2 = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0,
                              LLVMConstInt(ctx->i32, 8, 0), "");

      ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, cache_policy);
      ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, cache_policy);
      return;
   }

   /* Stores use float types; reinterpret integer data accordingly. */
   ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), vindex, voffset, soffset,
                                cache_policy, false);
}
1264 
/**
 * Common helper for the amdgcn buffer load intrinsics
 * (llvm.amdgcn.{raw,struct}.buffer.load[.format].<type>).
 *
 * vec3 loads are widened to vec4 on chips without vec3 support; callers
 * get the wider vector back in that case.
 */
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, unsigned cache_policy,
                                                bool can_speculate, bool use_format,
                                                bool structurized)
{
   LLVMValueRef args[5];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   /* Widen vec3 to vec4 when the chip/format combination lacks vec3. */
   unsigned func =
      !ac_has_vec3_support(ctx->gfx_level, use_format) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->gfx_level >= GFX8);

   LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
   }

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}
1301 
/**
 * Load num_channels dwords from a buffer.
 *
 * Takes the scalar (SMEM, llvm.amdgcn.s.buffer.load) path when allowed:
 * SLC is never supported there, and GLC only on GFX8+. Otherwise falls
 * back to the raw VMEM buffer load.
 */
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  LLVMTypeRef channel_type, unsigned cache_policy,
                                  bool can_speculate, bool allow_smem)
{
   if (allow_smem && !(cache_policy & ac_slc) &&
       (!(cache_policy & ac_glc) || ctx->gfx_level >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[8];

      /* SMEM takes a single combined byte offset. */
      LLVMValueRef offset = voffset ? voffset : ctx->i32_0;
      if (soffset)
         offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

      /* One scalar dword load per channel, 4 bytes apart. */
      for (int i = 0; i < num_channels; i++) {
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, 4, 0), "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
         };
         result[i] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3,
                                        AC_FUNC_ATTR_READNONE);
      }
      if (num_channels == 1)
         return result[0];

      /* Pad vec3 results to vec4 with undef where vec3 is unsupported. */
      if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false))
         result[num_channels++] = LLVMGetUndef(ctx->f32);
      return ac_build_gather_values(ctx, result, num_channels);
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, soffset, num_channels,
                                      channel_type, cache_policy, can_speculate, false, false);
}
1340 
/**
 * Typed (format) buffer load, always using struct addressing.
 *
 * \param d16 load 16-bit channels (mutually exclusive with \p tfe)
 * \param tfe append the texture-fail-enable status dword; implemented via
 *            inline assembly (see comment below), returning
 *            num_channels data elements plus the status in the last one
 */
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, unsigned cache_policy,
                                         bool can_speculate, bool d16, bool tfe)
{
   if (tfe) {
      assert(!d16);

      cache_policy = get_load_cache_policy(ctx, cache_policy);

      char code[256];
      /* The definition in the assembly and the one in the constraint string
       * differs because of an assembler bug.
       */
      /* v0-v4 are zero-initialized because TFE only writes the status
       * dword on a fault; the s_waitcnt makes the result usable. */
      snprintf(code, sizeof(code),
               "v_mov_b32 v0, 0\n"
               "v_mov_b32 v1, 0\n"
               "v_mov_b32 v2, 0\n"
               "v_mov_b32 v3, 0\n"
               "v_mov_b32 v4, 0\n"
               "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
               "s_waitcnt vmcnt(0)",
               cache_policy & ac_glc ? "glc" : "",
               cache_policy & ac_slc ? "slc" : "",
               cache_policy & ac_dlc ? "dlc" : "");

      LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
      LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(calltype, code, "=&{v[0:4]},v,s", false, false);

      /* Pack (vindex, voffset) into the v2i32 address operand. */
      LLVMValueRef addr_comp[2] = {vindex ? vindex : ctx->i32_0,
                                   voffset ? voffset : ctx->i32_0};

      LLVMValueRef args[] = {ac_build_gather_values(ctx, addr_comp, 2),
                             LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "")};
      LLVMValueRef res = LLVMBuildCall2(ctx->builder, calltype, inlineasm, args, 2, "");

      /* Return the requested channels plus the TFE status dword (elem 4). */
      return ac_build_concat(ctx, ac_trim_vector(ctx, res, num_channels),
                             ac_llvm_extract_elem(ctx, res, 4));
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,
                                      d16 ? ctx->f16 : ctx->f32, cache_policy, can_speculate, true,
                                      true);
}
1386 
/**
 * Common helper for the amdgcn typed-buffer load intrinsics
 * (llvm.amdgcn.{raw,struct}.tbuffer.load.<type>), which take an explicit
 * data format/numeric format pair instead of reading it from the
 * descriptor. Results are i32-typed; vec3 is widened to vec4 when
 * unsupported.
 */
static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, unsigned num_channels,
                                          unsigned dfmt, unsigned nfmt, unsigned cache_policy,
                                          bool can_speculate, bool structurized)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   /* dfmt/nfmt are encoded into the hw-specific combined format value. */
   args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->gfx_level, dfmt, nfmt), 0);
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   unsigned func =
      !ac_has_vec3_support(ctx->gfx_level, true) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}
1414 
/* Structured (per-index) variant of the typed buffer load. */
LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, unsigned num_channels,
                                          unsigned dfmt, unsigned nfmt, unsigned cache_policy,
                                          bool can_speculate)
{
   const bool structurized = true;

   return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, num_channels,
                                dfmt, nfmt, cache_policy, can_speculate, structurized);
}
1424 
/* Load a single 16-bit (i16) value from a buffer resource.
 * The trailing boolean flags mirror the other scalar-load helpers in this
 * file; see ac_build_buffer_load_common for their meaning.
 */
LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        unsigned cache_policy)
{
   return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset,
                                      1, ctx->i16, cache_policy,
                                      false, false, false);
}
1432 
/* Load a single 8-bit (i8) value from a buffer resource.
 * The trailing boolean flags mirror the other scalar-load helpers in this
 * file; see ac_build_buffer_load_common for their meaning.
 */
LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                       LLVMValueRef voffset, LLVMValueRef soffset,
                                       unsigned cache_policy)
{
   return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset,
                                      1, ctx->i8, cache_policy,
                                      false, false, false);
}
1440 
1441 /**
1442  * Convert an 11- or 10-bit unsigned floating point number to an f32.
1443  *
1444  * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
1445  * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1446  */
ac_ufN_to_float(struct ac_llvm_context * ctx,LLVMValueRef src,unsigned exp_bits,unsigned mant_bits)1447 static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src,
1448                                     unsigned exp_bits, unsigned mant_bits)
1449 {
1450    assert(LLVMTypeOf(src) == ctx->i32);
1451 
1452    LLVMValueRef tmp;
1453    LLVMValueRef mantissa;
1454    mantissa =
1455       LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");
1456 
1457    /* Converting normal numbers is just a shift + correcting the exponent bias */
1458    unsigned normal_shift = 23 - mant_bits;
1459    unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
1460    LLVMValueRef shifted, normal;
1461 
1462    shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
1463    normal =
1464       LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");
1465 
1466    /* Converting nan/inf numbers is the same, but with a different exponent update */
1467    LLVMValueRef naninf;
1468    naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");
1469 
1470    /* Converting denormals is the complex case: determine the leading zeros of the
1471     * mantissa to obtain the correct shift for the mantissa and exponent correction.
1472     */
1473    LLVMValueRef denormal;
1474    LLVMValueRef params[2] = {
1475       mantissa, ctx->i1true, /* result can be undef when arg is 0 */
1476    };
1477    LLVMValueRef ctlz =
1478       ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);
1479 
1480    /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
1481    tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
1482    denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");
1483 
1484    unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
1485    tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
1486    tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
1487    denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");
1488 
1489    /* Select the final result. */
1490    LLVMValueRef result;
1491 
1492    tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1493                        LLVMConstInt(ctx->i32, ((1ULL << exp_bits) - 1) << mant_bits, false), "");
1494    result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");
1495 
1496    tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1497                        LLVMConstInt(ctx->i32, 1ULL << mant_bits, false), "");
1498    result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");
1499 
1500    tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
1501    result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");
1502 
1503    return ac_to_float(ctx, result);
1504 }
1505 
1506 /**
1507  * Generate a fully general open coded buffer format fetch with all required
1508  * fixups suitable for vertex fetch, using non-format buffer loads.
1509  *
1510  * Some combinations of argument values have special interpretations:
1511  * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
1512  * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
1513  *
1514  * \param log_size log(size of channel in bytes)
1515  * \param num_channels number of channels (1 to 4)
1516  * \param format AC_FETCH_FORMAT_xxx value
1517  * \param reverse whether XYZ channels are reversed
1518  * \param known_aligned whether the source is known to be aligned to hardware's
1519  *                      effective element size for loading the given format
1520  *                      (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
1521  * \param rsrc buffer resource descriptor
1522  * \return the resulting vector of floats or integers bitcast to <4 x i32>
1523  */
ac_build_opencoded_load_format(struct ac_llvm_context * ctx,unsigned log_size,unsigned num_channels,unsigned format,bool reverse,bool known_aligned,LLVMValueRef rsrc,LLVMValueRef vindex,LLVMValueRef voffset,LLVMValueRef soffset,unsigned cache_policy,bool can_speculate)1524 LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size,
1525                                             unsigned num_channels, unsigned format, bool reverse,
1526                                             bool known_aligned, LLVMValueRef rsrc,
1527                                             LLVMValueRef vindex, LLVMValueRef voffset,
1528                                             LLVMValueRef soffset, unsigned cache_policy,
1529                                             bool can_speculate)
1530 {
1531    LLVMValueRef tmp;
1532    unsigned load_log_size = log_size;
1533    unsigned load_num_channels = num_channels;
1534    if (log_size == 3) {
1535       load_log_size = 2;
1536       if (format == AC_FETCH_FORMAT_FLOAT) {
1537          load_num_channels = 2 * num_channels;
1538       } else {
1539          load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
1540       }
1541    }
1542 
1543    int log_recombine = 0;
1544    if ((ctx->gfx_level == GFX6 || ctx->gfx_level >= GFX10) && !known_aligned) {
1545       /* Avoid alignment restrictions by loading one byte at a time. */
1546       load_num_channels <<= load_log_size;
1547       log_recombine = load_log_size;
1548       load_log_size = 0;
1549    } else if (load_num_channels == 2 || load_num_channels == 4) {
1550       log_recombine = -util_logbase2(load_num_channels);
1551       load_num_channels = 1;
1552       load_log_size += -log_recombine;
1553    }
1554 
1555    LLVMValueRef loads[32]; /* up to 32 bytes */
1556    for (unsigned i = 0; i < load_num_channels; ++i) {
1557       tmp =
1558          LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), "");
1559       LLVMTypeRef channel_type =
1560          load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32;
1561       unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
1562       loads[i] =
1563          ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type,
1564                                      cache_policy, can_speculate, false, true);
1565       if (load_log_size >= 2)
1566          loads[i] = ac_to_integer(ctx, loads[i]);
1567    }
1568 
1569    if (log_recombine > 0) {
1570       /* Recombine bytes if necessary (GFX6 only) */
1571       LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;
1572 
1573       for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
1574          LLVMValueRef accum = NULL;
1575          for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
1576             tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
1577             if (i == 0) {
1578                accum = tmp;
1579             } else {
1580                tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), "");
1581                accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
1582             }
1583          }
1584          loads[dst] = accum;
1585       }
1586    } else if (log_recombine < 0) {
1587       /* Split vectors of dwords */
1588       if (load_log_size > 2) {
1589          assert(load_num_channels == 1);
1590          LLVMValueRef loaded = loads[0];
1591          unsigned log_split = load_log_size - 2;
1592          log_recombine += log_split;
1593          load_num_channels = 1 << log_split;
1594          load_log_size = 2;
1595          for (unsigned i = 0; i < load_num_channels; ++i) {
1596             tmp = LLVMConstInt(ctx->i32, i, false);
1597             loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
1598          }
1599       }
1600 
1601       /* Further split dwords and shorts if required */
1602       if (log_recombine < 0) {
1603          for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0;
1604               --src) {
1605             unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
1606             LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
1607             LLVMValueRef loaded = loads[src - 1];
1608             LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
1609             for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
1610                tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
1611                tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
1612                loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
1613             }
1614          }
1615       }
1616    }
1617 
1618    if (log_size == 3) {
1619       if (format == AC_FETCH_FORMAT_FLOAT) {
1620          for (unsigned i = 0; i < num_channels; ++i) {
1621             tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
1622             loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
1623          }
1624       } else if (format == AC_FETCH_FORMAT_FIXED) {
1625          /* 10_11_11_FLOAT */
1626          LLVMValueRef data = loads[0];
1627          LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
1628          LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
1629          tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
1630          LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
1631          LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");
1632 
1633          loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
1634          loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
1635          loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));
1636 
1637          num_channels = 3;
1638          log_size = 2;
1639          format = AC_FETCH_FORMAT_FLOAT;
1640       } else {
1641          /* 2_10_10_10 data formats */
1642          LLVMValueRef data = loads[0];
1643          LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
1644          LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
1645          loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
1646          tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
1647          loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1648          tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
1649          loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1650          tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
1651          loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");
1652 
1653          num_channels = 4;
1654       }
1655    }
1656 
1657    if (format == AC_FETCH_FORMAT_FLOAT) {
1658       if (log_size != 2) {
1659          for (unsigned chan = 0; chan < num_channels; ++chan) {
1660             tmp = ac_to_float(ctx, loads[chan]);
1661             if (log_size == 3)
1662                tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
1663             else if (log_size == 1)
1664                tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
1665             loads[chan] = ac_to_integer(ctx, tmp);
1666          }
1667       }
1668    } else if (format == AC_FETCH_FORMAT_UINT) {
1669       if (log_size != 2) {
1670          for (unsigned chan = 0; chan < num_channels; ++chan)
1671             loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
1672       }
1673    } else if (format == AC_FETCH_FORMAT_SINT) {
1674       if (log_size != 2) {
1675          for (unsigned chan = 0; chan < num_channels; ++chan)
1676             loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
1677       }
1678    } else {
1679       bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED ||
1680                     format == AC_FETCH_FORMAT_UINT;
1681 
1682       for (unsigned chan = 0; chan < num_channels; ++chan) {
1683          if (unsign) {
1684             tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
1685          } else {
1686             tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
1687          }
1688 
1689          LLVMValueRef scale = NULL;
1690          if (format == AC_FETCH_FORMAT_FIXED) {
1691             assert(log_size == 2);
1692             scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
1693          } else if (format == AC_FETCH_FORMAT_UNORM) {
1694             unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1695             scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
1696          } else if (format == AC_FETCH_FORMAT_SNORM) {
1697             unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1698             scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
1699          }
1700          if (scale)
1701             tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");
1702 
1703          if (format == AC_FETCH_FORMAT_SNORM) {
1704             /* Clamp to [-1, 1] */
1705             LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
1706             LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
1707             tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
1708          }
1709 
1710          loads[chan] = ac_to_integer(ctx, tmp);
1711       }
1712    }
1713 
1714    while (num_channels < 4) {
1715       if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
1716          loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
1717       } else {
1718          loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
1719       }
1720       num_channels++;
1721    }
1722 
1723    if (reverse) {
1724       tmp = loads[0];
1725       loads[0] = loads[2];
1726       loads[2] = tmp;
1727    }
1728 
1729    return ac_build_gather_values(ctx, loads, 4);
1730 }
1731 
/* Store a single 16-bit value to a buffer resource; vdata is reinterpreted
 * as i16 before the store. */
void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                 LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
                                 unsigned cache_policy)
{
   LLVMValueRef data16 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");

   ac_build_buffer_store_common(ctx, rsrc, data16, NULL, voffset, soffset, cache_policy, false);
}
1740 
/* Store a single 8-bit value to a buffer resource; vdata is reinterpreted
 * as i8 before the store. */
void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)
{
   LLVMValueRef data8 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");

   ac_build_buffer_store_common(ctx, rsrc, data8, NULL, voffset, soffset, cache_policy, false);
}
1748 
1749 /**
1750  * Set range metadata on an instruction.  This can only be used on load and
1751  * call instructions.  If you know an instruction can only produce the values
1752  * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1753  * \p lo is the minimum value inclusive.
1754  * \p hi is the maximum value exclusive.
1755  */
ac_set_range_metadata(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned lo,unsigned hi)1756 void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
1757                            unsigned hi)
1758 {
1759    LLVMValueRef range_md, md_args[2];
1760    LLVMTypeRef type = LLVMTypeOf(value);
1761    LLVMContextRef context = LLVMGetTypeContext(type);
1762 
1763    md_args[0] = LLVMConstInt(type, lo, false);
1764    md_args[1] = LLVMConstInt(type, hi, false);
1765    range_md = LLVMMDNodeInContext(context, md_args, 2);
1766    LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1767 }
1768 
ac_get_thread_id(struct ac_llvm_context * ctx)1769 LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1770 {
1771    return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));
1772 }
1773 
1774 /*
1775  * AMD GCN implements derivatives using the local data store (LDS)
1776  * All writes to the LDS happen in all executing threads at
1777  * the same time. TID is the Thread ID for the current
1778  * thread and is a value between 0 and 63, representing
1779  * the thread's position in the wavefront.
1780  *
1781  * For the pixel shader threads are grouped into quads of four pixels.
1782  * The TIDs of the pixels of a quad are:
1783  *
1784  *  +------+------+
1785  *  |4n + 0|4n + 1|
1786  *  +------+------+
1787  *  |4n + 2|4n + 3|
1788  *  +------+------+
1789  *
1790  * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1791  * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1792  * the current pixel's column, and masking with 0xfffffffe yields the TID
1793  * of the left pixel of the current pixel's row.
1794  *
1795  * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1796  * adding 2 yields the TID of the pixel below the top pixel.
1797  */
ac_build_ddxy(struct ac_llvm_context * ctx,uint32_t mask,int idx,LLVMValueRef val)1798 LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
1799 {
1800    unsigned tl_lanes[4], trbl_lanes[4];
1801    char name[32], type[8];
1802    LLVMValueRef tl, trbl;
1803    LLVMTypeRef result_type;
1804    LLVMValueRef result;
1805 
1806    result_type = ac_to_float_type(ctx, LLVMTypeOf(val));
1807 
1808    if (result_type == ctx->f16)
1809       val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
1810    else if (result_type == ctx->v2f16)
1811       val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");
1812 
1813    for (unsigned i = 0; i < 4; ++i) {
1814       tl_lanes[i] = i & mask;
1815       trbl_lanes[i] = (i & mask) + idx;
1816    }
1817 
1818    tl = ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
1819    trbl =
1820       ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);
1821 
1822    if (result_type == ctx->f16) {
1823       tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
1824       trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
1825    }
1826 
1827    tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
1828    trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
1829    result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
1830 
1831    ac_build_type_name_for_intr(result_type, type, sizeof(type));
1832    snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);
1833 
1834    return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
1835 }
1836 
/* Emit s_sendmsg with the given message immediate and wave-id operand. */
void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id)
{
   LLVMValueRef args[] = {
      LLVMConstInt(ctx->i32, msg, false),
      wave_id,
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
}
1844 
/* Find the most significant "sign-differing" bit of a signed i32 (sffbh),
 * returned as an LSB-based index, or -1 for 0 and -1 inputs. */
LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   LLVMValueRef msb =
      ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE);

   /* The HW returns the last bit index from MSB, but NIR/TGSI wants
    * the index from LSB. Invert it by doing "31 - msb". */
   msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");

   /* For 0 and -1 there is no bit that differs from the sign; return -1. */
   LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, "");
   LLVMValueRef is_all_ones = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, "");
   LLVMValueRef no_set_bit = LLVMBuildOr(ctx->builder, is_zero, is_all_ones, "");

   return LLVMBuildSelect(ctx->builder, no_set_bit, all_ones, msb, "");
}
1861 
/* Return the index (counted from the LSB) of the most significant set bit
 * of "arg" as an i32, or -1 when arg is 0. 8/16/32/64-bit inputs are
 * supported. NOTE: "dst_type" is not used by this implementation; the
 * intrinsic is selected from the width of "arg". */
LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef highest_bit;
   LLVMValueRef zero;
   unsigned bitsize;

   /* Pick the llvm.ctlz overload matching the source width. */
   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
   switch (bitsize) {
   case 64:
      intrin_name = "llvm.ctlz.i64";
      type = ctx->i64;
      highest_bit = LLVMConstInt(ctx->i64, 63, false);
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.ctlz.i32";
      type = ctx->i32;
      highest_bit = LLVMConstInt(ctx->i32, 31, false);
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.ctlz.i16";
      type = ctx->i16;
      highest_bit = LLVMConstInt(ctx->i16, 15, false);
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.ctlz.i8";
      type = ctx->i8;
      highest_bit = LLVMConstInt(ctx->i8, 7, false);
      zero = ctx->i8_0;
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   LLVMValueRef params[2] = {
      arg,
      ctx->i1true, /* result may be undef for a zero input; handled below */
   };

   LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);

   /* The HW returns the last bit index from MSB, but TGSI/NIR wants
    * the index from LSB. Invert it by doing "31 - msb". */
   msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");

   /* Normalize the result to i32 regardless of the input width. */
   if (bitsize == 64) {
      msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
   } else if (bitsize < 32) {
      msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");
   }

   /* check for zero */
   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
                          LLVMConstInt(ctx->i32, -1, true), msb, "");
}
1922 
/* Floating-point minimum via the type-overloaded llvm.minnum intrinsic. */
LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   char intr_name[64], type_name[64];

   ac_build_type_name_for_intr(LLVMTypeOf(a), type_name, sizeof(type_name));
   snprintf(intr_name, sizeof(intr_name), "llvm.minnum.%s", type_name);

   LLVMValueRef operands[2] = {a, b};
   return ac_build_intrinsic(ctx, intr_name, LLVMTypeOf(a), operands, 2, AC_FUNC_ATTR_READNONE);
}
1932 
/* Floating-point maximum via the type-overloaded llvm.maxnum intrinsic. */
LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   char intr_name[64], type_name[64];

   ac_build_type_name_for_intr(LLVMTypeOf(a), type_name, sizeof(type_name));
   snprintf(intr_name, sizeof(intr_name), "llvm.maxnum.%s", type_name);

   LLVMValueRef operands[2] = {a, b};
   return ac_build_intrinsic(ctx, intr_name, LLVMTypeOf(a), operands, 2, AC_FUNC_ATTR_READNONE);
}
1942 
/* Signed integer minimum: compare-and-select. */
LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_le_b = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_le_b, a, b, "");
}
1948 
/* Signed integer maximum: compare-and-select. */
LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_gt_b = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_gt_b, a, b, "");
}
1954 
/* Unsigned integer minimum: compare-and-select. */
LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_le_b = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_le_b, a, b, "");
}
1960 
/* Unsigned integer maximum: compare-and-select.
 *
 * Uses a strict UGT compare for consistency with ac_build_imax (SGT); the
 * selected value is identical either way, since for equal operands both
 * choices return the same value.
 */
LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntUGT, a, b, "");
   return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
}
1966 
/* Clamp a floating-point value to [0.0, 1.0]. */
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef t = LLVMTypeOf(value);
   LLVMValueRef zero = LLVMConstReal(t, 0.0);
   LLVMValueRef one = LLVMConstReal(t, 1.0);
   LLVMValueRef lower_clamped = ac_build_fmax(ctx, value, zero);

   return ac_build_fmin(ctx, lower_clamped, one);
}
1973 
ac_build_export(struct ac_llvm_context * ctx,struct ac_export_args * a)1974 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
1975 {
1976    LLVMValueRef args[9];
1977 
1978    args[0] = LLVMConstInt(ctx->i32, a->target, 0);
1979    args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
1980 
1981    if (a->compr) {
1982       assert(ctx->gfx_level < GFX11);
1983 
1984       args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
1985       args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
1986       args[4] = LLVMConstInt(ctx->i1, a->done, 0);
1987       args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1988 
1989       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
1990    } else {
1991       args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->f32, "");
1992       args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->f32, "");
1993       args[4] = LLVMBuildBitCast(ctx->builder, a->out[2], ctx->f32, "");
1994       args[5] = LLVMBuildBitCast(ctx->builder, a->out[3], ctx->f32, "");
1995       args[6] = LLVMConstInt(ctx->i1, a->done, 0);
1996       args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1997 
1998       ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
1999    }
2000 }
2001 
ac_build_export_null(struct ac_llvm_context * ctx,bool uses_discard)2002 void ac_build_export_null(struct ac_llvm_context *ctx, bool uses_discard)
2003 {
2004    struct ac_export_args args;
2005 
2006    /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask
2007     * for discard.
2008     */
2009    if (ctx->gfx_level >= GFX10 && !uses_discard)
2010       return;
2011 
2012    args.enabled_channels = 0x0; /* enabled channels */
2013    args.valid_mask = 1;         /* whether the EXEC mask is valid */
2014    args.done = 1;               /* DONE bit */
2015    /* Gfx11 doesn't support null exports, and mrt0 should be exported instead. */
2016    args.target = ctx->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
2017    args.compr = 0;                       /* COMPR flag (0 = 32-bit export) */
2018    args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2019    args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2020    args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2021    args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2022 
2023    ac_build_export(ctx, &args);
2024 }
2025 
ac_num_coords(enum ac_image_dim dim)2026 static unsigned ac_num_coords(enum ac_image_dim dim)
2027 {
2028    switch (dim) {
2029    case ac_image_1d:
2030       return 1;
2031    case ac_image_2d:
2032    case ac_image_1darray:
2033       return 2;
2034    case ac_image_3d:
2035    case ac_image_cube:
2036    case ac_image_2darray:
2037    case ac_image_2dmsaa:
2038       return 3;
2039    case ac_image_2darraymsaa:
2040       return 4;
2041    default:
2042       unreachable("ac_num_coords: bad dim");
2043    }
2044 }
2045 
ac_num_derivs(enum ac_image_dim dim)2046 static unsigned ac_num_derivs(enum ac_image_dim dim)
2047 {
2048    switch (dim) {
2049    case ac_image_1d:
2050    case ac_image_1darray:
2051       return 2;
2052    case ac_image_2d:
2053    case ac_image_2darray:
2054    case ac_image_cube:
2055       return 4;
2056    case ac_image_3d:
2057       return 6;
2058    case ac_image_2dmsaa:
2059    case ac_image_2darraymsaa:
2060    default:
2061       unreachable("derivatives not supported");
2062    }
2063 }
2064 
get_atomic_name(enum ac_atomic_op op)2065 static const char *get_atomic_name(enum ac_atomic_op op)
2066 {
2067    switch (op) {
2068    case ac_atomic_swap:
2069       return "swap";
2070    case ac_atomic_add:
2071       return "add";
2072    case ac_atomic_sub:
2073       return "sub";
2074    case ac_atomic_smin:
2075       return "smin";
2076    case ac_atomic_umin:
2077       return "umin";
2078    case ac_atomic_smax:
2079       return "smax";
2080    case ac_atomic_umax:
2081       return "umax";
2082    case ac_atomic_and:
2083       return "and";
2084    case ac_atomic_or:
2085       return "or";
2086    case ac_atomic_xor:
2087       return "xor";
2088    case ac_atomic_inc_wrap:
2089       return "inc";
2090    case ac_atomic_dec_wrap:
2091       return "dec";
2092    case ac_atomic_fmin:
2093       return "fmin";
2094    case ac_atomic_fmax:
2095       return "fmax";
2096    }
2097    unreachable("bad atomic op");
2098 }
2099 
/* Build a single llvm.amdgcn.image.* intrinsic call from the collected
 * arguments in *a.
 *
 * The function first validates argument consistency (asserts only), then
 * assembles the intrinsic argument list in the fixed order the backend
 * expects: data, dmask, offset, bias, compare, derivatives, coordinates,
 * lod/min_lod, resource, sampler/unorm, texfailctrl, cache policy. Finally
 * it constructs the mangled intrinsic name (modifiers + dimension + type
 * overloads) and emits the call.
 *
 * Returns the intrinsic result: a texel vector for loads/samples (with the
 * TFE status dword appended when a->tfe is set), the previous value for
 * atomics, or void for stores.
 */
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
{
   const char *overload[3] = {"", "", ""};
   unsigned num_overloads = 0;
   LLVMValueRef args[18];
   unsigned num_args = 0;
   enum ac_image_dim dim = a->dim;

   /* Sanity-check mutually exclusive / opcode-dependent argument combinations. */
   assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
   assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
           a->opcode != ac_image_store_mip) ||
          a->lod);
   assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
          (!a->compare && !a->offset));
   assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
           a->opcode == ac_image_get_lod) ||
          !a->bias);
   /* At most one LOD-determining input: bias, lod, level_zero, or derivatives. */
   assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
          1);
   assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
   /* d16/a16/g16 are only available on sufficiently new chips. */
   assert(!a->d16 || (ctx->gfx_level >= GFX8 && a->opcode != ac_image_atomic &&
                      a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
                      a->opcode != ac_image_get_resinfo));
   assert(!a->a16 || ctx->gfx_level >= GFX9);
   assert(a->g16 == a->a16 || ctx->gfx_level >= GFX10);

   /* Check that argument bit widths match the a16/g16 mode flags. */
   assert(!a->offset ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);
   assert(!a->bias ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->bias)) == 32);
   assert(!a->compare ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);
   assert(!a->derivs[0] ||
          ((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&
           (a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));
   assert(!a->coords[0] ||
          ((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&
           (a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));
   assert(!a->lod ||
          ((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&
           (a->opcode == ac_image_get_resinfo ||
            ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==
            ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));
   assert(!a->min_lod ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==
          ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));

   /* getlod doesn't use the array layer / cube face, so flatten the dim. */
   if (a->opcode == ac_image_get_lod) {
      switch (dim) {
      case ac_image_1darray:
         dim = ac_image_1d;
         break;
      case ac_image_2darray:
      case ac_image_cube:
         dim = ac_image_2d;
         break;
      default:
         break;
      }
   }

   bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
                 a->opcode == ac_image_get_lod;
   bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
   bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
               a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
   /* Samplers take float coords; loads/stores/atomics take integer coords. */
   LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);
   uint8_t dmask = a->dmask;
   LLVMTypeRef data_type;
   char data_type_str[32];

   if (atomic) {
      data_type = LLVMTypeOf(a->data[0]);
   } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      /* Image stores might have been shrinked using the format. */
      data_type = LLVMTypeOf(a->data[0]);
      dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
   } else {
      data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
   }

   /* With TFE, the intrinsic returns {texel, i32 status} instead of a vector. */
   if (a->tfe) {
      data_type = LLVMStructTypeInContext(
         ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
   }

   /* --- Argument assembly: the order below is fixed by the intrinsic. --- */
   if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      args[num_args++] = a->data[0];
      if (a->opcode == ac_image_atomic_cmpswap)
         args[num_args++] = a->data[1];
   }

   if (!atomic)
      args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);

   if (a->offset)
      args[num_args++] = ac_to_integer(ctx, a->offset);
   if (a->bias) {
      args[num_args++] = ac_to_float(ctx, a->bias);
      overload[num_overloads++] = ".f32";
   }
   if (a->compare)
      args[num_args++] = ac_to_float(ctx, a->compare);
   if (a->derivs[0]) {
      unsigned count = ac_num_derivs(dim);
      for (unsigned i = 0; i < count; ++i)
         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
      overload[num_overloads++] = a->g16 ? ".f16" : ".f32";
   }
   unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
   for (unsigned i = 0; i < num_coords; ++i)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
   if (a->lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
   if (a->min_lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");

   /* Coordinate type overload (always last of the value overloads). */
   overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32");

   args[num_args++] = a->resource;
   if (sample) {
      args[num_args++] = a->sampler;
      args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
   }

   args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
   args[num_args++] = LLVMConstInt(
      ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);

   /* --- Intrinsic name construction. --- */
   const char *name;
   const char *atomic_subop = "";
   switch (a->opcode) {
   case ac_image_sample:
      name = "sample";
      break;
   case ac_image_gather4:
      name = "gather4";
      break;
   case ac_image_load:
      name = "load";
      break;
   case ac_image_load_mip:
      name = "load.mip";
      break;
   case ac_image_store:
      name = "store";
      break;
   case ac_image_store_mip:
      name = "store.mip";
      break;
   case ac_image_atomic:
      name = "atomic.";
      atomic_subop = get_atomic_name(a->atomic);
      break;
   case ac_image_atomic_cmpswap:
      name = "atomic.";
      atomic_subop = "cmpswap";
      break;
   case ac_image_get_lod:
      name = "getlod";
      break;
   case ac_image_get_resinfo:
      name = "getresinfo";
      break;
   default:
      unreachable("invalid image opcode");
   }

   const char *dimname;
   switch (dim) {
   case ac_image_1d:
      dimname = "1d";
      break;
   case ac_image_2d:
      dimname = "2d";
      break;
   case ac_image_3d:
      dimname = "3d";
      break;
   case ac_image_cube:
      dimname = "cube";
      break;
   case ac_image_1darray:
      dimname = "1darray";
      break;
   case ac_image_2darray:
      dimname = "2darray";
      break;
   case ac_image_2dmsaa:
      dimname = "2dmsaa";
      break;
   case ac_image_2darraymsaa:
      dimname = "2darraymsaa";
      break;
   default:
      unreachable("invalid dim");
   }

   ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));

   /* ".l" (explicit lod) only applies to sample/gather; other opcodes encode
    * the mip level differently (e.g. load.mip). */
   bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
   char intr_name[96];
   snprintf(intr_name, sizeof(intr_name),
            "llvm.amdgcn.image.%s%s" /* base name */
            "%s%s%s%s"               /* sample/gather modifiers */
            ".%s.%s%s%s%s",          /* dimension and type overloads */
            name, atomic_subop, a->compare ? ".c" : "",
            a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
            a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
            data_type_str, overload[0], overload[1], overload[2]);

   LLVMTypeRef retty;
   if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
      retty = ctx->voidt;
   else
      retty = data_type;

   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
   if (a->tfe) {
      /* Flatten the {texel, status} struct into one vector for the caller. */
      LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");
      LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");
      result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));
   }

   if (!sample && !atomic && retty != ctx->voidt)
      result = ac_to_integer(ctx, result);

   return result;
}
2329 
/* Return the sample count of an image as an i32.
 *
 * Hardware doesn't have any instruction for this, so read the 4-bit
 * log2(samples) field at bits 16..19 of dword 3 of the descriptor and
 * compute 1 << log2(samples).
 */
LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
{
   LLVMBuilderRef b = ctx->builder;

   LLVMValueRef dword3 =
      LLVMBuildExtractElement(b, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
   LLVMValueRef log2_samples =
      LLVMBuildAnd(b, LLVMBuildLShr(b, dword3, LLVMConstInt(ctx->i32, 16, 0), ""),
                   LLVMConstInt(ctx->i32, 0xf, 0), "");

   return LLVMBuildShl(b, ctx->i32_1, log2_samples, "");
}
2343 
/* Pack two f32 values into a v2f16 with round-toward-zero conversion. */
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16,
                                            args, 2, AC_FUNC_ATTR_READNONE);
   return packed;
}
2349 
/* Pack two floats into snorm16x2; the v2i16 result is returned as an i32. */
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   return LLVMBuildBitCast(ctx->builder,
                           ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16,
                                              args, 2, AC_FUNC_ATTR_READNONE),
                           ctx->i32, "");
}
2356 
/* Pack two floats into unorm16x2; the v2i16 result is returned as an i32. */
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   return LLVMBuildBitCast(ctx->builder,
                           ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16,
                                              args, 2, AC_FUNC_ATTR_READNONE),
                           ctx->i32, "");
}
2363 
/* Pack two f16 values into snorm16x2 (returned as i32), emitted via inline
 * assembly. GFX11 renamed the instruction to v_cvt_pk_norm_i16_f16.
 */
LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef arg_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef asm_type = LLVMFunctionType(ctx->i32, arg_types, 2, false);
   const char *text = ctx->gfx_level >= GFX11 ? "v_cvt_pk_norm_i16_f16 $0, $1, $2"
                                              : "v_cvt_pknorm_i16_f16 $0, $1, $2";
   LLVMValueRef inline_asm = LLVMConstInlineAsm(asm_type, text, "=v,v,v", false, false);

   return LLVMBuildCall2(ctx->builder, asm_type, inline_asm, args, 2, "");
}
2376 
/* Pack two f16 values into unorm16x2 (returned as i32), emitted via inline
 * assembly. GFX11 renamed the instruction to v_cvt_pk_norm_u16_f16.
 */
LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef arg_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef asm_type = LLVMFunctionType(ctx->i32, arg_types, 2, false);
   const char *text = ctx->gfx_level >= GFX11 ? "v_cvt_pk_norm_u16_f16 $0, $1, $2"
                                              : "v_cvt_pknorm_u16_f16 $0, $1, $2";
   LLVMValueRef inline_asm = LLVMConstInlineAsm(asm_type, text, "=v,v,v", false, false);

   return LLVMBuildCall2(ctx->builder, asm_type, inline_asm, args, 2, "");
}
2389 
2390 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_i16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)2391 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2392                                  bool hi)
2393 {
2394    assert(bits == 8 || bits == 10 || bits == 16);
2395 
2396    LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2397    LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2398    LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
2399    LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2400 
2401    /* Clamp. */
2402    if (bits != 16) {
2403       for (int i = 0; i < 2; i++) {
2404          bool alpha = hi && i == 1;
2405          args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
2406          args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
2407       }
2408    }
2409 
2410    LLVMValueRef res =
2411       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2412    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2413 }
2414 
2415 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_u16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)2416 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2417                                  bool hi)
2418 {
2419    assert(bits == 8 || bits == 10 || bits == 16);
2420 
2421    LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2422    LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2423 
2424    /* Clamp. */
2425    if (bits != 16) {
2426       for (int i = 0; i < 2; i++) {
2427          bool alpha = hi && i == 1;
2428          args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
2429       }
2430    }
2431 
2432    LLVMValueRef res =
2433       ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2434    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2435 }
2436 
/* Emit an llvm.amdgcn.wqm.vote (whole-quad-mode vote) on the given i1. */
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   LLVMValueRef vote_args[1] = {i1};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, vote_args, 1,
                             AC_FUNC_ATTR_READNONE);
}
2441 
/* Kill (demote) the invocation when the i1 condition is false. */
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   LLVMValueRef kill_args[1] = {i1};
   ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, kill_args, 1, 0);
}
2446 
/* Bit-field extract: pull `width` bits starting at `offset` out of `input`,
 * sign-extending the result when is_signed, zero-extending otherwise.
 */
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
                          LLVMValueRef width, bool is_signed)
{
   const char *intr_name = is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32";
   LLVMValueRef bfe_args[] = {input, offset, width};

   return ac_build_intrinsic(ctx, intr_name, ctx->i32, bfe_args, 3, AC_FUNC_ATTR_READNONE);
}
2459 
/* Integer multiply-add: s0 * s1 + s2. */
LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   LLVMValueRef product = LLVMBuildMul(ctx->builder, s0, s1, "");
   return LLVMBuildAdd(ctx->builder, product, s2, "");
}
2465 
/* Floating-point multiply-add: s0 * s1 + s2 (f32). */
LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
   if (ctx->gfx_level >= GFX10) {
      LLVMValueRef fma_args[] = {s0, s1, s2};
      return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, fma_args, 3,
                                AC_FUNC_ATTR_READNONE);
   }

   LLVMValueRef product = LLVMBuildFMul(ctx->builder, s0, s1, "");
   return LLVMBuildFAdd(ctx->builder, product, s2, "");
}
2477 
/* Emit an s_waitcnt (or a fence) for the requested wait flags.
 *
 * Each counter starts at its "don't wait" maximum and is zeroed for the
 * events we must wait on:
 *   expcnt  - export count (max 7)
 *   lgkmcnt - LDS/GDS/constant/message count (max 63)
 *   vmcnt   - vector memory count (max 63 on GFX9+, 15 before)
 *   vscnt   - vector store count, separate counter on GFX10+ (max 63)
 */
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
{
   if (!wait_flags)
      return;

   unsigned expcnt = 7;
   unsigned lgkmcnt = 63;
   unsigned vmcnt = ctx->gfx_level >= GFX9 ? 63 : 15;
   unsigned vscnt = 63;

   if (wait_flags & AC_WAIT_EXP)
      expcnt = 0;
   if (wait_flags & AC_WAIT_LGKM)
      lgkmcnt = 0;
   if (wait_flags & AC_WAIT_VLOAD)
      vmcnt = 0;

   /* Pre-GFX10, stores are tracked by vmcnt together with loads. */
   if (wait_flags & AC_WAIT_VSTORE) {
      if (ctx->gfx_level >= GFX10)
         vscnt = 0;
      else
         vmcnt = 0;
   }

   /* There is no intrinsic for vscnt(0), so use a fence. */
   if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
       vscnt == 0) {
      assert(!(wait_flags & AC_WAIT_EXP));
      LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
      return;
   }

   /* Pack the counters into the s_waitcnt immediate; the field layout
    * changed on GFX11.
    */
   unsigned simm16;

   if (ctx->gfx_level >= GFX11)
      simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10);
   else
      simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14);

   LLVMValueRef args[1] = {
      LLVMConstInt(ctx->i32, simm16, false),
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
}
2522 
/* Clamp a floating-point value to [0, 1] (fsat).
 *
 * Uses v_med3 (llvm.amdgcn.fmed3.*) where available; falls back to
 * fmin/fmax for 64-bit, packed v2f16, and scalar 16-bit on GFX6-GFX8.
 *
 * \param src   the value to saturate
 * \param type  LLVM type of src
 */
LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
                           LLVMTypeRef type)
{
   unsigned bitsize = ac_get_elem_bits(ctx, type);
   LLVMValueRef zero = LLVMConstReal(type, 0.0);
   LLVMValueRef one = LLVMConstReal(type, 1.0);
   LLVMValueRef result;

   if (bitsize == 64 || (bitsize == 16 && ctx->gfx_level <= GFX8) || type == ctx->v2f16) {
      /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
       * doesn't expose an intrinsic.
       */
      result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
   } else {
      /* Renamed from "type" to avoid shadowing the parameter; string
       * literals are immutable, so intr is const char *.
       */
      LLVMTypeRef intr_type;
      const char *intr;

      if (bitsize == 16) {
         intr = "llvm.amdgcn.fmed3.f16";
         intr_type = ctx->f16;
      } else {
         assert(bitsize == 32);
         intr = "llvm.amdgcn.fmed3.f32";
         intr_type = ctx->f32;
      }

      /* med3(0, 1, src) == clamp(src, 0, 1). */
      LLVMValueRef params[] = {
         zero,
         one,
         src,
      };

      result = ac_build_intrinsic(ctx, intr, intr_type, params, 3,
                                  AC_FUNC_ATTR_READNONE);
   }

   if (ctx->gfx_level < GFX9 && bitsize == 32) {
      /* Only pre-GFX9 chips do not flush denorms. */
      result = ac_build_canonicalize(ctx, result, bitsize);
   }

   return result;
}
2566 
/* Return the fractional part of src0 via llvm.amdgcn.fract.*.
 *
 * \param bitsize  16, 32, or 64 — must match the type of src0
 */
LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   const char *intr; /* const: points at string literals */

   if (bitsize == 16) {
      intr = "llvm.amdgcn.fract.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.fract.f32";
      type = ctx->f32;
   } else {
      /* Catch unsupported sizes instead of silently emitting the f64
       * intrinsic for them.
       */
      assert(bitsize == 64);
      intr = "llvm.amdgcn.fract.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
2588 
ac_const_uint_vec(struct ac_llvm_context * ctx,LLVMTypeRef type,uint64_t value)2589 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2590 {
2591 
2592    if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2593       LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2594       unsigned vec_size = LLVMGetVectorSize(type);
2595       LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2596 
2597       for (unsigned i = 0; i < vec_size; i++)
2598          scalars[i] = scalar;
2599       return LLVMConstVector(scalars, vec_size);
2600    }
2601    return LLVMConstInt(type, value, 0);
2602 }
2603 
/* Integer sign: clamp src0 to [-1, 1], giving -1, 0 or 1. */
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMTypeRef type = LLVMTypeOf(src0);
   LLVMValueRef minus_one = ac_const_uint_vec(ctx, type, -1);
   LLVMValueRef plus_one = ac_const_uint_vec(ctx, type, 1);

   /* v_med3 is selected only when max is first. (LLVM bug?) */
   return ac_build_imin(ctx, ac_build_imax(ctx, src0, minus_one), plus_one);
}
2613 
/* Turn -0.0 into +0.0 while leaving every other value unchanged.
 *
 * Signed zeros are honored around the add (via ac_enable_signed_zeros) —
 * presumably so the x + 0 is not folded away to x; verify against those
 * helpers' definitions.
 */
static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
{
   ac_enable_signed_zeros(ctx);
   /* (val + 0) converts negative zero to positive zero. */
   val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
   ac_disable_signed_zeros(ctx);
   return val;
}
2622 
/* Floating-point sign: return -1.0, 0.0 or +1.0 with the same type as src
 * (f16/f32/f64).
 */
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef pos, neg, dw[2], val;
   unsigned bitsize = ac_get_elem_bits(ctx, type);

   /* The standard version leads to this:
    *   v_cmp_ngt_f32_e64 s[0:1], s4, 0                       ; D40B0000 00010004
    *   v_cndmask_b32_e64 v4, 1.0, s4, s[0:1]                 ; D5010004 000008F2
    *   v_cmp_le_f32_e32 vcc, 0, v4                           ; 7C060880
    *   v_cndmask_b32_e32 v4, -1.0, v4, vcc                   ; 020808F3
    *
    * The isign version:
    *   v_add_f32_e64 v4, s4, 0                               ; D5030004 00010004
    *   v_med3_i32 v4, v4, -1, 1                              ; D5580004 02058304
    *   v_cvt_f32_i32_e32 v4, v4                              ; 7E080B04
    *
    * (src0 + 0) converts negative zero to positive zero.
    * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
    *
    * For FP64, use the standard version, which doesn't suffer from the huge DP rate
    * reduction. (FP64 comparisons are as fast as int64 comparisons)
    */
   if (bitsize == 16 || bitsize == 32) {
      val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
      val = ac_build_isign(ctx, val);
      return LLVMBuildSIToFP(ctx->builder, val, type, "");
   }

   assert(bitsize == 64);
   pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
   neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
   /* Assemble the double from two dwords: low dword 0, high dword holds the
    * upper 32 bits of +1.0 (0x3FF00000), -1.0 (0xBFF00000), or 0.
    */
   dw[0] = ctx->i32_0;
   dw[1] = LLVMBuildSelect(
      ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
      LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
      "");
   return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
}
2662 
/* Population count of src0 (i8..i128) via llvm.ctpop.*, with the result
 * truncated or zero-extended so it is always returned as an i32.
 */
LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intr;
   LLVMTypeRef type;

   switch (bitsize) {
   case 128: intr = "llvm.ctpop.i128"; type = ctx->i128; break;
   case 64:  intr = "llvm.ctpop.i64";  type = ctx->i64;  break;
   case 32:  intr = "llvm.ctpop.i32";  type = ctx->i32;  break;
   case 16:  intr = "llvm.ctpop.i16";  type = ctx->i16;  break;
   case 8:   intr = "llvm.ctpop.i8";   type = ctx->i8;   break;
   default:
      unreachable(!"invalid bitsize");
   }

   LLVMValueRef result =
      ac_build_intrinsic(ctx, intr, type, (LLVMValueRef[]){src0}, 1, AC_FUNC_ATTR_READNONE);

   /* Normalize the result type to i32. */
   if (bitsize > 32)
      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
   else if (bitsize < 32)
      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");

   return result;
}
2705 
/* Reverse the bits of src0 (i8..i64) via llvm.bitreverse.*, with the result
 * truncated or zero-extended so it is always returned as an i32.
 */
LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intr;
   LLVMTypeRef type;

   switch (bitsize) {
   case 64: intr = "llvm.bitreverse.i64"; type = ctx->i64; break;
   case 32: intr = "llvm.bitreverse.i32"; type = ctx->i32; break;
   case 16: intr = "llvm.bitreverse.i16"; type = ctx->i16; break;
   case 8:  intr = "llvm.bitreverse.i8";  type = ctx->i8;  break;
   default:
      unreachable(!"invalid bitsize");
   }

   LLVMValueRef result =
      ac_build_intrinsic(ctx, intr, type, (LLVMValueRef[]){src0}, 1, AC_FUNC_ATTR_READNONE);

   /* Normalize the result type to i32. */
   if (bitsize > 32)
      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
   else if (bitsize < 32)
      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");

   return result;
}
2743 
ac_init_exec_full_mask(struct ac_llvm_context * ctx)2744 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2745 {
2746    LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2747    ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1,
2748                       AC_FUNC_ATTR_CONVERGENT);
2749 }
2750 
/* Expose all of LDS as ctx->lds: a pointer to an i32 array at LDS address 0.
 * The array covers the full LDS size (64KB on GFX7+, 32KB before) in dwords.
 */
void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
{
   unsigned lds_size = ctx->gfx_level >= GFX7 ? 65536 : 32768;
   ctx->lds = LLVMBuildIntToPtr(
      ctx->builder, ctx->i32_0,
      LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds");
}
2758 
/* Load one dword from LDS at the given dword index (requires ctx->lds,
 * see ac_declare_lds_as_pointer).
 */
LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
{
   LLVMValueRef ptr = ac_build_gep0(ctx, ctx->lds, dw_addr);
   return LLVMBuildLoad2(ctx->builder, ctx->i32, ptr, "");
}
2763 
/* Store one dword to LDS at the given dword index; the value is bitcast to
 * an integer first.
 */
void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
{
   ac_build_indexed_store(ctx, ctx->lds, dw_addr, ac_to_integer(ctx, value));
}
2769 
/* Find the least significant set bit of src0 (i8..i64).
 *
 * Returns the bit index as an i32, or -1 when src0 == 0 (GLSL findLSB
 * semantics).
 *
 * NOTE(review): the dst_type parameter is not used by this implementation —
 * the result is always i32; confirm against callers/header.
 */
LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
{
   unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef zero;

   /* Pick the llvm.cttz overload matching the source width. */
   switch (src0_bitsize) {
   case 64:
      intrin_name = "llvm.cttz.i64";
      type = ctx->i64;
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.cttz.i32";
      type = ctx->i32;
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.cttz.i16";
      type = ctx->i16;
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.cttz.i8";
      type = ctx->i8;
      zero = ctx->i8_0;
      break;
   default:
      unreachable(!"invalid bitsize");
   }

   LLVMValueRef params[2] = {
      src0,

      /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
       * add special code to check for x=0. The reason is that
       * the LLVM behavior for x=0 is different from what we
       * need here. However, LLVM also assumes that ffs(x) is
       * in [0, 31], but GLSL expects that ffs(0) = -1, so
       * a conditional assignment to handle 0 is still required.
       *
       * The hardware already implements the correct behavior.
       */
      ctx->i1true,
   };

   LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);

   /* Normalize the result type to i32. */
   if (src0_bitsize == 64) {
      lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
   } else if (src0_bitsize < 32) {
      lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
   }

   /* TODO: We need an intrinsic to skip this conditional. */
   /* Check for zero: */
   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""),
                          LLVMConstInt(ctx->i32, -1, 0), lsb, "");
}
2830 
/* Return a pointer type to elem_type in the constant address space. */
LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
{
   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
}
2835 
/* Return a pointer type to elem_type in the 32-bit constant address space. */
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
{
   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
}
2840 
get_current_flow(struct ac_llvm_context * ctx)2841 static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
2842 {
2843    if (ctx->flow->depth > 0)
2844       return &ctx->flow->stack[ctx->flow->depth - 1];
2845    return NULL;
2846 }
2847 
get_innermost_loop(struct ac_llvm_context * ctx)2848 static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
2849 {
2850    for (unsigned i = ctx->flow->depth; i > 0; --i) {
2851       if (ctx->flow->stack[i - 1].loop_entry_block)
2852          return &ctx->flow->stack[i - 1];
2853    }
2854    return NULL;
2855 }
2856 
push_flow(struct ac_llvm_context * ctx)2857 static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
2858 {
2859    struct ac_llvm_flow *flow;
2860 
2861    if (ctx->flow->depth >= ctx->flow->depth_max) {
2862       unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);
2863 
2864       ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
2865       ctx->flow->depth_max = new_max;
2866    }
2867 
2868    flow = &ctx->flow->stack[ctx->flow->depth];
2869    ctx->flow->depth++;
2870 
2871    flow->next_block = NULL;
2872    flow->loop_entry_block = NULL;
2873    return flow;
2874 }
2875 
/* Name a basic block "<base><label_id>", e.g. "loop3". */
static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
{
   char name[32];
   snprintf(name, sizeof(name), "%s%d", base, label_id);
   LLVMSetValueName(LLVMBasicBlockAsValue(bb), name);
}
2882 
2883 /* Append a basic block at the level of the parent flow.
2884  */
append_basic_block(struct ac_llvm_context * ctx,const char * name)2885 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
2886 {
2887    assert(ctx->flow->depth >= 1);
2888 
2889    if (ctx->flow->depth >= 2) {
2890       struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
2891 
2892       return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
2893    }
2894 
2895    LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
2896    return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
2897 }
2898 
2899 /* Emit a branch to the given default target for the current block if
2900  * applicable -- that is, if the current block does not already contain a
2901  * branch from a break or continue.
2902  */
emit_default_branch(LLVMBuilderRef builder,LLVMBasicBlockRef target)2903 static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
2904 {
2905    if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
2906       LLVMBuildBr(builder, target);
2907 }
2908 
ac_build_bgnloop(struct ac_llvm_context * ctx,int label_id)2909 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
2910 {
2911    struct ac_llvm_flow *flow = push_flow(ctx);
2912    flow->loop_entry_block = append_basic_block(ctx, "LOOP");
2913    flow->next_block = append_basic_block(ctx, "ENDLOOP");
2914    set_basicblock_name(flow->loop_entry_block, "loop", label_id);
2915    LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2916    LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
2917 }
2918 
ac_build_break(struct ac_llvm_context * ctx)2919 void ac_build_break(struct ac_llvm_context *ctx)
2920 {
2921    struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2922    LLVMBuildBr(ctx->builder, flow->next_block);
2923 }
2924 
ac_build_continue(struct ac_llvm_context * ctx)2925 void ac_build_continue(struct ac_llvm_context *ctx)
2926 {
2927    struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2928    LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2929 }
2930 
ac_build_else(struct ac_llvm_context * ctx,int label_id)2931 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
2932 {
2933    struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2934    LLVMBasicBlockRef endif_block;
2935 
2936    assert(!current_branch->loop_entry_block);
2937 
2938    endif_block = append_basic_block(ctx, "ENDIF");
2939    emit_default_branch(ctx->builder, endif_block);
2940 
2941    LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2942    set_basicblock_name(current_branch->next_block, "else", label_id);
2943 
2944    current_branch->next_block = endif_block;
2945 }
2946 
2947 /* Invoked after a branch is exited. */
ac_branch_exited(struct ac_llvm_context * ctx)2948 static void ac_branch_exited(struct ac_llvm_context *ctx)
2949 {
2950    if (ctx->flow->depth == 0 && ctx->conditional_demote_seen) {
2951       /* The previous conditional branch contained demote. Kill threads
2952        * after all conditional blocks because amdgcn.wqm.vote doesn't
2953        * return usable values inside the blocks.
2954        *
2955        * This is an optional optimization that only kills whole inactive quads.
2956        */
2957       LLVMValueRef cond = LLVMBuildLoad2(ctx->builder, ctx->i1, ctx->postponed_kill, "");
2958       ac_build_kill_if_false(ctx, ac_build_wqm_vote(ctx, cond));
2959       ctx->conditional_demote_seen = false;
2960    }
2961 }
2962 
ac_build_endif(struct ac_llvm_context * ctx,int label_id)2963 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
2964 {
2965    struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2966 
2967    assert(!current_branch->loop_entry_block);
2968 
2969    emit_default_branch(ctx->builder, current_branch->next_block);
2970    LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2971    set_basicblock_name(current_branch->next_block, "endif", label_id);
2972 
2973    ctx->flow->depth--;
2974    ac_branch_exited(ctx);
2975 }
2976 
ac_build_endloop(struct ac_llvm_context * ctx,int label_id)2977 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
2978 {
2979    struct ac_llvm_flow *current_loop = get_current_flow(ctx);
2980 
2981    assert(current_loop->loop_entry_block);
2982 
2983    emit_default_branch(ctx->builder, current_loop->loop_entry_block);
2984 
2985    LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
2986    set_basicblock_name(current_loop->next_block, "endloop", label_id);
2987    ctx->flow->depth--;
2988    ac_branch_exited(ctx);
2989 }
2990 
/* Open a conditional: branch on cond to a new IF block (taken) or to the
 * flow's next block (not taken), then continue emitting in the IF block. */
void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
{
   struct ac_llvm_flow *branch = push_flow(ctx);
   LLVMBasicBlockRef if_block = append_basic_block(ctx, "IF");

   branch->next_block = append_basic_block(ctx, "ELSE");
   set_basicblock_name(if_block, "if", label_id);

   LLVMBuildCondBr(ctx->builder, cond, if_block, branch->next_block);
   LLVMPositionBuilderAtEnd(ctx->builder, if_block);
}
3002 
ac_build_alloca_undef(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)3003 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3004 {
3005    LLVMBuilderRef builder = ac->builder;
3006    LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3007    LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3008    LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3009    LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3010    LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3011    LLVMValueRef res;
3012 
3013    if (first_instr) {
3014       LLVMPositionBuilderBefore(first_builder, first_instr);
3015    } else {
3016       LLVMPositionBuilderAtEnd(first_builder, first_block);
3017    }
3018 
3019    res = LLVMBuildAlloca(first_builder, type, name);
3020    LLVMDisposeBuilder(first_builder);
3021    return res;
3022 }
3023 
ac_build_alloca(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)3024 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3025 {
3026    LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3027    LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3028    return ptr;
3029 }
3030 
/* Like ac_build_alloca_undef, but the slot is initialized with "val"
 * (the type is taken from the value). */
LLVMValueRef ac_build_alloca_init(struct ac_llvm_context *ac, LLVMValueRef val, const char *name)
{
   LLVMValueRef slot = ac_build_alloca_undef(ac, LLVMTypeOf(val), name);
   LLVMBuildStore(ac->builder, val, slot);
   return slot;
}
3037 
/* Bitcast a pointer to point at "type" while keeping its address space. */
LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
{
   unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
   return LLVMBuildBitCast(ctx->builder, ptr, LLVMPointerType(type, addr_space), "");
}
3043 
/* Reduce "value" to its first "count" components. Returns a scalar when
 * count == 1 and the value unchanged when it already has "count" components. */
LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
{
   if (count == ac_get_llvm_num_components(value))
      return value;

   /* The first two entries are written unconditionally below, so always
    * allocate space for at least two. */
   LLVMValueRef *const indices = alloca(MAX2(count, 2) * sizeof(LLVMValueRef));
   indices[0] = ctx->i32_0;
   indices[1] = ctx->i32_1;
   for (unsigned i = 2; i < count; i++)
      indices[i] = LLVMConstInt(ctx->i32, i, false);

   if (count == 1)
      return LLVMBuildExtractElement(ctx->builder, value, indices[0], "");

   return LLVMBuildShuffleVector(ctx->builder, value, value,
                                 LLVMConstVector(indices, count), "");
}
3062 
3063 /* If param is i64 and bitwidth <= 32, the return value will be i32. */
ac_unpack_param(struct ac_llvm_context * ctx,LLVMValueRef param,unsigned rshift,unsigned bitwidth)3064 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
3065                              unsigned bitwidth)
3066 {
3067    LLVMValueRef value = param;
3068    if (rshift)
3069       value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), "");
3070 
3071    if (rshift + bitwidth < 32) {
3072       uint64_t mask = (1ull << bitwidth) - 1;
3073       value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), "");
3074    }
3075 
3076    if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64)
3077       value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, "");
3078    return value;
3079 }
3080 
3081 /* Adjust the sample index according to FMASK.
3082  *
3083  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3084  * which is the identity mapping. Each nibble says which physical sample
3085  * should be fetched to get that sample.
3086  *
3087  * For example, 0x11111100 means there are only 2 samples stored and
3088  * the second sample covers 3/4 of the pixel. When reading samples 0
3089  * and 1, return physical sample 0 (determined by the first two 0s
3090  * in FMASK), otherwise return physical sample 1.
3091  *
3092  * The sample index should be adjusted as follows:
3093  *   addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3094  */
ac_apply_fmask_to_sample(struct ac_llvm_context * ac,LLVMValueRef fmask,LLVMValueRef * addr,bool is_array_tex)3095 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr,
3096                               bool is_array_tex)
3097 {
3098    struct ac_image_args fmask_load = {0};
3099    fmask_load.opcode = ac_image_load;
3100    fmask_load.resource = fmask;
3101    fmask_load.dmask = 0xf;
3102    fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3103    fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3104 
3105    fmask_load.coords[0] = addr[0];
3106    fmask_load.coords[1] = addr[1];
3107    if (is_array_tex)
3108       fmask_load.coords[2] = addr[2];
3109    fmask_load.a16 = ac_get_elem_bits(ac, LLVMTypeOf(addr[0])) == 16;
3110 
3111    LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3112    fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, "");
3113 
3114    /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3115     * resource descriptor is 0 (invalid).
3116     */
3117    LLVMValueRef tmp;
3118    tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3119    tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3120    tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3121    fmask_value =
3122       LLVMBuildSelect(ac->builder, tmp, fmask_value, LLVMConstInt(ac->i32, 0x76543210, false), "");
3123 
3124    /* Apply the formula. */
3125    unsigned sample_chan = is_array_tex ? 3 : 2;
3126    LLVMValueRef final_sample;
3127    final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
3128                                LLVMConstInt(LLVMTypeOf(addr[0]), 4, 0), "");
3129    final_sample = LLVMBuildLShr(ac->builder, fmask_value,
3130                                 LLVMBuildZExt(ac->builder, final_sample, ac->i32, ""), "");
3131    /* Mask the sample index by 0x7, because 0x8 means an unknown value
3132     * with EQAA, so those will map to 0. */
3133    addr[sample_chan] = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), "");
3134    if (fmask_load.a16)
3135       addr[sample_chan] = LLVMBuildTrunc(ac->builder, final_sample, ac->i16, "");
3136 }
3137 
/* Read the value of "src" from one lane (or the first active lane when
 * lane == NULL). Values narrower than 32 bits are extended for the
 * intrinsic and truncated back afterwards. */
static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
                                       LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   if (with_opt_barrier)
      ac_build_optimization_barrier(ctx, &src, false);

   /* The intrinsics operate on i32. */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
   if (lane)
      lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");

   const char *intr_name = lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane";
   LLVMValueRef result =
      ac_build_intrinsic(ctx, intr_name, ctx->i32, (LLVMValueRef[]){src, lane},
                         lane == NULL ? 1 : 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
3158 
/* Common implementation of readlane/readfirstlane: values wider than
 * 32 bits are split into i32 components that are read individually. */
static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
                                             LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef result;

   if (bits <= 32) {
      result = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
   } else {
      assert(bits % 32 == 0);
      unsigned num_dwords = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_dwords);
      LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");

      result = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_dwords; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef comp = LLVMBuildExtractElement(ctx->builder, src_vec, idx, "");
         comp = _ac_build_readlane(ctx, comp, lane, with_opt_barrier);
         result = LLVMBuildInsertElement(ctx->builder, result, comp, idx, "");
      }
   }

   /* Cast back to the caller's type; pointers need inttoptr. */
   if (LLVMGetTypeKind(orig_type) == LLVMPointerTypeKind)
      return LLVMBuildIntToPtr(ctx->builder, result, orig_type, "");
   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3190 
3191 /**
3192  * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3193  *
3194  * The optimization barrier is not needed if the value is the same in all lanes
3195  * or if this is called in the outermost block.
3196  *
3197  * @param ctx
3198  * @param src
3199  * @param lane - id of the lane or NULL for the first active lane
3200  * @return value of the lane
3201  */
ac_build_readlane_no_opt_barrier(struct ac_llvm_context * ctx,LLVMValueRef src,LLVMValueRef lane)3202 LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,
3203                                               LLVMValueRef lane)
3204 {
3205    return ac_build_readlane_common(ctx, src, lane, false);
3206 }
3207 
/* Like ac_build_readlane_no_opt_barrier, but places an optimization
 * barrier on the source value first. */
LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
{
   return ac_build_readlane_common(ctx, src, lane, true);
}
3212 
/* Build llvm.amdgcn.writelane: returns "src" with "value" written into "lane". */
LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
                                LLVMValueRef lane)
{
   LLVMValueRef args[3] = {value, lane, src};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, args, 3,
                             AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}
3220 
/* Count the bits of "mask" that belong to lanes below the current one,
 * then add "add_src" (mask is i32 for wave32 and i64 for wave64). */
LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src)
{
   LLVMValueRef count;

   if (ctx->wave_size == 32) {
      count = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                                 (LLVMValueRef[]){mask, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
   } else {
      /* wave64: split the mask in half and chain mbcnt.lo into mbcnt.hi. */
      LLVMValueRef halves = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
      LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, halves, ctx->i32_0, "");
      LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, halves, ctx->i32_1, "");
      count = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                                 (LLVMValueRef[]){mask_lo, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
      count = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
                                 (LLVMValueRef[]){mask_hi, count}, 2, AC_FUNC_ATTR_READNONE);
   }

   /* Bug workaround. LLVM always believes the upper bound of mbcnt to be the wave size,
    * regardless of ac_set_range_metadata. Use an extra add instruction to work around it.
    */
   if (add_src != NULL && add_src != ctx->i32_0)
      return LLVMBuildAdd(ctx->builder, count, add_src, "");

   return count;
}
3247 
/* ac_build_mbcnt_add with nothing added on top of the lane count. */
LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
{
   return ac_build_mbcnt_add(ctx, mask, ctx->i32_0);
}
3252 
/* DPP_CTRL operand encodings passed to llvm.amdgcn.update.dpp (see
 * _ac_build_dpp below). Entries whose names start with '_' are base
 * values that still need a parameter or'ed in — use the dpp_quad_perm()
 * and dpp_row_sr() helpers below instead of using them directly. */
enum dpp_ctrl
{
   _dpp_quad_perm = 0x000,
   _dpp_row_sl = 0x100,
   _dpp_row_sr = 0x110,
   _dpp_row_rr = 0x120,
   dpp_wf_sl1 = 0x130,
   dpp_wf_rl1 = 0x134,
   dpp_wf_sr1 = 0x138,
   dpp_wf_rr1 = 0x13C,
   dpp_row_mirror = 0x140,
   dpp_row_half_mirror = 0x141,
   dpp_row_bcast15 = 0x142,
   dpp_row_bcast31 = 0x143
};
3268 
dpp_quad_perm(unsigned lane0,unsigned lane1,unsigned lane2,unsigned lane3)3269 static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
3270                                           unsigned lane3)
3271 {
3272    assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3273    return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3274 }
3275 
dpp_row_sr(unsigned amount)3276 static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
3277 {
3278    assert(amount > 0 && amount < 16);
3279    return _dpp_row_sr | amount;
3280 }
3281 
/* Emit llvm.amdgcn.update.dpp.i32 on a value of at most 32 bits; narrower
 * values are extended for the intrinsic and truncated back afterwards. */
static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                  enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                  bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   /* The intrinsic operates on i32. */
   old = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[6] = {
      old,
      src,
      LLVMConstInt(ctx->i32, dpp_ctrl, 0),
      LLVMConstInt(ctx->i32, row_mask, 0),
      LLVMConstInt(ctx->i32, bank_mask, 0),
      LLVMConstInt(ctx->i1, bound_ctrl, 0),
   };
   LLVMValueRef res = ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, args, 6,
                                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, res, orig_type, "");
}
3301 
/* DPP move for values of any bit width: values wider than 32 bits are
 * processed as vectors of i32 components. */
static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                 bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   old = ac_to_integer(ctx, old);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef result;

   if (bits <= 32) {
      result = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   } else {
      assert(bits % 32 == 0);
      unsigned num_dwords = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_dwords);
      LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      LLVMValueRef old_vec = LLVMBuildBitCast(ctx->builder, old, vec_type, "");

      result = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_dwords; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef src_comp = LLVMBuildExtractElement(ctx->builder, src_vec, idx, "");
         LLVMValueRef old_comp = LLVMBuildExtractElement(ctx->builder, old_vec, idx, "");
         LLVMValueRef comp =
            _ac_build_dpp(ctx, old_comp, src_comp, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
         result = LLVMBuildInsertElement(ctx->builder, result, comp, idx, "");
      }
   }
   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3330 
/* Emit llvm.amdgcn.permlane16 (or permlanex16 when exchange_rows) on a
 * value of at most 32 bits. "sel" packs the two 32-bit lane-select
 * operands into one 64-bit value. */
static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         uint64_t sel, bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   /* The intrinsic operates on i32. */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[6] = {
      src,
      src,
      LLVMConstInt(ctx->i32, sel, false),
      LLVMConstInt(ctx->i32, sel >> 32, false),
      ctx->i1true, /* fi */
      bound_ctrl ? ctx->i1true : ctx->i1false,
   };

   const char *intr_name =
      exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16";
   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, ctx->i32, args, 6,
                                            AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
3354 
/* permlane16/permlanex16 for values of any bit width: values wider than
 * 32 bits are processed as vectors of i32 components. */
static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
                                        bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef result;

   if (bits <= 32) {
      result = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
   } else {
      assert(bits % 32 == 0);
      unsigned num_dwords = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_dwords);
      LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");

      result = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_dwords; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef comp = LLVMBuildExtractElement(ctx->builder, src_vec, idx, "");
         comp = _ac_build_permlane16(ctx, comp, sel, exchange_rows, bound_ctrl);
         result = LLVMBuildInsertElement(ctx->builder, result, comp, idx, "");
      }
   }
   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3378 
/* Pack the and/or/xor lane masks into the ds_swizzle "bitmode" pattern
 * (5 bits each). */
static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
   assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
   return and_mask | (or_mask << 5) | (xor_mask << 10);
}
3384 
/* Emit llvm.amdgcn.ds.swizzle on a value of at most 32 bits; narrower
 * values are extended for the intrinsic and truncated back afterwards. */
static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         unsigned mask)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   /* The intrinsic operates on i32. */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[2] = {src, LLVMConstInt(ctx->i32, mask, 0)};
   LLVMValueRef swizzled = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32, args, 2,
                                              AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, swizzled, orig_type, "");
}
3399 
/* ds_swizzle for values of any bit width: values wider than 32 bits are
 * processed as vectors of i32 components. */
LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef result;

   if (bits <= 32) {
      result = _ac_build_ds_swizzle(ctx, src, mask);
   } else {
      assert(bits % 32 == 0);
      unsigned num_dwords = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_dwords);
      LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");

      result = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_dwords; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef comp = LLVMBuildExtractElement(ctx->builder, src_vec, idx, "");
         comp = _ac_build_ds_swizzle(ctx, comp, mask);
         result = LLVMBuildInsertElement(ctx->builder, result, comp, idx, "");
      }
   }
   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3422 
/* Wrap a value in the llvm.amdgcn.wwm intrinsic (whole-wave mode). */
static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, orig_type);
   char intr_name[32], type_name[8];

   src = ac_to_integer(ctx, src);

   /* Extend small types to i32 for the intrinsic. */
   if (bitsize < 32)
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   /* The intrinsic name is suffixed with the operand type. */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type_name, sizeof(type_name));
   snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.wwm.%s", type_name);
   LLVMValueRef ret = ac_build_intrinsic(ctx, intr_name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1,
                                         AC_FUNC_ATTR_READNONE);

   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, orig_type), "");

   return LLVMBuildBitCast(ctx->builder, ret, orig_type, "");
}
3445 
/* Emit llvm.amdgcn.set.inactive: active lanes keep "src", inactive lanes
 * get "inactive". */
static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
                                          LLVMValueRef inactive)
{
   char intr_name[33], type_name[8];
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, orig_type);

   src = ac_to_integer(ctx, src);
   inactive = ac_to_integer(ctx, inactive);

   /* Extend small types to i32 for the intrinsic. */
   if (bitsize < 32) {
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
   }

   /* The intrinsic name is suffixed with the operand type. */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type_name, sizeof(type_name));
   snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.set.inactive.%s", type_name);
   LLVMValueRef ret =
      ac_build_intrinsic(ctx, intr_name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, orig_type, "");

   return ret;
}
3470 
get_reduction_identity(struct ac_llvm_context * ctx,nir_op op,unsigned type_size)3471 static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
3472                                            unsigned type_size)
3473 {
3474 
3475    if (type_size == 0) {
3476       switch (op) {
3477       case nir_op_ior:
3478       case nir_op_ixor:
3479          return LLVMConstInt(ctx->i1, 0, 0);
3480       case nir_op_iand:
3481          return LLVMConstInt(ctx->i1, 1, 0);
3482       default:
3483          unreachable("bad reduction intrinsic");
3484       }
3485    } else if (type_size == 1) {
3486       switch (op) {
3487       case nir_op_iadd:
3488          return ctx->i8_0;
3489       case nir_op_imul:
3490          return ctx->i8_1;
3491       case nir_op_imin:
3492          return LLVMConstInt(ctx->i8, INT8_MAX, 0);
3493       case nir_op_umin:
3494          return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
3495       case nir_op_imax:
3496          return LLVMConstInt(ctx->i8, INT8_MIN, 0);
3497       case nir_op_umax:
3498          return ctx->i8_0;
3499       case nir_op_iand:
3500          return LLVMConstInt(ctx->i8, -1, 0);
3501       case nir_op_ior:
3502          return ctx->i8_0;
3503       case nir_op_ixor:
3504          return ctx->i8_0;
3505       default:
3506          unreachable("bad reduction intrinsic");
3507       }
3508    } else if (type_size == 2) {
3509       switch (op) {
3510       case nir_op_iadd:
3511          return ctx->i16_0;
3512       case nir_op_fadd:
3513          return ctx->f16_0;
3514       case nir_op_imul:
3515          return ctx->i16_1;
3516       case nir_op_fmul:
3517          return ctx->f16_1;
3518       case nir_op_imin:
3519          return LLVMConstInt(ctx->i16, INT16_MAX, 0);
3520       case nir_op_umin:
3521          return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
3522       case nir_op_fmin:
3523          return LLVMConstReal(ctx->f16, INFINITY);
3524       case nir_op_imax:
3525          return LLVMConstInt(ctx->i16, INT16_MIN, 0);
3526       case nir_op_umax:
3527          return ctx->i16_0;
3528       case nir_op_fmax:
3529          return LLVMConstReal(ctx->f16, -INFINITY);
3530       case nir_op_iand:
3531          return LLVMConstInt(ctx->i16, -1, 0);
3532       case nir_op_ior:
3533          return ctx->i16_0;
3534       case nir_op_ixor:
3535          return ctx->i16_0;
3536       default:
3537          unreachable("bad reduction intrinsic");
3538       }
3539    } else if (type_size == 4) {
3540       switch (op) {
3541       case nir_op_iadd:
3542          return ctx->i32_0;
3543       case nir_op_fadd:
3544          return ctx->f32_0;
3545       case nir_op_imul:
3546          return ctx->i32_1;
3547       case nir_op_fmul:
3548          return ctx->f32_1;
3549       case nir_op_imin:
3550          return LLVMConstInt(ctx->i32, INT32_MAX, 0);
3551       case nir_op_umin:
3552          return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
3553       case nir_op_fmin:
3554          return LLVMConstReal(ctx->f32, INFINITY);
3555       case nir_op_imax:
3556          return LLVMConstInt(ctx->i32, INT32_MIN, 0);
3557       case nir_op_umax:
3558          return ctx->i32_0;
3559       case nir_op_fmax:
3560          return LLVMConstReal(ctx->f32, -INFINITY);
3561       case nir_op_iand:
3562          return LLVMConstInt(ctx->i32, -1, 0);
3563       case nir_op_ior:
3564          return ctx->i32_0;
3565       case nir_op_ixor:
3566          return ctx->i32_0;
3567       default:
3568          unreachable("bad reduction intrinsic");
3569       }
3570    } else { /* type_size == 64bit */
3571       switch (op) {
3572       case nir_op_iadd:
3573          return ctx->i64_0;
3574       case nir_op_fadd:
3575          return ctx->f64_0;
3576       case nir_op_imul:
3577          return ctx->i64_1;
3578       case nir_op_fmul:
3579          return ctx->f64_1;
3580       case nir_op_imin:
3581          return LLVMConstInt(ctx->i64, INT64_MAX, 0);
3582       case nir_op_umin:
3583          return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
3584       case nir_op_fmin:
3585          return LLVMConstReal(ctx->f64, INFINITY);
3586       case nir_op_imax:
3587          return LLVMConstInt(ctx->i64, INT64_MIN, 0);
3588       case nir_op_umax:
3589          return ctx->i64_0;
3590       case nir_op_fmax:
3591          return LLVMConstReal(ctx->f64, -INFINITY);
3592       case nir_op_iand:
3593          return LLVMConstInt(ctx->i64, -1, 0);
3594       case nir_op_ior:
3595          return ctx->i64_0;
3596       case nir_op_ixor:
3597          return ctx->i64_0;
3598       default:
3599          unreachable("bad reduction intrinsic");
3600       }
3601    }
3602 }
3603 
/* Emit one combining step of a reduction: lhs "op" rhs.
 * Integer min/max are lowered to compare+select; float min/max call the
 * llvm.minnum/llvm.maxnum intrinsic matching the operand's bit width.
 */
static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
                                    nir_op op)
{
   LLVMBuilderRef b = ctx->builder;
   const unsigned size = ac_get_type_size(LLVMTypeOf(lhs));

   switch (op) {
   case nir_op_iadd:
      return LLVMBuildAdd(b, lhs, rhs, "");
   case nir_op_fadd:
      return LLVMBuildFAdd(b, lhs, rhs, "");
   case nir_op_imul:
      return LLVMBuildMul(b, lhs, rhs, "");
   case nir_op_fmul:
      return LLVMBuildFMul(b, lhs, rhs, "");
   case nir_op_iand:
      return LLVMBuildAnd(b, lhs, rhs, "");
   case nir_op_ior:
      return LLVMBuildOr(b, lhs, rhs, "");
   case nir_op_ixor:
      return LLVMBuildXor(b, lhs, rhs, "");
   case nir_op_imin:
      return LLVMBuildSelect(b, LLVMBuildICmp(b, LLVMIntSLT, lhs, rhs, ""), lhs, rhs, "");
   case nir_op_umin:
      return LLVMBuildSelect(b, LLVMBuildICmp(b, LLVMIntULT, lhs, rhs, ""), lhs, rhs, "");
   case nir_op_imax:
      return LLVMBuildSelect(b, LLVMBuildICmp(b, LLVMIntSGT, lhs, rhs, ""), lhs, rhs, "");
   case nir_op_umax:
      return LLVMBuildSelect(b, LLVMBuildICmp(b, LLVMIntUGT, lhs, rhs, ""), lhs, rhs, "");
   case nir_op_fmin:
   case nir_op_fmax: {
      const bool is_min = op == nir_op_fmin;
      const char *name;
      LLVMTypeRef type;

      if (size == 8) {
         name = is_min ? "llvm.minnum.f64" : "llvm.maxnum.f64";
         type = ctx->f64;
      } else if (size == 4) {
         name = is_min ? "llvm.minnum.f32" : "llvm.maxnum.f32";
         type = ctx->f32;
      } else {
         name = is_min ? "llvm.minnum.f16" : "llvm.maxnum.f16";
         type = ctx->f16;
      }
      return ac_build_intrinsic(ctx, name, type, (LLVMValueRef[]){lhs, rhs}, 2,
                                AC_FUNC_ATTR_READNONE);
   }
   default:
      unreachable("bad reduction intrinsic");
   }
}
3650 
/**
 * Shift the wave's values up by one lane (towards higher lane indices).
 *
 * \param src The value to shift.
 * \param identity The value that ends up in lane 0.
 * \param maxprefix specifies that the result only needs to be correct for a
 *     prefix of this many threads
 * \return src, shifted 1 lane up, and identity shifted into lane 0.
 */
static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
                                               LLVMValueRef identity, unsigned maxprefix)
{
   if (ctx->gfx_level >= GFX10) {
      /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
      LLVMValueRef active, tmp1, tmp2;
      LLVMValueRef tid = ac_get_thread_id(ctx);

      /* Shift right within each row of 16 lanes; row_sr cannot cross row
       * boundaries, so the first lane of each row reads "identity". */
      tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);

      /* Cross-row value — presumably exchanges data between adjacent rows
       * of 16 (see ac_build_permlane16); used below to patch lanes 16/48. */
      tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);

      if (maxprefix > 32) {
         /* Lane 32 must read lane 31, which the row-based moves above
          * cannot reach; fall back to a readlane for that one lane. */
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");

         tmp2 = LLVMBuildSelect(ctx->builder, active,
                                ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
                                tmp2, "");

         /* Lanes with tid % 32 == 16 (i.e. lanes 16 and 48) take the
          * cross-row value; all other lanes keep the row-local shift. */
         active = LLVMBuildOr(
            ctx->builder, active,
            LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
                          LLVMConstInt(ctx->i32, 0x10, false), ""),
            "");
         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      } else if (maxprefix > 16) {
         /* Only lane 16 needs the cross-row value. */
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");

         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      }
   } else if (ctx->gfx_level >= GFX8) {
      /* GFX8/9 have a native wavefront shift-right DPP mode. */
      return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
   }

   /* wavefront shift_right by 1 on SI/CI: compose the shift from ds_swizzle
    * moves of increasing span, then patch the group-boundary lanes. */
   LLVMValueRef active, tmp1, tmp2;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   /* Shift right within each group of 4 lanes. */
   tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
   /* Lanes with tid % 8 == 4 take this swizzle's value instead (presumably
    * the value of the preceding lane across the 4-lane boundary). */
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
                          LLVMConstInt(ctx->i32, 0x4, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* Same fix-up for lanes with tid % 16 == 8 (the 8-lane boundary). */
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
                          LLVMConstInt(ctx->i32, 0x8, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* And for lanes with tid % 32 == 16 (the 16-lane boundary). */
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
                          LLVMConstInt(ctx->i32, 0x10, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* ds_swizzle cannot cross the 32-lane boundary: lane 32 reads lane 31
    * via a readlane. */
   tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* Finally, lane 0 receives the identity. */
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");
   return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
}
3720 
/**
 * Emit a wave-wide prefix scan of "src" with operator "op".
 *
 * \param identity the identity element of "op" (fills shifted-in positions)
 * \param maxprefix specifies that the result only needs to be correct for a
 *     prefix of this many threads
 * \param inclusive whether lane i's result includes lane i's own value
 */
static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
                                  LLVMValueRef identity, unsigned maxprefix, bool inclusive)
{
   LLVMValueRef result, tmp;

   /* An exclusive scan is an inclusive scan of the input shifted up by one
    * lane, with the identity placed in lane 0. */
   if (!inclusive)
      src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);

   result = src;

   /* SI/CI have no DPP: build the classic log-step scan from ds_swizzle
    * moves with strides 1, 2, 4, 8, 16 and a final readlane across the
    * 32-lane boundary. At each step, only lanes whose tid has the stride
    * bit set combine in the fetched value; the rest combine the identity
    * (a no-op). */
   if (ctx->gfx_level <= GFX7) {
      assert(maxprefix == 64);
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;
      /* Stride 1. */
      tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Stride 2. */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Stride 4. */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Stride 8. */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Stride 16. */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Stride 32: lanes 32..63 fold in lane 31's running total. */
      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   /* GFX8+: DPP row_sr shifts feed the first steps of the scan; lanes with
    * no source keep "identity". Stop as soon as the requested prefix length
    * is covered. */
   if (maxprefix <= 1)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 2)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 3)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 4)
      return result;
   /* From here on, shift the accumulated result rather than the raw input;
    * the bank masks (0xe, 0xc) leave the lanes below the stride untouched. */
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 8)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 16)
      return result;

   if (ctx->gfx_level >= GFX10) {
      /* GFX10 removed the row_bcast DPP modes; use permlane16 + readlane
       * for the cross-row and cross-half steps instead. */
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;

      tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);

      /* Only the upper row of each 32-lane half (tid & 16) folds it in. */
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
                             ctx->i32_0, "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);

      if (maxprefix <= 32)
         return result;

      /* Lanes 32..63 fold in lane 31's running total. */
      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));

      active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   /* GFX8/9: broadcast lane 15 into rows 1 and 3 (row mask 0xa), then
    * lane 31 into the upper half (row mask 0xc). */
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 32)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   return result;
}
3835 
/* Inclusive prefix scan of "src" across the wave with operator "op".
 * Booleans summed with iadd are lowered to ballot + mbcnt instead of the
 * generic scan; everything else runs the scan in WWM with inactive lanes
 * holding the operator's identity.
 */
LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   LLVMBuilderRef builder = ctx->builder;

   if (op == nir_op_iadd && LLVMTypeOf(src) == ctx->i1) {
      /* Count set bits below-or-at this lane: mbcnt of the ballot gives the
       * exclusive count, adding the lane's own bit makes it inclusive. */
      LLVMValueRef bit = LLVMBuildZExt(builder, src, ctx->i32, "");
      LLVMValueRef below = ac_build_mbcnt(ctx, ac_build_ballot(ctx, bit));
      return LLVMBuildAdd(builder, below, bit, "");
   }

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef value = ac_build_set_inactive(ctx, src, identity);
   value = LLVMBuildBitCast(builder, value, LLVMTypeOf(identity), "");
   value = ac_build_scan(ctx, op, value, identity, ctx->wave_size, true);

   return ac_build_wwm(ctx, value);
}
3858 
/* Exclusive prefix scan of "src" across the wave with operator "op".
 * Booleans summed with iadd reduce to mbcnt of the ballot (the number of
 * set bits in lower lanes); everything else runs the scan in WWM with
 * inactive lanes holding the operator's identity.
 */
LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   LLVMBuilderRef builder = ctx->builder;

   if (op == nir_op_iadd && LLVMTypeOf(src) == ctx->i1) {
      LLVMValueRef bit = LLVMBuildZExt(builder, src, ctx->i32, "");
      return ac_build_mbcnt(ctx, ac_build_ballot(ctx, bit));
   }

   ac_build_optimization_barrier(ctx, &src, false);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef value = ac_build_set_inactive(ctx, src, identity);
   value = LLVMBuildBitCast(builder, value, LLVMTypeOf(identity), "");
   value = ac_build_scan(ctx, op, value, identity, ctx->wave_size, false);

   return ac_build_wwm(ctx, value);
}
3880 
/* Reduce "src" with operator "op" within clusters of "cluster_size" lanes.
 *
 * cluster_size is expected to be a power of two up to the wave size. The
 * reduction runs in WWM, doubling the combined span at each step with a
 * lane-swap appropriate for the GPU generation, and returning early once
 * the requested cluster size is covered.
 */
LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
                             unsigned cluster_size)
{
   if (cluster_size == 1)
      return src;
   ac_build_optimization_barrier(ctx, &src, false);
   LLVMValueRef result, swap;
   /* Inactive lanes contribute the operator's identity so they don't
    * corrupt the cluster totals. */
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   /* Swap adjacent lane pairs (1,0,3,2) and combine -> clusters of 2. */
   swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 2)
      return ac_build_wwm(ctx, result);

   /* Swap lane pairs across the quad (2,3,0,1) -> clusters of 4. */
   swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 4)
      return ac_build_wwm(ctx, result);

   /* Mirror within each half-row (DPP on GFX8+, ds_swizzle xor-4 before)
    * -> clusters of 8. */
   if (ctx->gfx_level >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 8)
      return ac_build_wwm(ctx, result);

   /* Mirror within each row of 16 (ds_swizzle xor-8 before GFX8)
    * -> clusters of 16. */
   if (ctx->gfx_level >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 16)
      return ac_build_wwm(ctx, result);

   /* Cross-row step -> clusters of 32. GFX10 uses permlane16; GFX8/9 use
    * row_bcast15 unless the result must be correct in every lane of the
    * 32-cluster (cluster_size == 32), in which case the xor-16 ds_swizzle
    * butterfly is used instead. */
   if (ctx->gfx_level >= GFX10)
      swap = ac_build_permlane16(ctx, result, 0, true, false);
   else if (ctx->gfx_level >= GFX8 && cluster_size != 32)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 32)
      return ac_build_wwm(ctx, result);

   if (ctx->gfx_level >= GFX8) {
      /* Final cross-half step for a full wave64 reduction; the result is
       * then broadcast from lane 63 so every lane sees it. */
      if (ctx->wave_size == 64) {
         if (ctx->gfx_level >= GFX10)
            swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
         else
            swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
         result = ac_build_alu_op(ctx, result, swap, op);
         result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
      }

      return ac_build_wwm(ctx, result);
   } else {
      /* SI/CI: combine the totals of the two 32-lane halves (lanes 0 and
       * 32 hold them after the butterfly steps above). */
      swap = ac_build_readlane(ctx, result, ctx->i32_0);
      result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
      result = ac_build_alu_op(ctx, result, swap, op);
      return ac_build_wwm(ctx, result);
   }
}
3945 
3946 /**
3947  * "Top half" of a scan that reduces per-wave values across an entire
3948  * workgroup.
3949  *
3950  * The source value must be present in the highest lane of the wave, and the
3951  * highest lane must be live.
3952  */
ac_build_wg_wavescan_top(struct ac_llvm_context * ctx,struct ac_wg_scan * ws)3953 void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
3954 {
3955    if (ws->maxwaves <= 1)
3956       return;
3957 
3958    const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
3959    LLVMBuilderRef builder = ctx->builder;
3960    LLVMValueRef tid = ac_get_thread_id(ctx);
3961    LLVMValueRef tmp;
3962 
3963    tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
3964    ac_build_ifcc(ctx, tmp, 1000);
3965    LLVMBuildStore(builder, ws->src,
3966                   LLVMBuildGEP2(builder, LLVMTypeOf(ws->src), ws->scratch, &ws->waveidx, 1, ""));
3967    ac_build_endif(ctx, 1000);
3968 }
3969 
/**
 * "Bottom half" of a scan that reduces per-wave values across an entire
 * workgroup: re-read the per-wave values written by the top half, scan them
 * within one wave, and extract this wave's reduce/inclusive/exclusive
 * results via readlane.
 *
 * The caller must place a barrier between the top and bottom halves.
 */
void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   const LLVMTypeRef type = LLVMTypeOf(ws->src);
   const LLVMValueRef identity = get_reduction_identity(ctx, ws->op, ac_get_type_size(type));

   /* Single wave: the scan of one value is trivial. */
   if (ws->maxwaves <= 1) {
      ws->result_reduce = ws->src;
      ws->result_inclusive = ws->src;
      ws->result_exclusive = identity;
      return;
   }
   /* One lane per wave, all within a single row of 32 lanes. */
   assert(ws->maxwaves <= 32);

   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   LLVMBasicBlockRef bbs[2];
   LLVMValueRef phivalues_scan[2];
   LLVMValueRef tmp, tmp2;

   /* Lanes that skip the if-block contribute an undef phi input; the
    * readlanes below only read lanes that did execute it. */
   bbs[0] = LLVMGetInsertBlock(builder);
   phivalues_scan[0] = LLVMGetUndef(type);

   /* Pick the widest lane range any requested result needs: the reduce
    * reads lane numwaves-1, the inclusive result reads lane waveidx, and
    * the exclusive result only lanes below waveidx. */
   if (ws->enable_reduce)
      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
   else if (ws->enable_inclusive)
      tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
   else
      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
   ac_build_ifcc(ctx, tmp, 1001);
   {
      /* Lane i loads wave i's value from scratch. */
      tmp = LLVMBuildLoad2(builder, LLVMTypeOf(ws->src),
                           LLVMBuildGEP2(builder, LLVMTypeOf(ws->src), ws->scratch, &tid, 1, ""), "");

      ac_build_optimization_barrier(ctx, &tmp, false);

      /* Inclusive in-wave scan of the per-wave values. */
      bbs[1] = LLVMGetInsertBlock(builder);
      phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
   }
   ac_build_endif(ctx, 1001);

   const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);

   /* Total of all waves = inclusive scan value at the last wave's lane. */
   if (ws->enable_reduce) {
      tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
      ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
   }
   /* Inclusive result for this wave = scan value at its own lane. */
   if (ws->enable_inclusive)
      ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
   /* Exclusive result = scan value at the previous wave's lane, or the
    * identity for wave 0 (which has no predecessor). */
   if (ws->enable_exclusive) {
      tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
      tmp = ac_build_readlane(ctx, scan, tmp);
      tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
      ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
   }
}
4031 
/**
 * Inclusive scan of a per-wave value across an entire workgroup.
 *
 * This implies an s_barrier instruction.
 *
 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
 * of the workgroup are live. (This requirement cannot easily be relaxed in a
 * useful manner because of the barrier in the algorithm.)
 */
void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   ac_build_wg_wavescan_top(ctx, ws);
   /* Wait for the top half's scratch stores to complete (LGKM counter)
    * before the barrier, so the bottom half reads up-to-date values. */
   ac_build_waitcnt(ctx, AC_WAIT_LGKM);
   ac_build_s_barrier(ctx, ws->stage);
   ac_build_wg_wavescan_bottom(ctx, ws);
}
4048 
4049 /**
4050  * "Top half" of a scan that reduces per-thread values across an entire
4051  * workgroup.
4052  *
4053  * All lanes must be active when this code runs.
4054  */
ac_build_wg_scan_top(struct ac_llvm_context * ctx,struct ac_wg_scan * ws)4055 void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4056 {
4057    if (ws->enable_exclusive) {
4058       ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4059       if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4060          ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4061       ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4062    } else {
4063       ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4064    }
4065 
4066    bool enable_inclusive = ws->enable_inclusive;
4067    bool enable_exclusive = ws->enable_exclusive;
4068    ws->enable_inclusive = false;
4069    ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4070    ac_build_wg_wavescan_top(ctx, ws);
4071    ws->enable_inclusive = enable_inclusive;
4072    ws->enable_exclusive = enable_exclusive;
4073 }
4074 
4075 /**
4076  * "Bottom half" of a scan that reduces per-thread values across an entire
4077  * workgroup.
4078  *
4079  * The caller must place a barrier between the top and bottom halves.
4080  */
ac_build_wg_scan_bottom(struct ac_llvm_context * ctx,struct ac_wg_scan * ws)4081 void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4082 {
4083    bool enable_inclusive = ws->enable_inclusive;
4084    bool enable_exclusive = ws->enable_exclusive;
4085    ws->enable_inclusive = false;
4086    ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4087    ac_build_wg_wavescan_bottom(ctx, ws);
4088    ws->enable_inclusive = enable_inclusive;
4089    ws->enable_exclusive = enable_exclusive;
4090 
4091    /* ws->result_reduce is already the correct value */
4092    if (ws->enable_inclusive)
4093       ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4094    if (ws->enable_exclusive)
4095       ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4096 }
4097 
/**
 * A scan that reduces per-thread values across an entire workgroup.
 *
 * The caller must ensure that all lanes are active when this code runs
 * (WWM is insufficient!), because there is an implied barrier.
 */
void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   ac_build_wg_scan_top(ctx, ws);
   /* Wait for the top half's scratch stores to complete (LGKM counter)
    * before the barrier, so the bottom half reads up-to-date values. */
   ac_build_waitcnt(ctx, AC_WAIT_LGKM);
   ac_build_s_barrier(ctx, ws->stage);
   ac_build_wg_scan_bottom(ctx, ws);
}
4111 
/* Swizzle one channel pair for dual-source blending: after this, even lanes
 * hold (arg0 from the odd neighbour, arg1 unchanged pairing) such that each
 * lane pair carries both blend sources. The net effect of the three steps
 * below is to exchange data between *arg0 and *arg1 across even/odd lanes.
 */
static void _ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
                                             LLVMValueRef *arg0, LLVMValueRef *arg1)
{
   LLVMValueRef tid;
   LLVMValueRef src0, src1;
   LLVMValueRef tmp0;
   LLVMValueRef params[2];
   LLVMValueRef is_even;

   src0 = LLVMBuildBitCast(ctx->builder, *arg0, ctx->i32, "");
   src1 = LLVMBuildBitCast(ctx->builder, *arg1, ctx->i32, "");

   /* swap odd,even lanes of arg_0 — 0xde54c1 is the dpp8 selector
    * (1,0,3,2,5,4,7,6), i.e. each even lane trades with the next odd one */
   params[0] = src0;
   params[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
   src0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32",
                             ctx->i32, params, 2, AC_FUNC_ATTR_CONVERGENT);

   /* swap even lanes between arg_0 and arg_1 */
   tid = ac_get_thread_id(ctx);
   is_even = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                           LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""),
                           ctx->i32_0, "");
   tmp0 = src0;
   src0 = LLVMBuildSelect(ctx->builder, is_even, src1, src0, "");
   src1 = LLVMBuildSelect(ctx->builder, is_even, tmp0, src1, "");

   /* swap odd,even lanes again for arg_0 (undoes the first swap for the
    * lanes that were not exchanged with arg_1) */
   params[0] = src0;
   params[1] = LLVMConstInt(ctx->i32, 0xde54c1, 0);
   src0 = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32",
                             ctx->i32, params, 2, AC_FUNC_ATTR_CONVERGENT);

   *arg0 = src0;
   *arg1 = src1;
}
4148 
ac_build_dual_src_blend_swizzle(struct ac_llvm_context * ctx,struct ac_export_args * mrt0,struct ac_export_args * mrt1)4149 void ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
4150                                      struct ac_export_args *mrt0,
4151                                      struct ac_export_args *mrt1)
4152 {
4153    assert(ctx->gfx_level >= GFX11);
4154    assert(mrt0->enabled_channels == mrt1->enabled_channels);
4155 
4156    for (int i = 0; i < 4; i++) {
4157       if (mrt0->enabled_channels & (1 << i) && mrt1->enabled_channels & (1 << i))
4158          _ac_build_dual_src_blend_swizzle(ctx, &mrt0->out[i], &mrt1->out[i]);
4159    }
4160 }
4161 
/* Permute the four lanes of each quad so lane i reads lane "lane_i".
 * Uses DPP quad_perm on GFX8+ and the equivalent ds_swizzle before that.
 */
LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
                                   unsigned lane1, unsigned lane2, unsigned lane3)
{
   const unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);

   if (ctx->gfx_level < GFX8)
      return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);

   return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
}
4172 
/* Arbitrary lane shuffle via ds_bpermute: each lane reads "src" from the
 * lane named by "index". The result keeps the caller's original type.
 */
LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   /* ds_bpermute addresses lanes in bytes, hence index * 4. */
   LLVMValueRef byte_index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
   LLVMValueRef value = LLVMBuildZExt(builder, src, ctx->i32, "");

   LLVMValueRef args[] = {byte_index, value};
   LLVMValueRef shuffled =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32, args, 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(builder, shuffled, orig_type, "");
}
4186 
/* Emit llvm.amdgcn.frexp.exp: extract the binary exponent of src0.
 * A 16-bit source yields an i16 result; 32- and 64-bit sources yield i32.
 * Any bitsize other than 16/32 is treated as 64, matching the old
 * if/else chain.
 */
LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   /* const-qualified: these point at string literals, which must never be
    * written through (undefined behavior otherwise). */
   const char *intr;
   LLVMTypeRef type;

   switch (bitsize) {
   case 16:
      intr = "llvm.amdgcn.frexp.exp.i16.f16";
      type = ctx->i16;
      break;
   case 32:
      intr = "llvm.amdgcn.frexp.exp.i32.f32";
      type = ctx->i32;
      break;
   default: /* 64 */
      intr = "llvm.amdgcn.frexp.exp.i32.f64";
      type = ctx->i32;
      break;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
/* Build a call to the amdgcn frexp.mant intrinsic, which returns the
 * mantissa of the floating-point source (same float type as the source).
 *
 * bitsize selects the float width: 16, 32, or 64 bits.
 */
LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   const char *intr; /* points at string literals, so keep it const */

   switch (bitsize) {
   case 16:
      intr = "llvm.amdgcn.frexp.mant.f16";
      type = ctx->f16;
      break;
   case 32:
      intr = "llvm.amdgcn.frexp.mant.f32";
      type = ctx->f32;
      break;
   default:
      /* Catch unexpected widths instead of silently treating them as f64. */
      assert(bitsize == 64);
      intr = "llvm.amdgcn.frexp.mant.f64";
      type = ctx->f64;
      break;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
4229 
/* Build a call to llvm.canonicalize, which returns the canonical form of the
 * floating-point source (e.g. quieting NaNs per the platform rules).
 *
 * bitsize selects the float width: 16, 32, or 64 bits.
 */
LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   const char *intr; /* points at string literals, so keep it const */

   switch (bitsize) {
   case 16:
      intr = "llvm.canonicalize.f16";
      type = ctx->f16;
      break;
   case 32:
      intr = "llvm.canonicalize.f32";
      type = ctx->f32;
      break;
   default:
      /* Catch unexpected widths instead of silently treating them as f64. */
      assert(bitsize == 64);
      intr = "llvm.canonicalize.f64";
      type = ctx->f64;
      break;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
4251 
4252 /*
4253  * this takes an I,J coordinate pair,
4254  * and works out the X and Y derivatives.
4255  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4256  */
LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
{
   LLVMValueRef derivs[4];

   /* For each of I and J, compute the X derivative (mask 1) and the
    * Y derivative (mask 2), producing DDX(I), DDX(J), DDY(I), DDY(J). */
   for (unsigned chan = 0; chan < 2; chan++) {
      LLVMValueRef coord =
         LLVMBuildExtractElement(ctx->builder, interp_ij, LLVMConstInt(ctx->i32, chan, false), "");
      derivs[chan] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, coord);
      derivs[chan + 2] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, coord);
   }
   return ac_build_gather_values(ctx, derivs, 4);
}
4269 
ac_build_load_helper_invocation(struct ac_llvm_context * ctx)4270 LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4271 {
4272    LLVMValueRef result;
4273 
4274    if (LLVM_VERSION_MAJOR >= 13) {
4275       result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0,
4276                                   AC_FUNC_ATTR_READONLY | AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
4277    } else {
4278       result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0,
4279                                   AC_FUNC_ATTR_READNONE);
4280    }
4281    return LLVMBuildNot(ctx->builder, result, "");
4282 }
4283 
ac_build_is_helper_invocation(struct ac_llvm_context * ctx)4284 LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx)
4285 {
4286    if (!ctx->postponed_kill)
4287       return ac_build_load_helper_invocation(ctx);
4288 
4289    /* postponed_kill should be NULL on LLVM 13+ */
4290    assert(LLVM_VERSION_MAJOR < 13);
4291 
4292    /* !(exact && postponed) */
4293    LLVMValueRef exact =
4294       ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);
4295 
4296    LLVMValueRef postponed = LLVMBuildLoad2(ctx->builder, ctx->i1, ctx->postponed_kill, "");
4297    return LLVMBuildNot(ctx->builder, LLVMBuildAnd(ctx->builder, exact, postponed, ""), "");
4298 }
4299 
/* Emit a call to `func` and copy the callee's calling convention onto the
 * call instruction so the two always agree.
 */
LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args,
                           unsigned num_args)
{
   LLVMValueRef call = LLVMBuildCall(ctx->builder, func, args, num_args, "");

   LLVMSetInstructionCallConv(call, LLVMGetFunctionCallConv(func));
   return call;
}
4307 
/* Fill *args with an MRTZ export (depth / stencil / sample mask /
 * alpha-to-mask) for a later ac_build_export().
 *
 * Any of depth/stencil/samplemask/mrt0_alpha may be NULL, but at least one
 * of depth/stencil/samplemask must be non-NULL. is_last marks the shader's
 * final export and sets the valid-mask and DONE bits.
 */
void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
                     LLVMValueRef samplemask, LLVMValueRef mrt0_alpha, bool is_last,
                     struct ac_export_args *args)
{
   unsigned mask = 0;
   /* Pick the SPI Z export format based on which outputs are present. */
   unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL,
                                                mrt0_alpha != NULL);

   assert(depth || stencil || samplemask);

   memset(args, 0, sizeof(*args));

   if (is_last) {
      args->valid_mask = 1; /* whether the EXEC mask is valid */
      args->done = 1;       /* DONE bit */
   }

   /* Specify the target we are exporting */
   args->target = V_008DFC_SQ_EXP_MRTZ;

   args->compr = 0;                       /* COMP flag */
   args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
   args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
   args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
   args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */

   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      assert(!depth);
      args->compr = ctx->gfx_level < GFX11; /* COMPR flag */

      if (stencil) {
         /* Stencil should be in X[23:16]. */
         stencil = ac_to_integer(ctx, stencil);
         stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");
         args->out[0] = ac_to_float(ctx, stencil);
         /* COMPR is off on GFX11 (see above), so the channel mask differs. */
         mask |= ctx->gfx_level >= GFX11 ? 0x1 : 0x3;
      }
      if (samplemask) {
         /* SampleMask should be in Y[15:0]. */
         args->out[1] = samplemask;
         mask |= ctx->gfx_level >= GFX11 ? 0x2 : 0xc;
      }
   } else {
      /* 32-bit formats: one output per channel, in R/G/B/A order. */
      if (depth) {
         args->out[0] = depth;
         mask |= 0x1;
      }
      if (stencil) {
         args->out[1] = stencil;
         mask |= 0x2;
      }
      if (samplemask) {
         args->out[2] = samplemask;
         mask |= 0x4;
      }
      if (mrt0_alpha) {
         args->out[3] = mrt0_alpha;
         mask |= 0x8;
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks
    * at the X writemask component. */
   if (ctx->gfx_level == GFX6 && ctx->family != CHIP_OLAND && ctx->family != CHIP_HAINAN)
      mask |= 0x1;

   /* Specify which components to enable */
   args->enabled_channels = mask;
}
4377 
4378 /* Send GS Alloc Req message from the first wave of the group to SPI.
4379  * Message payload is:
4380  * - bits 0..10: vertices in group
4381  * - bits 12..22: primitives in group
4382  */
void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,
                                   LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tmp;
   bool export_dummy_prim = false;

   /* HW workaround for a GPU hang with 100% culling.
    * We always have to export at least 1 primitive.
    * Export a degenerate triangle using vertex 0 for all 3 vertices.
    */
   if (prim_cnt == ctx->i32_0 && ctx->gfx_level == GFX10) {
      assert(vtx_cnt == ctx->i32_0);
      prim_cnt = ctx->i32_1;
      vtx_cnt = ctx->i32_1;
      export_dummy_prim = true;
   }

   /* Only the first wave (wave_id == 0) sends the message. A NULL wave_id
    * skips the guard — presumably the caller guarantees a single wave;
    * confirm at call sites. */
   if (wave_id)
      ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);

   /* Pack the payload: primitive count into bits 12..22, vertex count into
    * bits 0..10 (see the comment above this function). */
   tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), "");
   tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
   ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp);

   if (export_dummy_prim) {
      struct ac_ngg_prim prim = {0};
      /* The vertex indices are 0,0,0. */
      prim.passthrough = ctx->i32_0;

      struct ac_export_args pos = {0};
      /* The hw culls primitives with NaN. */
      pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = LLVMConstReal(ctx->f32, NAN);
      pos.target = V_008DFC_SQ_EXP_POS;
      pos.enabled_channels = 0xf;
      pos.done = true;

      /* Only thread 0 exports the dummy primitive and its position. */
      ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""),
                    5021);
      ac_build_export_prim(ctx, &prim);
      ac_build_export(ctx, &pos);
      ac_build_endif(ctx, 5021);
   }

   if (wave_id)
      ac_build_endif(ctx, 5020);
}
4430 
4431 
ac_pack_edgeflags_for_export(struct ac_llvm_context * ctx,const struct ac_shader_args * args)4432 LLVMValueRef ac_pack_edgeflags_for_export(struct ac_llvm_context *ctx,
4433                                           const struct ac_shader_args *args)
4434 {
4435    /* Use the following trick to extract the edge flags:
4436     *   extracted = v_and_b32 gs_invocation_id, 0x700 ; get edge flags at bits 8, 9, 10
4437     *   shifted = v_mul_u32_u24 extracted, 0x80402u   ; shift the bits: 8->9, 9->19, 10->29
4438     *   result = v_and_b32 shifted, 0x20080200        ; remove garbage
4439     */
4440    LLVMValueRef tmp = LLVMBuildAnd(ctx->builder,
4441                                    ac_get_arg(ctx, args->gs_invocation_id),
4442                                    LLVMConstInt(ctx->i32, 0x700, 0), "");
4443    tmp = LLVMBuildMul(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x80402u, 0), "");
4444    return LLVMBuildAnd(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x20080200, 0), "");
4445 }
4446 
ac_pack_prim_export(struct ac_llvm_context * ctx,const struct ac_ngg_prim * prim)4447 LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4448 {
4449    /* The prim export format is:
4450     *  - bits 0..8: index 0
4451     *  - bit 9: edge flag 0
4452     *  - bits 10..18: index 1
4453     *  - bit 19: edge flag 1
4454     *  - bits 20..28: index 2
4455     *  - bit 29: edge flag 2
4456     *  - bit 31: null primitive (skip)
4457     */
4458    LLVMBuilderRef builder = ctx->builder;
4459    LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, "");
4460    LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), "");
4461    result = LLVMBuildOr(ctx->builder, result, prim->edgeflags, "");
4462 
4463    for (unsigned i = 0; i < prim->num_vertices; ++i) {
4464       tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), "");
4465       result = LLVMBuildOr(builder, result, tmp, "");
4466    }
4467    return result;
4468 }
4469 
ac_build_export_prim(struct ac_llvm_context * ctx,const struct ac_ngg_prim * prim)4470 void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4471 {
4472    struct ac_export_args args;
4473 
4474    if (prim->passthrough) {
4475       args.out[0] = prim->passthrough;
4476    } else {
4477       args.out[0] = ac_pack_prim_export(ctx, prim);
4478    }
4479 
4480    args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, "");
4481    args.out[1] = LLVMGetUndef(ctx->f32);
4482    args.out[2] = LLVMGetUndef(ctx->f32);
4483    args.out[3] = LLVMGetUndef(ctx->f32);
4484 
4485    args.target = V_008DFC_SQ_EXP_PRIM;
4486    args.enabled_channels = 1;
4487    args.done = true;
4488    args.valid_mask = false;
4489    args.compr = false;
4490 
4491    ac_build_export(ctx, &args);
4492 }
4493 
arg_llvm_type(enum ac_arg_type type,unsigned size,struct ac_llvm_context * ctx)4494 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
4495 {
4496    if (type == AC_ARG_FLOAT) {
4497       return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
4498    } else if (type == AC_ARG_INT) {
4499       return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
4500    } else {
4501       LLVMTypeRef ptr_type;
4502       switch (type) {
4503       case AC_ARG_CONST_PTR:
4504          ptr_type = ctx->i8;
4505          break;
4506       case AC_ARG_CONST_FLOAT_PTR:
4507          ptr_type = ctx->f32;
4508          break;
4509       case AC_ARG_CONST_PTR_PTR:
4510          ptr_type = ac_array_in_const32_addr_space(ctx->i8);
4511          break;
4512       case AC_ARG_CONST_DESC_PTR:
4513          ptr_type = ctx->v4i32;
4514          break;
4515       case AC_ARG_CONST_IMAGE_PTR:
4516          ptr_type = ctx->v8i32;
4517          break;
4518       default:
4519          unreachable("unknown arg type");
4520       }
4521       if (size == 1) {
4522          return ac_array_in_const32_addr_space(ptr_type);
4523       } else {
4524          assert(size == 2);
4525          return ac_array_in_const_addr_space(ptr_type);
4526       }
4527    }
4528 }
4529 
/* Create the shader's main LLVM function: declare one parameter per entry in
 * *args, set the calling convention and parameter attributes, position the
 * builder at the start of the entry block, and configure denormal handling.
 * Returns the new function and stores it in ctx->main_function.
 */
LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
                           enum ac_llvm_calling_convention convention, const char *name,
                           LLVMTypeRef ret_type, LLVMModuleRef module)
{
   LLVMTypeRef arg_types[AC_MAX_ARGS];

   for (unsigned i = 0; i < args->arg_count; i++) {
      arg_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
   }

   LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, args->arg_count, 0);

   LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);
   LLVMBasicBlockRef main_function_body =
      LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");
   LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);

   LLVMSetFunctionCallConv(main_function, convention);
   for (unsigned i = 0; i < args->arg_count; ++i) {
      LLVMValueRef P = LLVMGetParam(main_function, i);

      /* Only SGPR arguments get the attributes below. */
      if (args->args[i].file != AC_ARG_SGPR)
         continue;

      /* Attribute indices are 1-based for parameters (0 is the return value
       * in LLVM's attribute numbering), hence i + 1. */
      ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG);

      if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
         /* SGPR pointers are descriptors/constants: mark them noalias,
          * fully dereferenceable, and 4-byte aligned. */
         ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
         ac_add_attr_dereferenceable(P, UINT64_MAX);
         ac_add_attr_alignment(P, 4);
      }
   }

   ctx->main_function = main_function;

   /* Enable denormals for FP16 and FP64: */
   LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");
   /* Disable denormals for FP32: */
   LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
                                      "preserve-sign,preserve-sign");
   return main_function;
}
4572 
ac_build_s_endpgm(struct ac_llvm_context * ctx)4573 void ac_build_s_endpgm(struct ac_llvm_context *ctx)
4574 {
4575    LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
4576    LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);
4577    LLVMBuildCall2(ctx->builder, calltype, code, NULL, 0, "");
4578 }
4579 
4580 /**
4581  * Convert triangle strip indices to triangle indices. This is used to decompose
4582  * triangle strips into triangles.
4583  */
void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd,
                                                 LLVMValueRef flatshade_first,
                                                 LLVMValueRef index[3])
{
   LLVMBuilderRef b = ctx->builder;

   /* We need to change the vertex order for odd triangles to get correct
    * front/back facing by swapping 2 vertex indices, but we also have to
    * keep the provoking vertex in the same place.
    *
    * If the first vertex is provoking, swap index 1 and 2.
    * If the last vertex is provoking, swap index 0 and 1.
    */
   LLVMValueRef first_pv[3], last_pv[3];

   first_pv[0] = index[0];
   first_pv[1] = LLVMBuildSelect(b, is_odd, index[2], index[1], "");
   first_pv[2] = LLVMBuildSelect(b, is_odd, index[1], index[2], "");

   last_pv[0] = LLVMBuildSelect(b, is_odd, index[1], index[0], "");
   last_pv[1] = LLVMBuildSelect(b, is_odd, index[0], index[1], "");
   last_pv[2] = index[2];

   /* Pick the order matching the provoking-vertex convention. */
   for (unsigned i = 0; i < 3; i++)
      index[i] = LLVMBuildSelect(b, flatshade_first, first_pv[i], last_pv[i], "");
}
4607 
/* Return an i1 that is true when the f32 value `a` is +/-infinity or any NaN,
 * using the amdgcn class intrinsic with a bitmask of fp classes.
 */
LLVMValueRef ac_build_is_inf_or_nan(struct ac_llvm_context *ctx, LLVMValueRef a)
{
   LLVMValueRef class_args[2] = {
      a,
      LLVMConstInt(ctx->i32, S_NAN | Q_NAN | N_INFINITY | P_INFINITY, 0),
   };

   return ac_build_intrinsic(ctx, "llvm.amdgcn.class.f32", ctx->i1, class_args, 2,
                             AC_FUNC_ATTR_READNONE);
}
4617