1 /*
2 * Copyright 2014 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sub license, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
15 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
16 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
17 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
18 * USE OR OTHER DEALINGS IN THE SOFTWARE.
19 *
20 * The above copyright notice and this permission notice (including the
21 * next paragraph) shall be included in all copies or substantial portions
22 * of the Software.
23 *
24 */
25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
26 #include "ac_llvm_build.h"
27
28 #include "ac_nir.h"
29 #include "ac_llvm_util.h"
30 #include "ac_shader_util.h"
31 #include "c11/threads.h"
32 #include "shader_enums.h"
33 #include "sid.h"
34 #include "util/bitscan.h"
35 #include "util/macros.h"
36 #include "util/u_atomic.h"
37 #include "util/u_math.h"
38 #include <llvm-c/Core.h>
39 #include <llvm/Config/llvm-config.h>
40
41 #include <assert.h>
42 #include <stdio.h>
43
44 #define AC_LLVM_INITIAL_CF_DEPTH 4
45
/* Data for if/else/endif and bgnloop/endloop control flow structures.
 */
struct ac_llvm_flow {
   /* Loop exit or next part of if/else/endif. */
   LLVMBasicBlockRef next_block;
   /* First block of the loop body (presumably the back-edge target for
    * endloop — emit/usage code is outside this view). */
   LLVMBasicBlockRef loop_entry_block;
};
53
/* Initialize module-independent parts of the context.
 *
 * Creates the LLVM context, module and builder, and caches the commonly
 * used LLVM types, constants and metadata kind IDs on \p ctx.
 * (Note: despite older comments, ctx::module and ctx::builder ARE created
 * here, via ac_create_module/ac_create_builder.)
 */
void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
                          enum amd_gfx_level gfx_level, enum radeon_family family,
                          bool has_3d_cube_border_color_mipmap,
                          enum ac_float_mode float_mode, unsigned wave_size,
                          unsigned ballot_mask_bits)
{
   ctx->context = LLVMContextCreate();
#if LLVM_VERSION_MAJOR >= 15
   /* This code still relies on typed pointers; disable opaque pointers. */
   LLVMContextSetOpaquePointers(ctx->context, false);
#endif

   ctx->gfx_level = gfx_level;
   ctx->family = family;
   ctx->has_3d_cube_border_color_mipmap = has_3d_cube_border_color_mipmap;
   ctx->wave_size = wave_size;
   ctx->ballot_mask_bits = ballot_mask_bits;
   ctx->float_mode = float_mode;
   ctx->module = ac_create_module(compiler->tm, ctx->context);
   ctx->builder = ac_create_builder(ctx->context, float_mode);

   /* Cached scalar and vector types. */
   ctx->voidt = LLVMVoidTypeInContext(ctx->context);
   ctx->i1 = LLVMInt1TypeInContext(ctx->context);
   ctx->i8 = LLVMInt8TypeInContext(ctx->context);
   ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
   ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
   ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
   ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
   ctx->intptr = ctx->i32;
   ctx->f16 = LLVMHalfTypeInContext(ctx->context);
   ctx->f32 = LLVMFloatTypeInContext(ctx->context);
   ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
   ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
   ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
   ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
   ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
   ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
   ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
   ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
   ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
   ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
   ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
   ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
   /* Mask types sized by the wave and the ballot ABI, respectively. */
   ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
   ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);

   /* Frequently used constants. */
   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
   ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
   ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
   ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
   ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
   ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
   ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
   ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
   ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
   ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
   ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
   ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
   ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);

   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);

   /* Metadata kind IDs (the length argument is strlen of the name). */
   ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);

   ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);

   ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);

   ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
   /* Control-flow state; freed in ac_llvm_context_dispose.
    * NOTE(review): calloc result is not checked; OOM would crash later. */
   ctx->flow = calloc(1, sizeof(*ctx->flow));
}
132
/* Free the control-flow bookkeeping allocated by ac_llvm_context_init.
 *
 * Only ctx->flow (and its stack) is released here; the LLVM context,
 * module and builder are disposed elsewhere (not visible in this file view).
 */
void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
{
   free(ctx->flow->stack);
   free(ctx->flow);
   ctx->flow = NULL; /* defend against use-after-free/double-free */
}
139
/* Return the component count of a value: the vector size for vectors,
 * 1 for anything else.
 */
int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef type = LLVMTypeOf(value);

   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
      return LLVMGetVectorSize(type);

   return 1;
}
147
/* Extract component \p index from \p value. Scalars are passed through
 * (only index 0 is valid for them).
 */
LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
      LLVMValueRef idx = LLVMConstInt(ac->i32, index, false);
      return LLVMBuildExtractElement(ac->builder, value, idx, "");
   }

   /* A scalar has exactly one "component". */
   assert(index == 0);
   return value;
}
157
/* Return the bit width of a scalar type, or of the element type of a
 * vector. LDS pointers count as 32 bits.
 */
int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   LLVMTypeRef scalar =
      LLVMGetTypeKind(type) == LLVMVectorTypeKind ? LLVMGetElementType(type) : type;

   switch (LLVMGetTypeKind(scalar)) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(scalar);
   case LLVMPointerTypeKind:
      if (LLVMGetPointerAddressSpace(scalar) == AC_ADDR_SPACE_LDS)
         return 32;
      break; /* other address spaces fall through to the checks below */
   default:
      break;
   }

   if (scalar == ctx->f16)
      return 16;
   if (scalar == ctx->f32)
      return 32;
   if (scalar == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}
180
/* Return the size in bytes of a type; vectors and arrays are computed
 * recursively from their element type. 32-bit-const pointers are 4 bytes,
 * all other pointers 8.
 */
unsigned ac_get_type_size(LLVMTypeRef type)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(type);

   if (kind == LLVMIntegerTypeKind)
      return LLVMGetIntTypeWidth(type) / 8;
   if (kind == LLVMHalfTypeKind)
      return 2;
   if (kind == LLVMFloatTypeKind)
      return 4;
   if (kind == LLVMDoubleTypeKind)
      return 8;
   if (kind == LLVMPointerTypeKind)
      return LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT ? 4 : 8;
   if (kind == LLVMVectorTypeKind)
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   if (kind == LLVMArrayTypeKind)
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));

   assert(0);
   return 0;
}
207
/* Map a scalar type to the integer type of the same bit width. */
static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   /* i1 and i8 are already integers with no float counterpart. */
   if (t == ctx->i1 || t == ctx->i8)
      return t;
   if (t == ctx->f16 || t == ctx->i16)
      return ctx->i16;
   if (t == ctx->f32 || t == ctx->i32)
      return ctx->i32;
   if (t == ctx->f64 || t == ctx->i64)
      return ctx->i64;

   unreachable("Unhandled integer size");
}
223
/* Return the integer type matching \p t: element-wise for vectors, the
 * pointer-sized integer for pointers, otherwise the same-width scalar int.
 */
LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   switch (LLVMGetTypeKind(t)) {
   case LLVMVectorTypeKind:
      return LLVMVectorType(to_integer_type_scalar(ctx, LLVMGetElementType(t)),
                            LLVMGetVectorSize(t));
   case LLVMPointerTypeKind:
      switch (LLVMGetPointerAddressSpace(t)) {
      case AC_ADDR_SPACE_GLOBAL:
         return ctx->i64;
      case AC_ADDR_SPACE_CONST_32BIT:
      case AC_ADDR_SPACE_LDS:
         return ctx->i32;
      default:
         unreachable("unhandled address space");
      }
   default:
      return to_integer_type_scalar(ctx, t);
   }
}
243
/* Reinterpret \p v as an integer of the same width (ptrtoint for pointers,
 * bitcast otherwise).
 */
LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef src_type = LLVMTypeOf(v);
   LLVMTypeRef dst_type = ac_to_integer_type(ctx, src_type);

   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildPtrToInt(ctx->builder, v, dst_type, "");

   return LLVMBuildBitCast(ctx->builder, v, dst_type, "");
}
252
/* Like ac_to_integer, but pointers are passed through unchanged. */
LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   if (LLVMGetTypeKind(LLVMTypeOf(v)) == LLVMPointerTypeKind)
      return v;

   return ac_to_integer(ctx, v);
}
260
/* Map a scalar type to the float type of the same bit width.
 * i8 stays i8 since there is no 8-bit float.
 */
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return t;
   if (t == ctx->i16 || t == ctx->f16)
      return ctx->f16;
   if (t == ctx->i32 || t == ctx->f32)
      return ctx->f32;
   if (t == ctx->i64 || t == ctx->f64)
      return ctx->f64;

   unreachable("Unhandled float size");
}
274
/* Return the float type matching \p t; element-wise for vectors. */
LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) != LLVMVectorTypeKind)
      return to_float_type_scalar(ctx, t);

   return LLVMVectorType(to_float_type_scalar(ctx, LLVMGetElementType(t)),
                         LLVMGetVectorSize(t));
}
283
/* Reinterpret \p v as the same-width float type via bitcast. */
LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef float_type = ac_to_float_type(ctx, LLVMTypeOf(v));
   return LLVMBuildBitCast(ctx->builder, v, float_type, "");
}
289
/* Declare (on first use) and call an intrinsic or external function.
 *
 * \p attrib_mask is a set of AC_FUNC_ATTR_* flags. Unless
 * AC_FUNC_ATTR_LEGACY is set, attributes are applied to the call site;
 * otherwise they are applied once to the function declaration.
 */
LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
                                unsigned attrib_mask)
{
   LLVMValueRef call;
   bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);

   /* Derive the function signature from the actual argument types. */
   LLVMTypeRef param_types[32];
   assert(param_count <= 32);
   for (unsigned i = 0; i < param_count; ++i) {
      assert(params[i]);
      param_types[i] = LLVMTypeOf(params[i]);
   }

   LLVMTypeRef function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
   LLVMValueRef function = LLVMGetNamedFunction(ctx->module, name);

   if (!function) {
      /* First call with this name in the module: add the declaration. */
      function = LLVMAddFunction(ctx->module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      LLVMSetLinkage(function, LLVMExternalLinkage);

      if (!set_callsite_attrs)
         ac_add_func_attributes(ctx->context, function, attrib_mask);
   }

   call = LLVMBuildCall2(ctx->builder, function_type, function, params, param_count, "");
   if (set_callsite_attrs)
      ac_add_func_attributes(ctx->context, call, attrib_mask);
   return call;
}
322
323 /**
324 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
325 * intrinsic names).
326 */
ac_build_type_name_for_intr(LLVMTypeRef type,char * buf,unsigned bufsize)327 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
328 {
329 LLVMTypeRef elem_type = type;
330
331 if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {
332 unsigned count = LLVMCountStructElementTypes(type);
333 int ret = snprintf(buf, bufsize, "sl_");
334 buf += ret;
335 bufsize -= ret;
336
337 LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));
338 LLVMGetStructElementTypes(type, elems);
339
340 for (unsigned i = 0; i < count; i++) {
341 ac_build_type_name_for_intr(elems[i], buf, bufsize);
342 ret = strlen(buf);
343 buf += ret;
344 bufsize -= ret;
345 }
346
347 snprintf(buf, bufsize, "s");
348 return;
349 }
350
351 assert(bufsize >= 8);
352 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
353 int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
354 if (ret < 0) {
355 char *type_name = LLVMPrintTypeToString(type);
356 fprintf(stderr, "Error building type name for: %s\n", type_name);
357 LLVMDisposeMessage(type_name);
358 return;
359 }
360 elem_type = LLVMGetElementType(type);
361 buf += ret;
362 bufsize -= ret;
363 }
364 switch (LLVMGetTypeKind(elem_type)) {
365 default:
366 break;
367 case LLVMIntegerTypeKind:
368 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
369 break;
370 case LLVMHalfTypeKind:
371 snprintf(buf, bufsize, "f16");
372 break;
373 case LLVMFloatTypeKind:
374 snprintf(buf, bufsize, "f32");
375 break;
376 case LLVMDoubleTypeKind:
377 snprintf(buf, bufsize, "f64");
378 break;
379 }
380 }
381
382 /**
383 * Helper function that builds an LLVM IR PHI node and immediately adds
384 * incoming edges.
385 */
ac_build_phi(struct ac_llvm_context * ctx,LLVMTypeRef type,unsigned count_incoming,LLVMValueRef * values,LLVMBasicBlockRef * blocks)386 LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
387 LLVMValueRef *values, LLVMBasicBlockRef *blocks)
388 {
389 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
390 LLVMAddIncoming(phi, values, blocks, count_incoming);
391 return phi;
392 }
393
/* Emit a workgroup barrier (s_barrier), except where it is known to be
 * unnecessary.
 */
void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage)
{
   /* GFX6 only: s_barrier isn't needed in TCS because an entire patch always fits into
    * a single wave due to a bug workaround disallowing multi-wave HS workgroups.
    */
   bool skip_barrier = ctx->gfx_level == GFX6 && stage == MESA_SHADER_TESS_CTRL;

   if (skip_barrier)
      return;

   ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
}
404
/* Prevent optimizations (at least of memory accesses) across the current
 * point in the program by emitting empty inline assembly that is marked as
 * having side effects.
 *
 * Optionally, a value can be passed through the inline assembly to prevent
 * LLVM from hoisting calls to ReadNone functions.
 */
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr)
{
   /* A unique comment per barrier keeps LLVM from merging/CSE'ing two
    * otherwise identical asm blobs. */
   static int counter = 0;

   LLVMBuilderRef builder = ctx->builder;
   char code[16];
   /* "=s,0"/"=v,0": output in an SGPR/VGPR, input tied to the same register. */
   const char *constraint = sgpr ? "=s,0" : "=v,0";

   snprintf(code, sizeof(code), "; %d", (int)p_atomic_inc_return(&counter));

   if (!pgpr) {
      /* Pure barrier: side-effecting void asm with no operands. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
      LLVMBuildCall2(builder, ftype, inlineasm, NULL, 0, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i32) {
      /* Simple version for i32 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else if (LLVMTypeOf(*pgpr) == ctx->i16) {
      /* Simple version for i16 that allows the caller to set LLVM metadata on the call
       * instruction. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i16, &ctx->i16, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else if (LLVMGetTypeKind(LLVMTypeOf(*pgpr)) == LLVMPointerTypeKind) {
      /* Pointers go through the asm unchanged, typed as themselves. */
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      LLVMTypeRef ftype = LLVMFunctionType(type, &type, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);

      *pgpr = LLVMBuildCall2(builder, ftype, inlineasm, pgpr, 1, "");
   } else {
      /* General case: view the value as a vector of i32 and thread only
       * element 0 through the asm; re-inserting it still makes the whole
       * value depend on the barrier. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, constraint, true, false);
      LLVMTypeRef type = LLVMTypeOf(*pgpr);
      unsigned bitsize = ac_get_elem_bits(ctx, type);
      LLVMValueRef vgpr = *pgpr;
      LLVMTypeRef vgpr_type;
      unsigned vgpr_size;
      LLVMValueRef vgpr0;

      /* Widen sub-32-bit values so the i32 reinterpretation below works. */
      if (bitsize < 32)
         vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");

      vgpr_type = LLVMTypeOf(vgpr);
      vgpr_size = ac_get_type_size(vgpr_type);

      assert(vgpr_size % 4 == 0);

      vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
      vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
      vgpr0 = LLVMBuildCall2(builder, ftype, inlineasm, &vgpr0, 1, "");
      vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
      vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

      /* Undo the widening. */
      if (bitsize < 32)
         vgpr = LLVMBuildTrunc(builder, vgpr, type, "");

      *pgpr = vgpr;
   }
}
476
ac_build_shader_clock(struct ac_llvm_context * ctx,nir_scope scope)477 LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)
478 {
479 const char *subgroup = "llvm.readcyclecounter";
480 const char *name = scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;
481
482 LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
483 return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
484 }
485
/* Build a wave-wide ballot of \p value via the amdgcn icmp intrinsic:
 * the result is an iN_wavemask whose bits reflect value != 0 per lane.
 */
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   /* The intrinsic variant used below compares i32 operands. */
   if (LLVMTypeOf(value) == ctx->i1)
      value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");

   if (ctx->wave_size == 64)
      name = "llvm.amdgcn.icmp.i64.i32";
   else
      name = "llvm.amdgcn.icmp.i32.i32";

   /* args = (value, 0, NE). */
   LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   /* We currently have no other way to prevent LLVM from lifting the icmp
    * calls to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &args[0], false);

   args[0] = ac_to_integer(ctx, args[0]);

   return ac_build_intrinsic(
      ctx, name, ctx->iN_wavemask, args, 3,
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}
511
/* Turn an i1 value into a wave-sized SGPR mask using the i1 variant of the
 * amdgcn icmp intrinsic (value != false per lane).
 */
LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *intr_name =
      ctx->wave_size == 64 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i32.i1";
   LLVMValueRef args[3] = {value, ctx->i1false, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   return ac_build_intrinsic(ctx, intr_name, ctx->iN_wavemask, args, 3,
                             AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE |
                                AC_FUNC_ATTR_CONVERGENT);
}
531
/* True iff every active lane has a non-zero \p value. */
LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active_lanes = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef voting_lanes = ac_build_ballot(ctx, value);

   /* All voted iff the vote mask equals the active-lane mask. */
   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, voting_lanes, active_lanes, "");
}
538
/* True iff at least one active lane has a non-zero \p value. */
LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef voting_lanes = ac_build_ballot(ctx, value);
   LLVMValueRef zero_mask = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   return LLVMBuildICmp(ctx->builder, LLVMIntNE, voting_lanes, zero_mask, "");
}
545
/* True iff all active lanes agree on \p value (all non-zero or all zero). */
LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef active_lanes = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef voting_lanes = ac_build_ballot(ctx, value);
   LLVMValueRef zero_mask = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   LLVMValueRef all_voted =
      LLVMBuildICmp(ctx->builder, LLVMIntEQ, voting_lanes, active_lanes, "");
   LLVMValueRef none_voted =
      LLVMBuildICmp(ctx->builder, LLVMIntEQ, voting_lanes, zero_mask, "");

   return LLVMBuildOr(ctx->builder, all_voted, none_voted, "");
}
556
/* Gather values[component .. component + value_count) into a vector.
 * With value_count == 1, the selected scalar is returned directly.
 */
LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   if (!value_count)
      unreachable("value_count is 0");
   if (value_count == 1)
      return values[component];

   LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[component]), value_count);
   LLVMValueRef vec = LLVMGetUndef(vec_type);

   for (unsigned i = 0; i < value_count; i++) {
      LLVMValueRef slot = LLVMConstInt(ctx->i32, i, false);
      vec = LLVMBuildInsertElement(ctx->builder, vec, values[component + i], slot, "");
   }
   return vec;
}
577
/* Gather value_count values (strided by value_stride) into a vector.
 * A single value is returned as-is unless always_vector forces a
 * one-element vector.
 */
LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride,
                                             bool always_vector)
{
   if (!value_count)
      unreachable("value_count is 0");
   if (value_count == 1 && !always_vector)
      return values[0];

   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(values[0]), value_count));

   for (unsigned i = 0; i < value_count; i++) {
      LLVMValueRef elem = values[i * value_stride];
      vec = LLVMBuildInsertElement(builder, vec, elem, LLVMConstInt(ctx->i32, i, false), "");
   }
   return vec;
}
601
/* Gather value_count consecutive values into a vector; a single value is
 * returned unchanged. Convenience wrapper with stride 1 and no forced
 * vectorization.
 */
LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   return ac_build_gather_values_extended(ctx, values, value_count, 1, false);
}
607
/* Concatenate the components of \p a and \p b into one vector. */
LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   unsigned num_a = ac_get_llvm_num_components(a);
   unsigned num_b = ac_get_llvm_num_components(b);
   unsigned total = num_a + num_b;

   LLVMValueRef *elems = alloca(total * sizeof(LLVMValueRef));
   for (unsigned i = 0; i < total; i++) {
      elems[i] = i < num_a ? ac_llvm_extract_elem(ctx, a, i)
                           : ac_llvm_extract_elem(ctx, b, i - num_a);
   }

   return ac_build_gather_values(ctx, elems, total);
}
621
622 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
623 * channels with undef. Extract at most src_channels components from the input.
624 */
ac_build_expand(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned src_channels,unsigned dst_channels)625 LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
626 unsigned src_channels, unsigned dst_channels)
627 {
628 LLVMTypeRef elemtype;
629 LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));
630
631 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
632 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
633
634 if (src_channels == dst_channels && vec_size == dst_channels)
635 return value;
636
637 src_channels = MIN2(src_channels, vec_size);
638
639 for (unsigned i = 0; i < src_channels; i++)
640 chan[i] = ac_llvm_extract_elem(ctx, value, i);
641
642 elemtype = LLVMGetElementType(LLVMTypeOf(value));
643 } else {
644 if (src_channels) {
645 assert(src_channels == 1);
646 chan[0] = value;
647 }
648 elemtype = LLVMTypeOf(value);
649 }
650
651 for (unsigned i = src_channels; i < dst_channels; i++)
652 chan[i] = LLVMGetUndef(elemtype);
653
654 return ac_build_gather_values(ctx, chan, dst_channels);
655 }
656
657 /* Extract components [start, start + channels) from a vector.
658 */
ac_extract_components(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned start,unsigned channels)659 LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
660 unsigned channels)
661 {
662 LLVMValueRef *const chan = alloca(channels * sizeof(LLVMValueRef));
663
664 for (unsigned i = 0; i < channels; i++)
665 chan[i] = ac_llvm_extract_elem(ctx, value, i + start);
666
667 return ac_build_gather_values(ctx, chan, channels);
668 }
669
/* Expand a scalar or vector to <4 x type> by filling the remaining channels
 * with undef. Extract at most num_channels components from the input.
 * Convenience wrapper for the common dst_channels == 4 case.
 */
LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
                                     unsigned num_channels)
{
   return ac_build_expand(ctx, value, num_channels, 4);
}
678
/* Round \p value to the nearest integer using llvm.rint at the matching
 * precision.
 */
LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *intr_name;

   switch (ac_get_type_size(LLVMTypeOf(value))) {
   case 2:
      intr_name = "llvm.rint.f16";
      break;
   case 4:
      intr_name = "llvm.rint.f32";
      break;
   default:
      intr_name = "llvm.rint.f64";
      break;
   }

   return ac_build_intrinsic(ctx, intr_name, LLVMTypeOf(value), &value, 1, AC_FUNC_ATTR_READNONE);
}
693
/* Build num / den, normally as num * rcp(den); doubles in the default
 * OpenGL float mode use a real fdiv.
 */
LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
{
   unsigned type_size = ac_get_type_size(LLVMTypeOf(den));

   /* For doubles, we need precise division to pass GLCTS. */
   if (type_size == 8 && ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL)
      return LLVMBuildFDiv(ctx->builder, num, den, "");

   const char *rcp_name = type_size == 2   ? "llvm.amdgcn.rcp.f16"
                          : type_size == 4 ? "llvm.amdgcn.rcp.f32"
                                           : "llvm.amdgcn.rcp.f64";
   LLVMValueRef rcp =
      ac_build_intrinsic(ctx, rcp_name, LLVMTypeOf(den), &den, 1, AC_FUNC_ATTR_READNONE);

   /* num / den == num * (1 / den) */
   return LLVMBuildFMul(ctx->builder, num, rcp, "");
}
715
716 /* See fast_idiv_by_const.h. */
717 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
ac_build_fast_udiv(struct ac_llvm_context * ctx,LLVMValueRef num,LLVMValueRef multiplier,LLVMValueRef pre_shift,LLVMValueRef post_shift,LLVMValueRef increment)718 LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
719 LLVMValueRef multiplier, LLVMValueRef pre_shift,
720 LLVMValueRef post_shift, LLVMValueRef increment)
721 {
722 LLVMBuilderRef builder = ctx->builder;
723
724 num = LLVMBuildLShr(builder, num, pre_shift, "");
725 num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
726 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
727 num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
728 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
729 num = LLVMBuildTrunc(builder, num, ctx->i32, "");
730 return LLVMBuildLShr(builder, num, post_shift, "");
731 }
732
733 /* See fast_idiv_by_const.h. */
734 /* If num != UINT_MAX, this more efficient version can be used. */
735 /* Set: increment = util_fast_udiv_info::increment; */
ac_build_fast_udiv_nuw(struct ac_llvm_context * ctx,LLVMValueRef num,LLVMValueRef multiplier,LLVMValueRef pre_shift,LLVMValueRef post_shift,LLVMValueRef increment)736 LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
737 LLVMValueRef multiplier, LLVMValueRef pre_shift,
738 LLVMValueRef post_shift, LLVMValueRef increment)
739 {
740 LLVMBuilderRef builder = ctx->builder;
741
742 num = LLVMBuildLShr(builder, num, pre_shift, "");
743 num = LLVMBuildNUWAdd(builder, num, increment, "");
744 num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
745 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
746 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
747 num = LLVMBuildTrunc(builder, num, ctx->i32, "");
748 return LLVMBuildLShr(builder, num, post_shift, "");
749 }
750
751 /* See fast_idiv_by_const.h. */
752 /* Both operands must fit in 31 bits and the divisor must not be 1. */
ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context * ctx,LLVMValueRef num,LLVMValueRef multiplier,LLVMValueRef post_shift)753 LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
754 LLVMValueRef multiplier, LLVMValueRef post_shift)
755 {
756 LLVMBuilderRef builder = ctx->builder;
757
758 num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
759 LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
760 num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
761 num = LLVMBuildTrunc(builder, num, ctx->i32, "");
762 return LLVMBuildLShr(builder, num, post_shift, "");
763 }
764
/* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
 * already multiplied by two. id is the cube face number.
 */
struct cube_selection_coords {
   LLVMValueRef stc[2]; /* stc[0] = sc, stc[1] = tc */
   LLVMValueRef ma;     /* major axis, pre-multiplied by 2 */
   LLVMValueRef id;     /* cube face index */
};
774
/* Compute cube selection coords for the 3-component direction \p in using
 * the hardware cube helper intrinsics.
 */
static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3],
                                 struct cube_selection_coords *out)
{
   /* All four intrinsics take the same input vector. */
   out->stc[1] =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc", ctx->f32, in, 3, AC_FUNC_ATTR_READNONE);
   out->stc[0] =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc", ctx->f32, in, 3, AC_FUNC_ATTR_READNONE);
   out->ma =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cubema", ctx->f32, in, 3, AC_FUNC_ATTR_READNONE);
   out->id =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid", ctx->f32, in, 3, AC_FUNC_ATTR_READNONE);
}
785
/**
 * Build a manual selection sequence for cube face sc/tc coordinates and
 * major axis vector (multiplied by 2 for consistency) for the given
 * vec3 \p coords, for the face implied by \p selcoords.
 *
 * For the major axis, we always adjust the sign to be in the direction of
 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
 * the selcoords major axis.
 */
static void build_cube_select(struct ac_llvm_context *ctx,
                              const struct cube_selection_coords *selcoords,
                              const LLVMValueRef *coords, LLVMValueRef *out_st,
                              LLVMValueRef *out_ma)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
   LLVMValueRef is_ma_positive;
   LLVMValueRef sgn_ma;
   LLVMValueRef is_ma_z, is_not_ma_z;
   LLVMValueRef is_ma_y;
   LLVMValueRef is_ma_x;
   LLVMValueRef sgn;
   LLVMValueRef tmp;

   /* sgn_ma = +1 if ma >= 0, else -1 (UGE so NaN counts as positive). */
   is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), "");
   sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0),
                            LLVMConstReal(f32, -1.0), "");

   /* Classify the major axis from the face id: id >= 4 -> Z,
    * 2 <= id < 4 -> Y, otherwise X (presumably the standard +X..-Z face
    * numbering — confirm against the cubeid intrinsic). */
   is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
   is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
   is_ma_y = LLVMBuildAnd(
      builder, is_not_ma_z,
      LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
   is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");

   /* Select sc */
   tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
   sgn = LLVMBuildSelect(
      builder, is_ma_y, LLVMConstReal(f32, 1.0),
      LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
   out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select tc */
   tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
   sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), "");
   out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select ma: pick the major-axis component, take |.| and multiply by 2
    * to match the convention documented above. */
   tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
                         LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
   tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
   *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
}
839
/**
 * Transform cube map texture coordinates (and optionally derivatives) into
 * the 2D face coordinates expected by the image instructions.
 *
 * On input, coords_arg holds the cube coordinate vec3 (plus the array index
 * in coords_arg[3] for cube arrays); on output it holds (s, t, face_z).
 * derivs_arg, when used, holds two vec3 cube derivatives on input and four
 * 2D face derivatives on output.
 */
void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,
                            LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)
{

   LLVMBuilderRef builder = ctx->builder;
   struct cube_selection_coords selcoords;
   LLVMValueRef coords[3];
   LLVMValueRef invma;

   if (is_array && !is_lod) {
      /* Round the layer index to the nearest integer. */
      LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);

      /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
       *
       *    "For Array forms, the array layer used will be
       *
       *       max(0, min(d−1, floor(layer+0.5)))
       *
       *     where d is the depth of the texture array and layer
       *     comes from the component indicated in the tables below."
       *
       * This is also a workaround for an issue where the layer is taken
       * from a helper invocation which happens to fall on a different
       * layer due to extrapolation.
       *
       * GFX8 and earlier attempt to implement this in hardware by
       * clamping the value of coords[2] = (8 * layer) + face.
       * Unfortunately, this means that we end up with the wrong
       * face when clamping occurs.
       *
       * Clamp the layer earlier to work around the issue.
       */
      if (ctx->gfx_level <= GFX8) {
         LLVMValueRef ge0;
         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
      }

      coords_arg[3] = tmp;
   }

   build_cube_intrinsic(ctx, coords_arg, &selcoords);

   /* 1 / |2*ma|: used to project sc/tc (and derivatives) onto the face. */
   invma =
      ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
   invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);

   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");

   coords[2] = selcoords.id;

   if (is_deriv && derivs_arg) {
      LLVMValueRef derivs[4];
      int axis;

      /* Convert cube derivatives to 2D derivatives. */
      for (axis = 0; axis < 2; axis++) {
         LLVMValueRef deriv_st[2];
         LLVMValueRef deriv_ma;

         /* Transform the derivative alongside the texture
          * coordinate. Mathematically, the correct formula is
          * as follows. Assume we're projecting onto the +Z face
          * and denote by dx/dh the derivative of the (original)
          * X texture coordinate with respect to horizontal
          * window coordinates. The projection onto the +Z face
          * plane is:
          *
          *   f(x,z) = x/z
          *
          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
          *
          * This motivates the implementation below.
          *
          * Whether this actually gives the expected results for
          * apps that might feed in derivatives obtained via
          * finite differences is anyone's guess. The OpenGL spec
          * seems awfully quiet about how textureGrad for cube
          * maps should be handled.
          */
         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);

         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");

         for (int i = 0; i < 2; ++i)
            derivs[axis * 2 + i] =
               LLVMBuildFSub(builder, LLVMBuildFMul(builder, deriv_st[i], invma, ""),
                             LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
      }

      memcpy(derivs_arg, derivs, sizeof(derivs));
   }

   /* Shift the texture coordinate. This must be applied after the
    * derivative calculation.
    */
   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");

   if (is_array) {
      /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
      /* coords_arg.w component - array_index for cube arrays */
      coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
   }

   memcpy(coords_arg, coords, sizeof(coords));
}
948
/**
 * Interpolate a 32-bit FS input attribute component at the barycentric
 * coordinates (i, j). llvm_chan selects the component, attr_number the
 * attribute, and params carries the PS input parameter state.
 */
LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   if (ctx->gfx_level >= GFX11) {
      /* GFX11: fetch the attribute data from LDS, then interpolate with the
       * two-step in-register p10/p2 intrinsics. */
      LLVMValueRef load_args[3] = {llvm_chan, attr_number, params};
      LLVMValueRef p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                                          ctx->f32, load_args, 3, AC_FUNC_ATTR_READNONE);

      LLVMValueRef p10_args[3] = {p, i, p};
      LLVMValueRef p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10",
                                            ctx->f32, p10_args, 3, AC_FUNC_ATTR_READNONE);

      LLVMValueRef p2_args[3] = {p, j, p10};
      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2",
                                ctx->f32, p2_args, 3, AC_FUNC_ATTR_READNONE);
   }

   /* Pre-GFX11: classic two-step interp.p1/interp.p2 sequence. */
   LLVMValueRef p1_args[4] = {i, llvm_chan, attr_number, params};
   LLVMValueRef p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
                                        ctx->f32, p1_args, 4, AC_FUNC_ATTR_READNONE);

   LLVMValueRef p2_args[5] = {p1, j, llvm_chan, attr_number, params};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
                             ctx->f32, p2_args, 5, AC_FUNC_ATTR_READNONE);
}
1001
/**
 * Interpolate a 16-bit FS input attribute component at the barycentric
 * coordinates (i, j). high_16bits selects the upper half of the packed
 * 32-bit attribute dword.
 */
LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j, bool high_16bits)
{
   LLVMValueRef high = high_16bits ? ctx->i1true : ctx->i1false;

   if (ctx->gfx_level >= GFX11) {
      /* GFX11: fetch the attribute data from LDS, then interpolate with the
       * two-step in-register f16 p10/p2 intrinsics. */
      LLVMValueRef load_args[3] = {llvm_chan, attr_number, params};
      LLVMValueRef p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                                          ctx->f32, load_args, 3, AC_FUNC_ATTR_READNONE);

      LLVMValueRef p10_args[4] = {p, i, p, high};
      LLVMValueRef p10 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p10.f16",
                                            ctx->f32, p10_args, 4, AC_FUNC_ATTR_READNONE);

      LLVMValueRef p2_args[4] = {p, j, p10, high};
      return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.inreg.p2.f16",
                                ctx->f16, p2_args, 4, AC_FUNC_ATTR_READNONE);
   }

   /* Pre-GFX11: two-step f16 interpolation; p1 produces an f32 partial
    * result, p2 the final f16 value. */
   LLVMValueRef p1_args[5] = {i, llvm_chan, attr_number, high, params};
   LLVMValueRef p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, p1_args, 5,
                                        AC_FUNC_ATTR_READNONE);

   LLVMValueRef p2_args[6] = {p1, j, llvm_chan, attr_number, high, params};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, p2_args, 6,
                             AC_FUNC_ATTR_READNONE);
}
1058
/**
 * Read an FS input attribute component without interpolation (flat /
 * per-primitive style read). parameter selects which value to move on
 * pre-GFX11 hardware.
 */
LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   if (ctx->gfx_level >= GFX11) {
      /* GFX11: load the attribute from LDS, broadcast lane 0 across each
       * quad, and wrap in WQM. */
      LLVMValueRef load_args[3] = {llvm_chan, attr_number, params};
      LLVMValueRef p = ac_build_intrinsic(ctx, "llvm.amdgcn.lds.param.load",
                                          ctx->f32, load_args, 3, AC_FUNC_ATTR_READNONE);
      p = ac_build_quad_swizzle(ctx, p, 0, 0, 0, 0);
      return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32, &p, 1, AC_FUNC_ATTR_READNONE);
   }

   LLVMValueRef mov_args[4] = {parameter, llvm_chan, attr_number, params};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, mov_args, 4,
                             AC_FUNC_ATTR_READNONE);
}
1086
/* Return a pointer to &base_ptr[index] (single-index GEP). */
LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                              LLVMValueRef index)
{
   LLVMValueRef indices[1] = {index};
   return LLVMBuildGEP(ctx->builder, base_ptr, indices, 1, "");
}
1092
/* Return a pointer to &base_ptr[0][index] — the usual pattern for
 * addressing an element of a pointer-to-array value. */
LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   LLVMValueRef indices[2] = {ctx->i32_0, index};
   return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
}
1101
/* Advance ptr by index elements, preserving the original pointer type. */
LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index)
{
   LLVMValueRef indices[1] = {index};
   LLVMValueRef advanced = LLVMBuildGEP(ctx->builder, ptr, indices, 1, "");
   return LLVMBuildPointerCast(ctx->builder, advanced, LLVMTypeOf(ptr), "");
}
1107
/* Store value to base_ptr[index] (see ac_build_gep0 for the addressing). */
void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMValueRef slot = ac_build_gep0(ctx, base_ptr, index);
   LLVMBuildStore(ctx->builder, value, slot);
}
1113
1114 /**
1115 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
1116 * It's equivalent to doing a load from &base_ptr[index].
1117 *
1118 * \param base_ptr Where the array starts.
1119 * \param index The element index into the array.
1120 * \param uniform Whether the base_ptr and index can be assumed to be
1121 * dynamically uniform (i.e. load to an SGPR)
1122 * \param invariant Whether the load is invariant (no other opcodes affect it)
1123 * \param no_unsigned_wraparound
1124 * For all possible re-associations and re-distributions of an expression
1125 * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
1126 * without inbounds in base_ptr), this parameter is true if "addr + offset"
1127 * does not result in an unsigned integer wraparound. This is used for
1128 * optimal code generation of 32-bit pointer arithmetic.
1129 *
1130 * For example, a 32-bit immediate offset that causes a 32-bit unsigned
1131 * integer wraparound can't be an imm offset in s_load_dword, because
1132 * the instruction performs "addr + offset" in 64 bits.
1133 *
1134 * Expected usage for bindless textures by chaining GEPs:
1135 * // possible unsigned wraparound, don't use InBounds:
1136 * ptr1 = LLVMBuildGEP(base_ptr, index);
1137 * image = load(ptr1); // becomes "s_load ptr1, 0"
1138 *
1139 * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1140 * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1141 */
static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                         LLVMValueRef index, bool uniform, bool invariant,
                                         bool no_unsigned_wraparound)
{
   /* InBounds is only used when the caller guarantees no unsigned wraparound
    * and the pointer lives in the 32-bit const address space; see the
    * function comment above for why this matters. */
   bool inbounds = no_unsigned_wraparound &&
                   LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT;

   LLVMValueRef pointer = inbounds
                             ? LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "")
                             : LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");

   if (uniform)
      LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);

   LLVMValueRef result = LLVMBuildLoad(ctx->builder, pointer, "");
   if (invariant)
      LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
   LLVMSetAlignment(result, 4);
   return result;
}
1162
/* Plain load from &base_ptr[index]: not uniform, not invariant. */
LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, /*uniform=*/false,
                               /*invariant=*/false, /*no_unsigned_wraparound=*/false);
}
1167
/* Invariant (read-only memory) load from &base_ptr[index]. */
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                     LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, /*uniform=*/false,
                               /*invariant=*/true, /*no_unsigned_wraparound=*/false);
}
1173
1174 /* This assumes that there is no unsigned integer wraparound during the address
1175 * computation, excluding all GEPs within base_ptr. */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                   LLVMValueRef index)
{
   /* Uniform + invariant + no wraparound: eligible for a scalar load. */
   return ac_build_load_custom(ctx, base_ptr, index, /*uniform=*/true,
                               /*invariant=*/true, /*no_unsigned_wraparound=*/true);
}
1181
1182 /* See ac_build_load_custom() documentation. */
LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
                                                   LLVMValueRef base_ptr, LLVMValueRef index)
{
   /* Like ac_build_load_to_sgpr, but the address computation may wrap, so
    * InBounds cannot be used. */
   return ac_build_load_custom(ctx, base_ptr, index, /*uniform=*/true,
                               /*invariant=*/true, /*no_unsigned_wraparound=*/false);
}
1188
get_load_cache_policy(struct ac_llvm_context * ctx,unsigned cache_policy)1189 static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
1190 {
1191 return cache_policy |
1192 (ctx->gfx_level >= GFX10 && ctx->gfx_level < GFX11 && cache_policy & ac_glc ? ac_dlc : 0);
1193 }
1194
get_store_cache_policy(struct ac_llvm_context * ctx,unsigned cache_policy)1195 static unsigned get_store_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
1196 {
1197 if (ctx->gfx_level >= GFX11)
1198 cache_policy &= ~ac_glc; /* GLC has no effect on stores */
1199 return cache_policy;
1200 }
1201
/**
 * Emit an amdgcn raw/struct buffer store intrinsic.
 *
 * \param rsrc        buffer resource descriptor (bitcast to v4i32)
 * \param data        value to store; its type selects the intrinsic suffix
 * \param vindex      element index; non-NULL selects "struct" indexing,
 *                    NULL selects "raw" indexing
 * \param voffset     byte offset in VGPR (NULL -> 0)
 * \param soffset     byte offset in SGPR (NULL -> 0)
 * \param use_format  use the *.buffer.store.format.* variant
 */
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         unsigned cache_policy, bool use_format)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = data;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (vindex)
      args[idx++] = vindex; /* known non-NULL here; was a redundant ternary */
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_store_cache_policy(ctx, cache_policy), 0);
   const char *indexing_kind = vindex ? "struct" : "raw";
   char name[256], type_name[8];

   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, idx, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
1230
/* Format buffer store with no SGPR offset (soffset = NULL). */
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
{
   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, /*soffset=*/NULL,
                                cache_policy, /*use_format=*/true);
}
1236
1237 /* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                 unsigned cache_policy)
{
   unsigned num_channels = ac_get_llvm_num_components(vdata);

   /* Split 3 channel stores into a 2-channel and a 1-channel store if the
    * hardware/LLVM combination has no vec3 support. */
   if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false)) {
      LLVMValueRef v[3], v01, voffset2;

      for (int i = 0; i < 3; i++) {
         v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
      }
      v01 = ac_build_gather_values(ctx, v, 2);

      /* The third channel lands 8 bytes (2 dwords) after the first two. */
      voffset2 = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0,
                              LLVMConstInt(ctx->i32, 8, 0), "");

      /* Recurse: both sub-stores have 1 or 2 channels and take the
       * common path below. */
      ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, cache_policy);
      ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, cache_policy);
      return;
   }

   ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), vindex, voffset, soffset,
                                cache_policy, false);
}
1264
/**
 * Emit an amdgcn raw/struct buffer load intrinsic and return its result.
 *
 * 3-channel loads are widened to 4 channels when vec3 is unsupported;
 * the intrinsic name suffix is derived from the resulting channel type
 * and count.
 */
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, unsigned cache_policy,
                                                bool can_speculate, bool use_format,
                                                bool structurized)
{
   LLVMValueRef args[5];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   /* Widen 3-channel loads to 4 if vec3 isn't supported. */
   unsigned func =
      !ac_has_vec3_support(ctx->gfx_level, use_format) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->gfx_level >= GFX8);

   LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
   }

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}
1301
/**
 * Load num_channels dwords from a buffer, preferring scalar (SMEM) loads
 * when allowed and legal for the given cache policy.
 */
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  LLVMTypeRef channel_type, unsigned cache_policy,
                                  bool can_speculate, bool allow_smem)
{
   /* SMEM path: no SLC support, and GLC only works on GFX8+. */
   if (allow_smem && !(cache_policy & ac_slc) &&
       (!(cache_policy & ac_glc) || ctx->gfx_level >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[8];

      /* Fold the SGPR offset into a single byte offset. */
      LLVMValueRef offset = voffset ? voffset : ctx->i32_0;
      if (soffset)
         offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

      /* One s.buffer.load.f32 per channel, 4 bytes apart. */
      for (int i = 0; i < num_channels; i++) {
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, 4, 0), "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
         };
         result[i] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3,
                                        AC_FUNC_ATTR_READNONE);
      }
      if (num_channels == 1)
         return result[0];

      /* Pad vec3 results with undef where vec3 isn't supported. */
      if (num_channels == 3 && !ac_has_vec3_support(ctx->gfx_level, false))
         result[num_channels++] = LLVMGetUndef(ctx->f32);
      return ac_build_gather_values(ctx, result, num_channels);
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, soffset, num_channels,
                                      channel_type, cache_policy, can_speculate, false, false);
}
1340
/**
 * Format buffer load. When tfe (texture fail enable) is requested, emit an
 * inline-asm load that returns num_channels data elements plus a trailing
 * TFE status dword; otherwise use the normal buffer.load.format path.
 */
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, unsigned cache_policy,
                                         bool can_speculate, bool d16, bool tfe)
{
   if (tfe) {
      /* No d16 variant of the TFE inline-asm path. */
      assert(!d16);

      cache_policy = get_load_cache_policy(ctx, cache_policy);

      char code[256];
      /* The definition in the assembly and the one in the constraint string
       * differs because of an assembler bug.
       */
      /* v[0:4] are zero-initialized because with TFE the hardware only
       * overwrites them on a successful fetch. */
      snprintf(code, sizeof(code),
               "v_mov_b32 v0, 0\n"
               "v_mov_b32 v1, 0\n"
               "v_mov_b32 v2, 0\n"
               "v_mov_b32 v3, 0\n"
               "v_mov_b32 v4, 0\n"
               "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
               "s_waitcnt vmcnt(0)",
               cache_policy & ac_glc ? "glc" : "",
               cache_policy & ac_slc ? "slc" : "",
               cache_policy & ac_dlc ? "dlc" : "");

      LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
      LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(calltype, code, "=&{v[0:4]},v,s", false, false);

      LLVMValueRef addr_comp[2] = {vindex ? vindex : ctx->i32_0,
                                   voffset ? voffset : ctx->i32_0};

      LLVMValueRef args[] = {ac_build_gather_values(ctx, addr_comp, 2),
                             LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "")};
      LLVMValueRef res = LLVMBuildCall2(ctx->builder, calltype, inlineasm, args, 2, "");

      /* Return <num_channels data, tfe status> as one vector. */
      return ac_build_concat(ctx, ac_trim_vector(ctx, res, num_channels),
                             ac_llvm_extract_elem(ctx, res, 4));
   }

   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,
                                      d16 ? ctx->f16 : ctx->f32, cache_policy, can_speculate, true,
                                      true);
}
1386
/**
 * Emit an amdgcn raw/struct tbuffer load with the given data/numeric format
 * (dfmt/nfmt, encoded per gfx level). 3-channel loads are widened to 4 when
 * vec3 is unsupported; the result channels are i32.
 */
static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, unsigned num_channels,
                                          unsigned dfmt, unsigned nfmt, unsigned cache_policy,
                                          bool can_speculate, bool structurized)
{
   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->gfx_level, dfmt, nfmt), 0);
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   /* Widen 3-channel loads to 4 if vec3 isn't supported. */
   unsigned func =
      !ac_has_vec3_support(ctx->gfx_level, true) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}
1414
/* Structurized (vindex-based) tbuffer load. */
LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, unsigned num_channels,
                                          unsigned dfmt, unsigned nfmt, unsigned cache_policy,
                                          bool can_speculate)
{
   return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, num_channels, dfmt, nfmt,
                                cache_policy, can_speculate, /*structurized=*/true);
}
1424
/* Raw 16-bit (i16) buffer load. */
LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        unsigned cache_policy)
{
   return ac_build_buffer_load_common(ctx, rsrc, /*vindex=*/NULL, voffset, soffset,
                                      /*num_channels=*/1, ctx->i16, cache_policy,
                                      /*can_speculate=*/false, /*use_format=*/false,
                                      /*structurized=*/false);
}
1432
/* Raw 8-bit (i8) buffer load. */
LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                       LLVMValueRef voffset, LLVMValueRef soffset,
                                       unsigned cache_policy)
{
   return ac_build_buffer_load_common(ctx, rsrc, /*vindex=*/NULL, voffset, soffset,
                                      /*num_channels=*/1, ctx->i8, cache_policy,
                                      /*can_speculate=*/false, /*use_format=*/false,
                                      /*structurized=*/false);
}
1440
1441 /**
1442 * Convert an 11- or 10-bit unsigned floating point number to an f32.
1443 *
1444 * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
1445 * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1446 */
static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src,
                                    unsigned exp_bits, unsigned mant_bits)
{
   assert(LLVMTypeOf(src) == ctx->i32);

   LLVMValueRef tmp;
   LLVMValueRef mantissa;
   mantissa =
      LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");

   /* Converting normal numbers is just a shift + correcting the exponent bias */
   unsigned normal_shift = 23 - mant_bits;
   unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
   LLVMValueRef shifted, normal;

   shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
   normal =
      LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");

   /* Converting nan/inf numbers is the same, but with a different exponent update:
    * force all exponent bits to 1 (f32 NaN/Inf encoding). */
   LLVMValueRef naninf;
   naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");

   /* Converting denormals is the complex case: determine the leading zeros of the
    * mantissa to obtain the correct shift for the mantissa and exponent correction.
    */
   LLVMValueRef denormal;
   LLVMValueRef params[2] = {
      mantissa, ctx->i1true, /* result can be undef when arg is 0 */
   };
   LLVMValueRef ctlz =
      ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);

   /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
   tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
   denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");

   /* Exponent correction depending on how far the mantissa was shifted. */
   unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
   tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
   tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
   denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");

   /* Select the final result: naninf if all source exponent bits are set,
    * denormal if the source exponent is 0 (and mantissa != 0), zero for a
    * zero source, and the normal path otherwise. */
   LLVMValueRef result;

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, ((1ULL << exp_bits) - 1) << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
                       LLVMConstInt(ctx->i32, 1ULL << mant_bits, false), "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");

   tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
   result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");

   return ac_to_float(ctx, result);
}
1505
/**
 * Generate a fully general open coded buffer format fetch with all required
 * fixups suitable for vertex fetch, using non-format buffer loads.
 *
 * Some combinations of argument values have special interpretations:
 * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
 * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
 *
 * \param log_size log2(size of channel in bytes)
 * \param num_channels number of channels (1 to 4)
 * \param format AC_FETCH_FORMAT_xxx value
 * \param reverse whether XYZ channels are reversed
 * \param known_aligned whether the source is known to be aligned to hardware's
 *                      effective element size for loading the given format
 *                      (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
 * \param rsrc buffer resource descriptor
 * \return the resulting vector of floats or integers bitcast to <4 x i32>
 */
LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size,
                                            unsigned num_channels, unsigned format, bool reverse,
                                            bool known_aligned, LLVMValueRef rsrc,
                                            LLVMValueRef vindex, LLVMValueRef voffset,
                                            LLVMValueRef soffset, unsigned cache_policy,
                                            bool can_speculate)
{
   LLVMValueRef tmp;
   /* Shape of the raw loads we emit; may differ from the logical channel
    * layout and is fixed up by the recombination steps below. */
   unsigned load_log_size = log_size;
   unsigned load_num_channels = num_channels;
   if (log_size == 3) {
      /* 8-byte channels are loaded as dwords. */
      load_log_size = 2;
      if (format == AC_FETCH_FORMAT_FLOAT) {
         load_num_channels = 2 * num_channels;
      } else {
         load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
      }
   }

   /* log_recombine > 0: loaded pieces must be OR-combined into wider values.
    * log_recombine < 0: a wider load must be split into narrower channels. */
   int log_recombine = 0;
   if ((ctx->gfx_level == GFX6 || ctx->gfx_level >= GFX10) && !known_aligned) {
      /* Avoid alignment restrictions by loading one byte at a time. */
      load_num_channels <<= load_log_size;
      log_recombine = load_log_size;
      load_log_size = 0;
   } else if (load_num_channels == 2 || load_num_channels == 4) {
      /* Merge 2 or 4 channels into a single wider (vector) load. */
      log_recombine = -util_logbase2(load_num_channels);
      load_num_channels = 1;
      load_log_size += -log_recombine;
   }

   LLVMValueRef loads[32]; /* up to 32 bytes */
   for (unsigned i = 0; i < load_num_channels; ++i) {
      tmp =
         LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), "");
      LLVMTypeRef channel_type =
         load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32;
      /* Loads wider than a dword become multi-dword vector loads. */
      unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
      loads[i] =
         ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type,
                                     cache_policy, can_speculate, false, true);
      if (load_log_size >= 2)
         loads[i] = ac_to_integer(ctx, loads[i]);
   }

   if (log_recombine > 0) {
      /* Recombine bytes if necessary (GFX6 only) */
      LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;

      for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
         LLVMValueRef accum = NULL;
         /* OR together (1 << log_recombine) byte loads, little-endian. */
         for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
            tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
            if (i == 0) {
               accum = tmp;
            } else {
               tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), "");
               accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
            }
         }
         loads[dst] = accum;
      }
   } else if (log_recombine < 0) {
      /* Split vectors of dwords */
      if (load_log_size > 2) {
         assert(load_num_channels == 1);
         LLVMValueRef loaded = loads[0];
         unsigned log_split = load_log_size - 2;
         log_recombine += log_split;
         load_num_channels = 1 << log_split;
         load_log_size = 2;
         for (unsigned i = 0; i < load_num_channels; ++i) {
            tmp = LLVMConstInt(ctx->i32, i, false);
            loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
         }
      }

      /* Further split dwords and shorts if required */
      if (log_recombine < 0) {
         /* Walk sources backwards so destinations never overwrite an
          * unprocessed source slot in loads[]. */
         for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0;
              --src) {
            unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
            LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
            LLVMValueRef loaded = loads[src - 1];
            LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
            for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
               tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
               tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
               loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
            }
         }
      }
   }

   if (log_size == 3) {
      if (format == AC_FETCH_FORMAT_FLOAT) {
         /* Reassemble each f64 channel from its two dword halves. */
         for (unsigned i = 0; i < num_channels; ++i) {
            tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
            loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
         }
      } else if (format == AC_FETCH_FORMAT_FIXED) {
         /* 10_11_11_FLOAT */
         LLVMValueRef data = loads[0];
         LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
         LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
         LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
         LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");

         /* R and G are e5m6 unsigned floats, B is e5m5. */
         loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
         loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
         loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));

         /* Treat the rest of the pipeline as a 3-channel f32 fetch. */
         num_channels = 3;
         log_size = 2;
         format = AC_FETCH_FORMAT_FLOAT;
      } else {
         /* 2_10_10_10 data formats */
         LLVMValueRef data = loads[0];
         LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
         LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
         loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
         loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
         loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
         tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
         loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");

         num_channels = 4;
      }
   }

   /* Convert each channel to a 32-bit value of the requested format. */
   if (format == AC_FETCH_FORMAT_FLOAT) {
      if (log_size != 2) {
         for (unsigned chan = 0; chan < num_channels; ++chan) {
            tmp = ac_to_float(ctx, loads[chan]);
            if (log_size == 3)
               tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
            else if (log_size == 1)
               tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
            loads[chan] = ac_to_integer(ctx, tmp);
         }
      }
   } else if (format == AC_FETCH_FORMAT_UINT) {
      if (log_size != 2) {
         for (unsigned chan = 0; chan < num_channels; ++chan)
            loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
      }
   } else if (format == AC_FETCH_FORMAT_SINT) {
      if (log_size != 2) {
         for (unsigned chan = 0; chan < num_channels; ++chan)
            loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
      }
   } else {
      /* Remaining formats convert to float, possibly with scaling/clamping. */
      bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED ||
                    format == AC_FETCH_FORMAT_UINT;

      for (unsigned chan = 0; chan < num_channels; ++chan) {
         if (unsign) {
            tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
         } else {
            tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
         }

         LLVMValueRef scale = NULL;
         if (format == AC_FETCH_FORMAT_FIXED) {
            /* 16.16 fixed point. */
            assert(log_size == 2);
            scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
         } else if (format == AC_FETCH_FORMAT_UNORM) {
            unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
            scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
         } else if (format == AC_FETCH_FORMAT_SNORM) {
            unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
            scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
         }
         if (scale)
            tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");

         if (format == AC_FETCH_FORMAT_SNORM) {
            /* Clamp to [-1, 1] */
            LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
            LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
            tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
         }

         loads[chan] = ac_to_integer(ctx, tmp);
      }
   }

   /* Pad missing channels: (0, 0, 0, 1) in the appropriate type. */
   while (num_channels < 4) {
      if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
         loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
      } else {
         loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
      }
      num_channels++;
   }

   if (reverse) {
      tmp = loads[0];
      loads[0] = loads[2];
      loads[2] = tmp;
   }

   return ac_build_gather_values(ctx, loads, 4);
}
1731
/* Store a single 16-bit value to a buffer; the input is bitcast to i16. */
void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                 LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
                                 unsigned cache_policy)
{
   LLVMValueRef data16 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
   ac_build_buffer_store_common(ctx, rsrc, data16, NULL, voffset, soffset, cache_policy, false);
}
1740
/* Store a single 8-bit value to a buffer; the input is bitcast to i8. */
void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)
{
   LLVMValueRef data8 = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
   ac_build_buffer_store_common(ctx, rsrc, data8, NULL, voffset, soffset, cache_policy, false);
}
1748
1749 /**
1750 * Set range metadata on an instruction. This can only be used on load and
1751 * call instructions. If you know an instruction can only produce the values
1752 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1753 * \p lo is the minimum value inclusive.
1754 * \p hi is the maximum value exclusive.
1755 */
ac_set_range_metadata(struct ac_llvm_context * ctx,LLVMValueRef value,unsigned lo,unsigned hi)1756 void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
1757 unsigned hi)
1758 {
1759 LLVMValueRef range_md, md_args[2];
1760 LLVMTypeRef type = LLVMTypeOf(value);
1761 LLVMContextRef context = LLVMGetTypeContext(type);
1762
1763 md_args[0] = LLVMConstInt(type, lo, false);
1764 md_args[1] = LLVMConstInt(type, hi, false);
1765 range_md = LLVMMDNodeInContext(context, md_args, 2);
1766 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1767 }
1768
ac_get_thread_id(struct ac_llvm_context * ctx)1769 LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1770 {
1771 return ac_build_mbcnt(ctx, LLVMConstInt(ctx->iN_wavemask, ~0ull, 0));
1772 }
1773
1774 /*
1775 * AMD GCN implements derivatives using the local data store (LDS)
1776 * All writes to the LDS happen in all executing threads at
1777 * the same time. TID is the Thread ID for the current
1778 * thread and is a value between 0 and 63, representing
1779 * the thread's position in the wavefront.
1780 *
1781 * For the pixel shader threads are grouped into quads of four pixels.
1782 * The TIDs of the pixels of a quad are:
1783 *
1784 * +------+------+
1785 * |4n + 0|4n + 1|
1786 * +------+------+
1787 * |4n + 2|4n + 3|
1788 * +------+------+
1789 *
1790 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1791 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1792 * the current pixel's column, and masking with 0xfffffffe yields the TID
1793 * of the left pixel of the current pixel's row.
1794 *
1795 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1796 * adding 2 yields the TID of the pixel below the top pixel.
1797 */
LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
{
   LLVMTypeRef result_type = ac_to_float_type(ctx, LLVMTypeOf(val));

   /* Quad swizzles are done on 32-bit values; widen 16-bit inputs first. */
   if (result_type == ctx->f16)
      val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
   else if (result_type == ctx->v2f16)
      val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");

   /* mask selects the reference lane within the quad (see comment above);
    * idx offsets to the neighboring lane whose difference we take. */
   unsigned tl_lanes[4], trbl_lanes[4];
   for (unsigned lane = 0; lane < 4; ++lane) {
      tl_lanes[lane] = lane & mask;
      trbl_lanes[lane] = (lane & mask) + idx;
   }

   LLVMValueRef tl =
      ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
   LLVMValueRef trbl =
      ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);

   if (result_type == ctx->f16) {
      tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
      trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
   }

   tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
   trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
   LLVMValueRef result = LLVMBuildFSub(ctx->builder, trbl, tl, "");

   /* Wrap the result in WQM so helper lanes contribute valid derivatives. */
   char type[8];
   char name[32];
   ac_build_type_name_for_intr(result_type, type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);

   return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
}
1836
/* Emit s_sendmsg with the given message type and wave id operand. */
void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id)
{
   LLVMValueRef args[] = {
      LLVMConstInt(ctx->i32, msg, false),
      wave_id,
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
}
1844
/* Find the most significant signed bit of arg as an LSB-based index,
 * returning -1 for inputs with no signed MSB (0 and -1). */
LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   LLVMValueRef msb =
      ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE);

   /* The HW returns the last bit index from MSB, but NIR/TGSI wants
    * the index from LSB. Invert it by doing "31 - msb". */
   msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");

   LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, "");
   LLVMValueRef is_all_ones = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, "");
   LLVMValueRef no_msb = LLVMBuildOr(ctx->builder, is_zero, is_all_ones, "");

   return LLVMBuildSelect(ctx->builder, no_msb, all_ones, msb, "");
}
1861
/* Find the most significant set bit of arg as an LSB-based i32 index,
 * returning -1 when arg is zero. Supports 8/16/32/64-bit inputs. */
LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   const unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef highest_bit;
   LLVMValueRef zero;

   switch (bitsize) {
   case 64:
      intrin_name = "llvm.ctlz.i64";
      type = ctx->i64;
      highest_bit = LLVMConstInt(ctx->i64, 63, false);
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.ctlz.i32";
      type = ctx->i32;
      highest_bit = LLVMConstInt(ctx->i32, 31, false);
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.ctlz.i16";
      type = ctx->i16;
      highest_bit = LLVMConstInt(ctx->i16, 15, false);
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.ctlz.i8";
      type = ctx->i8;
      highest_bit = LLVMConstInt(ctx->i8, 7, false);
      zero = ctx->i8_0;
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   /* Second ctlz operand = true: result is undef for zero input (the zero
    * case is handled explicitly by the select below). */
   LLVMValueRef params[2] = {arg, ctx->i1true};
   LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);

   /* The HW returns the last bit index from MSB, but TGSI/NIR wants
    * the index from LSB. Invert it by doing "31 - msb". */
   msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");

   /* Normalize the result to i32. */
   if (bitsize == 64)
      msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
   else if (bitsize < 32)
      msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");

   /* check for zero */
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, "");
   return LLVMBuildSelect(ctx->builder, is_zero, LLVMConstInt(ctx->i32, -1, true), msb, "");
}
1922
/* Floating-point minimum via the llvm.minnum intrinsic for a's type. */
LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef value_type = LLVMTypeOf(a);
   char type[64];
   char name[64];

   ac_build_type_name_for_intr(value_type, type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.minnum.%s", type);

   LLVMValueRef args[] = {a, b};
   return ac_build_intrinsic(ctx, name, value_type, args, 2, AC_FUNC_ATTR_READNONE);
}
1932
/* Floating-point maximum via the llvm.maxnum intrinsic for a's type. */
LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMTypeRef value_type = LLVMTypeOf(a);
   char type[64];
   char name[64];

   ac_build_type_name_for_intr(value_type, type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.maxnum.%s", type);

   LLVMValueRef args[] = {a, b};
   return ac_build_intrinsic(ctx, name, value_type, args, 2, AC_FUNC_ATTR_READNONE);
}
1942
/* Signed integer minimum: a <= b ? a : b. */
LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_first = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_first, a, b, "");
}
1948
/* Signed integer maximum: a > b ? a : b. */
LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_first = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_first, a, b, "");
}
1954
/* Unsigned integer minimum: a <= b ? a : b. */
LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_first = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_first, a, b, "");
}
1960
/* Unsigned integer maximum: a >= b ? a : b. */
LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_first = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_first, a, b, "");
}
1966
/* Clamp a floating-point value to [0, 1]. */
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef t = LLVMTypeOf(value);
   LLVMValueRef lower = ac_build_fmax(ctx, value, LLVMConstReal(t, 0.0));
   return ac_build_fmin(ctx, lower, LLVMConstReal(t, 1.0));
}
1973
ac_build_export(struct ac_llvm_context * ctx,struct ac_export_args * a)1974 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
1975 {
1976 LLVMValueRef args[9];
1977
1978 args[0] = LLVMConstInt(ctx->i32, a->target, 0);
1979 args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
1980
1981 if (a->compr) {
1982 assert(ctx->gfx_level < GFX11);
1983
1984 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
1985 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
1986 args[4] = LLVMConstInt(ctx->i1, a->done, 0);
1987 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1988
1989 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
1990 } else {
1991 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->f32, "");
1992 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->f32, "");
1993 args[4] = LLVMBuildBitCast(ctx->builder, a->out[2], ctx->f32, "");
1994 args[5] = LLVMBuildBitCast(ctx->builder, a->out[3], ctx->f32, "");
1995 args[6] = LLVMConstInt(ctx->i1, a->done, 0);
1996 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1997
1998 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
1999 }
2000 }
2001
/* Emit a "null" export that only provides the EXEC mask / DONE bit. */
void ac_build_export_null(struct ac_llvm_context *ctx, bool uses_discard)
{
   /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask
    * for discard.
    */
   if (ctx->gfx_level >= GFX10 && !uses_discard)
      return;

   struct ac_export_args args = {
      .enabled_channels = 0x0, /* enabled channels */
      .valid_mask = 1,         /* whether the EXEC mask is valid */
      .done = 1,               /* DONE bit */
      /* Gfx11 doesn't support null exports, and mrt0 should be exported instead. */
      .target = ctx->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL,
      .compr = 0,              /* COMPR flag (0 = 32-bit export) */
      .out = {
         LLVMGetUndef(ctx->f32), /* R */
         LLVMGetUndef(ctx->f32), /* G */
         LLVMGetUndef(ctx->f32), /* B */
         LLVMGetUndef(ctx->f32), /* A */
      },
   };

   ac_build_export(ctx, &args);
}
2025
ac_num_coords(enum ac_image_dim dim)2026 static unsigned ac_num_coords(enum ac_image_dim dim)
2027 {
2028 switch (dim) {
2029 case ac_image_1d:
2030 return 1;
2031 case ac_image_2d:
2032 case ac_image_1darray:
2033 return 2;
2034 case ac_image_3d:
2035 case ac_image_cube:
2036 case ac_image_2darray:
2037 case ac_image_2dmsaa:
2038 return 3;
2039 case ac_image_2darraymsaa:
2040 return 4;
2041 default:
2042 unreachable("ac_num_coords: bad dim");
2043 }
2044 }
2045
ac_num_derivs(enum ac_image_dim dim)2046 static unsigned ac_num_derivs(enum ac_image_dim dim)
2047 {
2048 switch (dim) {
2049 case ac_image_1d:
2050 case ac_image_1darray:
2051 return 2;
2052 case ac_image_2d:
2053 case ac_image_2darray:
2054 case ac_image_cube:
2055 return 4;
2056 case ac_image_3d:
2057 return 6;
2058 case ac_image_2dmsaa:
2059 case ac_image_2darraymsaa:
2060 default:
2061 unreachable("derivatives not supported");
2062 }
2063 }
2064
get_atomic_name(enum ac_atomic_op op)2065 static const char *get_atomic_name(enum ac_atomic_op op)
2066 {
2067 switch (op) {
2068 case ac_atomic_swap:
2069 return "swap";
2070 case ac_atomic_add:
2071 return "add";
2072 case ac_atomic_sub:
2073 return "sub";
2074 case ac_atomic_smin:
2075 return "smin";
2076 case ac_atomic_umin:
2077 return "umin";
2078 case ac_atomic_smax:
2079 return "smax";
2080 case ac_atomic_umax:
2081 return "umax";
2082 case ac_atomic_and:
2083 return "and";
2084 case ac_atomic_or:
2085 return "or";
2086 case ac_atomic_xor:
2087 return "xor";
2088 case ac_atomic_inc_wrap:
2089 return "inc";
2090 case ac_atomic_dec_wrap:
2091 return "dec";
2092 case ac_atomic_fmin:
2093 return "fmin";
2094 case ac_atomic_fmax:
2095 return "fmax";
2096 }
2097 unreachable("bad atomic op");
2098 }
2099
/* Build an llvm.amdgcn.image.* intrinsic call from the given image args.
 * The intrinsic name and argument list are assembled piece by piece; the
 * argument order below must match the LLVM AMDGPU image intrinsic ABI. */
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
{
   /* Type-overload suffixes appended to the intrinsic name (vaddr first). */
   const char *overload[3] = {"", "", ""};
   unsigned num_overloads = 0;
   LLVMValueRef args[18];
   unsigned num_args = 0;
   enum ac_image_dim dim = a->dim;

   /* Sanity-check mutually exclusive / gfx-level-dependent argument
    * combinations before building anything. */
   assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
   assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
           a->opcode != ac_image_store_mip) ||
          a->lod);
   assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
          (!a->compare && !a->offset));
   assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
           a->opcode == ac_image_get_lod) ||
          !a->bias);
   assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
          1);
   assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
   assert(!a->d16 || (ctx->gfx_level >= GFX8 && a->opcode != ac_image_atomic &&
                      a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
                      a->opcode != ac_image_get_resinfo));
   assert(!a->a16 || ctx->gfx_level >= GFX9);
   assert(a->g16 == a->a16 || ctx->gfx_level >= GFX10);

   /* Bit-width checks for the individual operands. */
   assert(!a->offset ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32);
   assert(!a->bias ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->bias)) == 32);
   assert(!a->compare ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32);
   assert(!a->derivs[0] ||
          ((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) &&
           (a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32)));
   assert(!a->coords[0] ||
          ((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) &&
           (a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32)));
   assert(!a->lod ||
          ((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) &&
           (a->opcode == ac_image_get_resinfo ||
            ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) ==
               ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])))));
   assert(!a->min_lod ||
          ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) ==
             ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])));

   /* getlod is issued on the reduced base dimension. */
   if (a->opcode == ac_image_get_lod) {
      switch (dim) {
      case ac_image_1darray:
         dim = ac_image_1d;
         break;
      case ac_image_2darray:
      case ac_image_cube:
         dim = ac_image_2d;
         break;
      default:
         break;
      }
   }

   bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
                 a->opcode == ac_image_get_lod;
   bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
   bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
               a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
   /* Sampling takes float coords, load/store/atomic take integer coords. */
   LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32);
   uint8_t dmask = a->dmask;
   LLVMTypeRef data_type;
   char data_type_str[32];

   if (atomic) {
      data_type = LLVMTypeOf(a->data[0]);
   } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      /* Image stores might have been shrinked using the format. */
      data_type = LLVMTypeOf(a->data[0]);
      dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
   } else {
      data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
   }

   /* With TFE the intrinsic returns {data, i32 status}. */
   if (a->tfe) {
      data_type = LLVMStructTypeInContext(
         ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
   }

   /* Argument order: data, dmask, offset, bias, compare, derivs, coords,
    * lod, min_lod, rsrc, (sampler, unorm), texfailctrl, cachepolicy. */
   if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      args[num_args++] = a->data[0];
      if (a->opcode == ac_image_atomic_cmpswap)
         args[num_args++] = a->data[1];
   }

   if (!atomic)
      args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);

   if (a->offset)
      args[num_args++] = ac_to_integer(ctx, a->offset);
   if (a->bias) {
      args[num_args++] = ac_to_float(ctx, a->bias);
      overload[num_overloads++] = ".f32";
   }
   if (a->compare)
      args[num_args++] = ac_to_float(ctx, a->compare);
   if (a->derivs[0]) {
      unsigned count = ac_num_derivs(dim);
      for (unsigned i = 0; i < count; ++i)
         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
      overload[num_overloads++] = a->g16 ? ".f16" : ".f32";
   }
   unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
   for (unsigned i = 0; i < num_coords; ++i)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
   if (a->lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
   if (a->min_lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");

   overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32");

   args[num_args++] = a->resource;
   if (sample) {
      args[num_args++] = a->sampler;
      args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
   }

   args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
   args[num_args++] = LLVMConstInt(
      ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);

   /* Base intrinsic name and, for atomics, the sub-operation. */
   const char *name;
   const char *atomic_subop = "";
   switch (a->opcode) {
   case ac_image_sample:
      name = "sample";
      break;
   case ac_image_gather4:
      name = "gather4";
      break;
   case ac_image_load:
      name = "load";
      break;
   case ac_image_load_mip:
      name = "load.mip";
      break;
   case ac_image_store:
      name = "store";
      break;
   case ac_image_store_mip:
      name = "store.mip";
      break;
   case ac_image_atomic:
      name = "atomic.";
      atomic_subop = get_atomic_name(a->atomic);
      break;
   case ac_image_atomic_cmpswap:
      name = "atomic.";
      atomic_subop = "cmpswap";
      break;
   case ac_image_get_lod:
      name = "getlod";
      break;
   case ac_image_get_resinfo:
      name = "getresinfo";
      break;
   default:
      unreachable("invalid image opcode");
   }

   const char *dimname;
   switch (dim) {
   case ac_image_1d:
      dimname = "1d";
      break;
   case ac_image_2d:
      dimname = "2d";
      break;
   case ac_image_3d:
      dimname = "3d";
      break;
   case ac_image_cube:
      dimname = "cube";
      break;
   case ac_image_1darray:
      dimname = "1darray";
      break;
   case ac_image_2darray:
      dimname = "2darray";
      break;
   case ac_image_2dmsaa:
      dimname = "2dmsaa";
      break;
   case ac_image_2darraymsaa:
      dimname = "2darraymsaa";
      break;
   default:
      unreachable("invalid dim");
   }

   ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));

   /* ".l" is only used when an explicit lod accompanies a sample/gather. */
   bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
   char intr_name[96];
   snprintf(intr_name, sizeof(intr_name),
            "llvm.amdgcn.image.%s%s" /* base name */
            "%s%s%s%s" /* sample/gather modifiers */
            ".%s.%s%s%s%s", /* dimension and type overloads */
            name, atomic_subop, a->compare ? ".c" : "",
            a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
            a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
            data_type_str, overload[0], overload[1], overload[2]);

   LLVMTypeRef retty;
   if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
      retty = ctx->voidt;
   else
      retty = data_type;

   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
   if (a->tfe) {
      /* Unpack {texel, status} into one vector with the status appended. */
      LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");
      LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");
      result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));
   }

   if (!sample && !atomic && retty != ctx->voidt)
      result = ac_to_integer(ctx, result);

   return result;
}
2329
LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
{
   /* Read the samples from the descriptor directly.
    * Hardware doesn't have any instruction for this.
    * The 4-bit field at bit 16 of dword 3 holds log2(samples).
    */
   LLVMValueRef dword3 =
      LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
   LLVMValueRef log_samples =
      LLVMBuildLShr(ctx->builder, dword3, LLVMConstInt(ctx->i32, 16, 0), "");
   log_samples = LLVMBuildAnd(ctx->builder, log_samples, LLVMConstInt(ctx->i32, 0xf, 0), "");
   return LLVMBuildShl(ctx->builder, ctx->i32_1, log_samples, "");
}
2343
/* Pack two f32 values into v2f16 with round-toward-zero. */
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", ctx->v2f16, args, 2,
                             AC_FUNC_ATTR_READNONE);
}
2349
/* Pack two f32 values into snorm i16 pairs, returned as a single i32. */
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2,
                                            AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
2356
/* Pack two f32 values into unorm u16 pairs, returned as a single i32. */
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2,
                                            AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
2363
/* Pack two f16 values as signed-normalized 16-bit integers in one i32.
 *
 * Emitted via inline assembly (presumably because LLVM exposes no intrinsic
 * for this instruction — TODO confirm). GFX11 renamed the instruction to
 * v_cvt_pk_norm_i16_f16.
 */
LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
   LLVMValueRef code = LLVMConstInlineAsm(calltype,
                                          ctx->gfx_level >= GFX11 ?
                                             "v_cvt_pk_norm_i16_f16 $0, $1, $2" :
                                             "v_cvt_pknorm_i16_f16 $0, $1, $2",
                                          "=v,v,v", false, false);
   return LLVMBuildCall2(ctx->builder, calltype, code, args, 2, "");
}
2376
/* Pack two f16 values as unsigned-normalized 16-bit integers in one i32.
 *
 * Emitted via inline assembly (presumably because LLVM exposes no intrinsic
 * for this instruction — TODO confirm). GFX11 renamed the instruction to
 * v_cvt_pk_norm_u16_f16.
 */
LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
   LLVMValueRef code = LLVMConstInlineAsm(calltype,
                                          ctx->gfx_level >= GFX11 ?
                                             "v_cvt_pk_norm_u16_f16 $0, $1, $2" :
                                             "v_cvt_pknorm_u16_f16 $0, $1, $2",
                                          "=v,v,v", false, false);
   return LLVMBuildCall2(ctx->builder, calltype, code, args, 2, "");
}
2389
2390 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_i16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)2391 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2392 bool hi)
2393 {
2394 assert(bits == 8 || bits == 10 || bits == 16);
2395
2396 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
2397 LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
2398 LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
2399 LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);
2400
2401 /* Clamp. */
2402 if (bits != 16) {
2403 for (int i = 0; i < 2; i++) {
2404 bool alpha = hi && i == 1;
2405 args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
2406 args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
2407 }
2408 }
2409
2410 LLVMValueRef res =
2411 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2412 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2413 }
2414
2415 /* The 8-bit and 10-bit clamping is for HW workarounds. */
ac_build_cvt_pk_u16(struct ac_llvm_context * ctx,LLVMValueRef args[2],unsigned bits,bool hi)2416 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
2417 bool hi)
2418 {
2419 assert(bits == 8 || bits == 10 || bits == 16);
2420
2421 LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
2422 LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);
2423
2424 /* Clamp. */
2425 if (bits != 16) {
2426 for (int i = 0; i < 2; i++) {
2427 bool alpha = hi && i == 1;
2428 args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
2429 }
2430 }
2431
2432 LLVMValueRef res =
2433 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
2434 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
2435 }
2436
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   /* Emit llvm.amdgcn.wqm.vote on the given i1 condition. */
   LLVMValueRef cond = i1;
   return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &cond, 1,
                             AC_FUNC_ATTR_READNONE);
}
2441
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   /* Emit llvm.amdgcn.kill: lanes where the condition is false are killed. */
   LLVMValueRef cond = i1;
   ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &cond, 1, 0);
}
2446
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
                          LLVMValueRef width, bool is_signed)
{
   /* Bitfield extract: "width" bits of "input" starting at "offset",
    * sign-extended (sbfe) or zero-extended (ubfe). */
   const char *intr_name = is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32";

   return ac_build_intrinsic(ctx, intr_name, ctx->i32,
                             (LLVMValueRef[]){input, offset, width}, 3,
                             AC_FUNC_ATTR_READNONE);
}
2459
LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   /* Integer multiply-add: s0 * s1 + s2. */
   LLVMValueRef product = LLVMBuildMul(ctx->builder, s0, s1, "");
   return LLVMBuildAdd(ctx->builder, product, s2, "");
}
2465
LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
   if (ctx->gfx_level >= GFX10) {
      LLVMValueRef fma_args[] = {s0, s1, s2};
      return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, fma_args, 3,
                                AC_FUNC_ATTR_READNONE);
   }

   /* Older chips: separate multiply and add. */
   LLVMValueRef product = LLVMBuildFMul(ctx->builder, s0, s1, "");
   return LLVMBuildFAdd(ctx->builder, product, s2, "");
}
2477
/* Emit a wait for the counters selected in wait_flags (s_waitcnt or a fence).
 *
 * Each counter field set to 0 means "wait until that counter drains";
 * a field left at its all-ones maximum is not waited upon.
 */
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
{
   if (!wait_flags)
      return;

   /* Start every field at its "don't wait" maximum. */
   unsigned expcnt = 7;
   unsigned lgkmcnt = 63;
   unsigned vmcnt = ctx->gfx_level >= GFX9 ? 63 : 15;
   unsigned vscnt = 63;

   if (wait_flags & AC_WAIT_EXP)
      expcnt = 0;
   if (wait_flags & AC_WAIT_LGKM)
      lgkmcnt = 0;
   if (wait_flags & AC_WAIT_VLOAD)
      vmcnt = 0;

   if (wait_flags & AC_WAIT_VSTORE) {
      /* GFX10+ tracks vector stores in a separate vscnt counter;
       * older chips use vmcnt for both loads and stores. */
      if (ctx->gfx_level >= GFX10)
         vscnt = 0;
      else
         vmcnt = 0;
   }

   /* There is no intrinsic for vscnt(0), so use a fence. */
   if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
       vscnt == 0) {
      assert(!(wait_flags & AC_WAIT_EXP));
      LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
      return;
   }

   /* Pack the fields into the s_waitcnt immediate; the layout changed on
    * GFX11. Pre-GFX11, vmcnt is split across bits [3:0] and [15:14]. */
   unsigned simm16;

   if (ctx->gfx_level >= GFX11)
      simm16 = expcnt | (lgkmcnt << 4) | (vmcnt << 10);
   else
      simm16 = (lgkmcnt << 8) | (expcnt << 4) | (vmcnt & 0xf) | ((vmcnt >> 4) << 14);

   LLVMValueRef args[1] = {
      LLVMConstInt(ctx->i32, simm16, false),
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
}
2522
/* Clamp src to [0, 1] (floating-point saturate). */
LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
                           LLVMTypeRef type)
{
   unsigned bitsize = ac_get_elem_bits(ctx, type);
   LLVMValueRef zero = LLVMConstReal(type, 0.0);
   LLVMValueRef one = LLVMConstReal(type, 1.0);
   LLVMValueRef result;

   if (bitsize == 64 || (bitsize == 16 && ctx->gfx_level <= GFX8) || type == ctx->v2f16) {
      /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
       * doesn't expose an intrinsic.
       */
      result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
   } else {
      /* med3(0, 1, x) is clamp(x, 0, 1): the median of {0, 1, x}. */
      const char *intr_name;
      LLVMTypeRef intr_type;

      if (bitsize == 16) {
         intr_name = "llvm.amdgcn.fmed3.f16";
         intr_type = ctx->f16;
      } else {
         assert(bitsize == 32);
         intr_name = "llvm.amdgcn.fmed3.f32";
         intr_type = ctx->f32;
      }

      result = ac_build_intrinsic(ctx, intr_name, intr_type,
                                  (LLVMValueRef[]){zero, one, src}, 3,
                                  AC_FUNC_ATTR_READNONE);
   }

   if (bitsize == 32 && ctx->gfx_level < GFX9) {
      /* Only pre-GFX9 chips do not flush denorms. */
      result = ac_build_canonicalize(ctx, result, bitsize);
   }

   return result;
}
2566
/* Emit llvm.amdgcn.fract at the requested bit width (16, 32 or 64). */
LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   const char *intr_name;
   LLVMTypeRef ret_type;

   switch (bitsize) {
   case 16:
      intr_name = "llvm.amdgcn.fract.f16";
      ret_type = ctx->f16;
      break;
   case 32:
      intr_name = "llvm.amdgcn.fract.f32";
      ret_type = ctx->f32;
      break;
   default:
      intr_name = "llvm.amdgcn.fract.f64";
      ret_type = ctx->f64;
      break;
   }

   return ac_build_intrinsic(ctx, intr_name, ret_type, &src0, 1, AC_FUNC_ATTR_READNONE);
}
2588
ac_const_uint_vec(struct ac_llvm_context * ctx,LLVMTypeRef type,uint64_t value)2589 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2590 {
2591
2592 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2593 LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2594 unsigned vec_size = LLVMGetVectorSize(type);
2595 LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2596
2597 for (unsigned i = 0; i < vec_size; i++)
2598 scalars[i] = scalar;
2599 return LLVMConstVector(scalars, vec_size);
2600 }
2601 return LLVMConstInt(type, value, 0);
2602 }
2603
/* Integer sign: clamp src0 to [-1, 1], yielding -1, 0 or 1. */
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMTypeRef type = LLVMTypeOf(src0);

   /* v_med3 is selected only when max is first. (LLVM bug?) */
   LLVMValueRef clamped_lo = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1));
   return ac_build_imin(ctx, clamped_lo, ac_const_uint_vec(ctx, type, 1));
}
2613
/* Turn -0.0 into +0.0 while leaving all other values unchanged.
 *
 * Signed zeros are temporarily enabled so the add is not optimized away
 * (presumably under no-signed-zeros, x + 0 would fold to x — confirm).
 */
static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
{
   ac_enable_signed_zeros(ctx);
   /* (val + 0) converts negative zero to positive zero. */
   val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
   ac_disable_signed_zeros(ctx);
   return val;
}
2622
/* Build fsign(src): -1.0, 0.0 or 1.0 depending on the sign of src.
 * Supports 16-, 32- and 64-bit floats.
 */
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef pos, neg, dw[2], val;
   unsigned bitsize = ac_get_elem_bits(ctx, type);

   /* The standard version leads to this:
    * v_cmp_ngt_f32_e64 s[0:1], s4, 0 ; D40B0000 00010004
    * v_cndmask_b32_e64 v4, 1.0, s4, s[0:1] ; D5010004 000008F2
    * v_cmp_le_f32_e32 vcc, 0, v4 ; 7C060880
    * v_cndmask_b32_e32 v4, -1.0, v4, vcc ; 020808F3
    *
    * The isign version:
    * v_add_f32_e64 v4, s4, 0 ; D5030004 00010004
    * v_med3_i32 v4, v4, -1, 1 ; D5580004 02058304
    * v_cvt_f32_i32_e32 v4, v4 ; 7E080B04
    *
    * (src0 + 0) converts negative zero to positive zero.
    * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
    *
    * For FP64, use the standard version, which doesn't suffer from the huge DP rate
    * reduction. (FP64 comparisons are as fast as int64 comparisons)
    */
   if (bitsize == 16 || bitsize == 32) {
      val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
      val = ac_build_isign(ctx, val);
      return LLVMBuildSIToFP(ctx->builder, val, type, "");
   }

   /* FP64 path: build the result from two dwords. The low dword is always 0;
    * the high dword selects between 1.0 (0x3FF00000), -1.0 (0xBFF00000) and
    * 0.0 depending on the comparisons (NaN takes neither branch -> 0.0). */
   assert(bitsize == 64);
   pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
   neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
   dw[0] = ctx->i32_0;
   dw[1] = LLVMBuildSelect(
      ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
      LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
      "");
   return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
}
2662
/* Population count of src0 (8 to 128 bits), always returned as i32. */
LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intr_name;
   LLVMTypeRef intr_type;

   switch (bitsize) {
   case 128:
      intr_name = "llvm.ctpop.i128";
      intr_type = ctx->i128;
      break;
   case 64:
      intr_name = "llvm.ctpop.i64";
      intr_type = ctx->i64;
      break;
   case 32:
      intr_name = "llvm.ctpop.i32";
      intr_type = ctx->i32;
      break;
   case 16:
      intr_name = "llvm.ctpop.i16";
      intr_type = ctx->i16;
      break;
   case 8:
      intr_name = "llvm.ctpop.i8";
      intr_type = ctx->i8;
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   LLVMValueRef result =
      ac_build_intrinsic(ctx, intr_name, intr_type, &src0, 1, AC_FUNC_ATTR_READNONE);

   /* Normalize the result width to i32. */
   if (bitsize > 32)
      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
   else if (bitsize < 32)
      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");

   return result;
}
2705
/* Reverse the bits of src0 (8 to 64 bits); the result is widened/narrowed
 * to i32.
 */
LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intr_name;
   LLVMTypeRef intr_type;

   switch (bitsize) {
   case 64:
      intr_name = "llvm.bitreverse.i64";
      intr_type = ctx->i64;
      break;
   case 32:
      intr_name = "llvm.bitreverse.i32";
      intr_type = ctx->i32;
      break;
   case 16:
      intr_name = "llvm.bitreverse.i16";
      intr_type = ctx->i16;
      break;
   case 8:
      intr_name = "llvm.bitreverse.i8";
      intr_type = ctx->i8;
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   LLVMValueRef result =
      ac_build_intrinsic(ctx, intr_name, intr_type, &src0, 1, AC_FUNC_ATTR_READNONE);

   /* Normalize the result width to i32. */
   if (bitsize > 32)
      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
   else if (bitsize < 32)
      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");

   return result;
}
2743
ac_init_exec_full_mask(struct ac_llvm_context * ctx)2744 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2745 {
2746 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2747 ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1,
2748 AC_FUNC_ATTR_CONVERGENT);
2749 }
2750
ac_declare_lds_as_pointer(struct ac_llvm_context * ctx)2751 void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
2752 {
2753 unsigned lds_size = ctx->gfx_level >= GFX7 ? 65536 : 32768;
2754 ctx->lds = LLVMBuildIntToPtr(
2755 ctx->builder, ctx->i32_0,
2756 LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds");
2757 }
2758
LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
{
   /* Load one dword from LDS at the given dword index. */
   LLVMValueRef elem_ptr = ac_build_gep0(ctx, ctx->lds, dw_addr);
   return LLVMBuildLoad2(ctx->builder, ctx->i32, elem_ptr, "");
}
2763
void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
{
   /* Store one dword to LDS; cast to integer first since LDS is an i32 array. */
   ac_build_indexed_store(ctx, ctx->lds, dw_addr, ac_to_integer(ctx, value));
}
2769
/* Find the index of the least significant set bit of src0, as an i32.
 * Returns -1 when src0 is 0 (GLSL findLSB semantics).
 */
LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
{
   unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef zero;

   /* Pick the llvm.cttz overload matching the source width. */
   switch (src0_bitsize) {
   case 64:
      intrin_name = "llvm.cttz.i64";
      type = ctx->i64;
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.cttz.i32";
      type = ctx->i32;
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.cttz.i16";
      type = ctx->i16;
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.cttz.i8";
      type = ctx->i8;
      zero = ctx->i8_0;
      break;
   default:
      unreachable(!"invalid bitsize");
   }

   LLVMValueRef params[2] = {
      src0,

      /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
       * add special code to check for x=0. The reason is that
       * the LLVM behavior for x=0 is different from what we
       * need here. However, LLVM also assumes that ffs(x) is
       * in [0, 31], but GLSL expects that ffs(0) = -1, so
       * a conditional assignment to handle 0 is still required.
       *
       * The hardware already implements the correct behavior.
       */
      ctx->i1true,
   };

   LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);

   /* Normalize the result to i32. */
   if (src0_bitsize == 64) {
      lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
   } else if (src0_bitsize < 32) {
      lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
   }

   /* TODO: We need an intrinsic to skip this conditional. */
   /* Check for zero: */
   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""),
                          LLVMConstInt(ctx->i32, -1, 0), lsb, "");
}
2830
LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
{
   /* Pointer to elem_type in the constant address space. */
   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
}
2835
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
{
   /* Pointer to elem_type in the 32-bit constant address space. */
   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
}
2840
get_current_flow(struct ac_llvm_context * ctx)2841 static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
2842 {
2843 if (ctx->flow->depth > 0)
2844 return &ctx->flow->stack[ctx->flow->depth - 1];
2845 return NULL;
2846 }
2847
get_innermost_loop(struct ac_llvm_context * ctx)2848 static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
2849 {
2850 for (unsigned i = ctx->flow->depth; i > 0; --i) {
2851 if (ctx->flow->stack[i - 1].loop_entry_block)
2852 return &ctx->flow->stack[i - 1];
2853 }
2854 return NULL;
2855 }
2856
/* Push a new entry onto the control-flow stack, growing the backing array
 * geometrically when full. The new entry has no blocks assigned yet.
 *
 * NOTE(review): the realloc result is not checked; on allocation failure
 * this would lose the old array and dereference NULL — confirm whether
 * OOM is considered fatal here.
 */
static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
{
   struct ac_llvm_flow *flow;

   if (ctx->flow->depth >= ctx->flow->depth_max) {
      unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);

      ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
      ctx->flow->depth_max = new_max;
   }

   flow = &ctx->flow->stack[ctx->flow->depth];
   ctx->flow->depth++;

   /* Filled in by the caller (if/loop builders). */
   flow->next_block = NULL;
   flow->loop_entry_block = NULL;
   return flow;
}
2875
/* Name the basic block "<base><label_id>", e.g. "loop3". */
static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
{
   char name[32];

   snprintf(name, sizeof(name), "%s%d", base, label_id);
   LLVMSetValueName(LLVMBasicBlockAsValue(bb), name);
}
2882
2883 /* Append a basic block at the level of the parent flow.
2884 */
append_basic_block(struct ac_llvm_context * ctx,const char * name)2885 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
2886 {
2887 assert(ctx->flow->depth >= 1);
2888
2889 if (ctx->flow->depth >= 2) {
2890 struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];
2891
2892 return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
2893 }
2894
2895 LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
2896 return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
2897 }
2898
2899 /* Emit a branch to the given default target for the current block if
2900 * applicable -- that is, if the current block does not already contain a
2901 * branch from a break or continue.
2902 */
emit_default_branch(LLVMBuilderRef builder,LLVMBasicBlockRef target)2903 static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
2904 {
2905 if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
2906 LLVMBuildBr(builder, target);
2907 }
2908
ac_build_bgnloop(struct ac_llvm_context * ctx,int label_id)2909 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
2910 {
2911 struct ac_llvm_flow *flow = push_flow(ctx);
2912 flow->loop_entry_block = append_basic_block(ctx, "LOOP");
2913 flow->next_block = append_basic_block(ctx, "ENDLOOP");
2914 set_basicblock_name(flow->loop_entry_block, "loop", label_id);
2915 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2916 LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
2917 }
2918
ac_build_break(struct ac_llvm_context * ctx)2919 void ac_build_break(struct ac_llvm_context *ctx)
2920 {
2921 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2922 LLVMBuildBr(ctx->builder, flow->next_block);
2923 }
2924
ac_build_continue(struct ac_llvm_context * ctx)2925 void ac_build_continue(struct ac_llvm_context *ctx)
2926 {
2927 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2928 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2929 }
2930
/* Start the "else" part of an if/else: close the "then" part and move the
 * builder into the block that ac_build_ifcc reserved for the false branch.
 */
void ac_build_else(struct ac_llvm_context *ctx, int label_id)
{
   struct ac_llvm_flow *current_branch = get_current_flow(ctx);
   LLVMBasicBlockRef endif_block;

   /* Must be an if/else construct, not a loop. */
   assert(!current_branch->loop_entry_block);

   /* Terminate the "then" block with a jump to the future ENDIF block,
    * unless it already ends in a break/continue. */
   endif_block = append_basic_block(ctx, "ENDIF");
   emit_default_branch(ctx->builder, endif_block);

   /* The block ac_build_ifcc reserved as next_block becomes the else block. */
   LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
   set_basicblock_name(current_branch->next_block, "else", label_id);

   /* From now on, "next" means the ENDIF block. */
   current_branch->next_block = endif_block;
}
2946
/* Invoked after a branch is exited. Called with ctx->flow->depth already
 * decremented by the caller (ac_build_endif / ac_build_endloop).
 */
static void ac_branch_exited(struct ac_llvm_context *ctx)
{
   if (ctx->flow->depth == 0 && ctx->conditional_demote_seen) {
      /* The previous conditional branch contained demote. Kill threads
       * after all conditional blocks because amdgcn.wqm.vote doesn't
       * return usable values inside the blocks.
       *
       * This is an optional optimization that only kills whole inactive quads.
       */
      LLVMValueRef cond = LLVMBuildLoad2(ctx->builder, ctx->i1, ctx->postponed_kill, "");
      ac_build_kill_if_false(ctx, ac_build_wqm_vote(ctx, cond));
      ctx->conditional_demote_seen = false;
   }
}
2962
ac_build_endif(struct ac_llvm_context * ctx,int label_id)2963 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
2964 {
2965 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2966
2967 assert(!current_branch->loop_entry_block);
2968
2969 emit_default_branch(ctx->builder, current_branch->next_block);
2970 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2971 set_basicblock_name(current_branch->next_block, "endif", label_id);
2972
2973 ctx->flow->depth--;
2974 ac_branch_exited(ctx);
2975 }
2976
ac_build_endloop(struct ac_llvm_context * ctx,int label_id)2977 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
2978 {
2979 struct ac_llvm_flow *current_loop = get_current_flow(ctx);
2980
2981 assert(current_loop->loop_entry_block);
2982
2983 emit_default_branch(ctx->builder, current_loop->loop_entry_block);
2984
2985 LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
2986 set_basicblock_name(current_loop->next_block, "endloop", label_id);
2987 ctx->flow->depth--;
2988 ac_branch_exited(ctx);
2989 }
2990
/* Open a conditional: jump to the IF block when cond is true, otherwise to
 * the reserved block that later becomes the else/endif block.
 */
void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
{
   struct ac_llvm_flow *flow = push_flow(ctx);
   LLVMBasicBlockRef if_block = append_basic_block(ctx, "IF");

   flow->next_block = append_basic_block(ctx, "ELSE");
   set_basicblock_name(if_block, "if", label_id);

   LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
   LLVMPositionBuilderAtEnd(ctx->builder, if_block);
}
3002
ac_build_alloca_undef(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)3003 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3004 {
3005 LLVMBuilderRef builder = ac->builder;
3006 LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
3007 LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
3008 LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
3009 LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
3010 LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
3011 LLVMValueRef res;
3012
3013 if (first_instr) {
3014 LLVMPositionBuilderBefore(first_builder, first_instr);
3015 } else {
3016 LLVMPositionBuilderAtEnd(first_builder, first_block);
3017 }
3018
3019 res = LLVMBuildAlloca(first_builder, type, name);
3020 LLVMDisposeBuilder(first_builder);
3021 return res;
3022 }
3023
ac_build_alloca(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)3024 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3025 {
3026 LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3027 LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3028 return ptr;
3029 }
3030
/* Like ac_build_alloca_undef, but initialized with the given value. */
LLVMValueRef ac_build_alloca_init(struct ac_llvm_context *ac, LLVMValueRef val, const char *name)
{
   LLVMValueRef slot = ac_build_alloca_undef(ac, LLVMTypeOf(val), name);
   LLVMBuildStore(ac->builder, val, slot);
   return slot;
}
3037
/* Bitcast a pointer so it points at "type", keeping its address space. */
LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
{
   unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
   return LLVMBuildBitCast(ctx->builder, ptr, LLVMPointerType(type, addr_space), "");
}
3043
/* Return the first "count" components of "value".
 *
 * Returns the value unchanged when it already has "count" components,
 * a scalar extract when count == 1, and a shuffle otherwise.
 */
LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
{
   unsigned num_components = ac_get_llvm_num_components(value);
   if (count == num_components)
      return value;

   /* masks[0] and masks[1] are written unconditionally below, so the
    * allocation must hold at least 2 entries even for count == 1. */
   LLVMValueRef *const masks = alloca(MAX2(count, 2) * sizeof(LLVMValueRef));
   masks[0] = ctx->i32_0;
   masks[1] = ctx->i32_1;
   for (unsigned i = 2; i < count; i++)
      masks[i] = LLVMConstInt(ctx->i32, i, false);

   if (count == 1)
      return LLVMBuildExtractElement(ctx->builder, value, masks[0], "");

   LLVMValueRef swizzle = LLVMConstVector(masks, count);
   return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
}
3062
/* Extract the unsigned bitfield (param >> rshift) & ((1 << bitwidth) - 1).
 *
 * If param is i64 and bitwidth <= 32, the return value will be i32.
 */
LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
                             unsigned bitwidth)
{
   LLVMValueRef value = param;
   if (rshift)
      value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), rshift, false), "");

   if (rshift + bitwidth < 32) {
      /* Mask off the bits above the field. When rshift + bitwidth >= 32
       * the mask is skipped; for a 32-bit param the shift already removed
       * those bits. NOTE(review): for an i64 param this assumes the field
       * was packed within the low 32 bits — confirm against callers. */
      uint64_t mask = (1ull << bitwidth) - 1;
      value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(LLVMTypeOf(param), mask, false), "");
   }

   /* Narrow i64 results to i32 as documented above. */
   if (bitwidth <= 32 && LLVMTypeOf(param) == ctx->i64)
      value = LLVMBuildTrunc(ctx->builder, value, ctx->i32, "");
   return value;
}
3080
3081 /* Adjust the sample index according to FMASK.
3082 *
3083 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3084 * which is the identity mapping. Each nibble says which physical sample
3085 * should be fetched to get that sample.
3086 *
3087 * For example, 0x11111100 means there are only 2 samples stored and
3088 * the second sample covers 3/4 of the pixel. When reading samples 0
3089 * and 1, return physical sample 0 (determined by the first two 0s
3090 * in FMASK), otherwise return physical sample 1.
3091 *
3092 * The sample index should be adjusted as follows:
3093 * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3094 */
ac_apply_fmask_to_sample(struct ac_llvm_context * ac,LLVMValueRef fmask,LLVMValueRef * addr,bool is_array_tex)3095 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr,
3096 bool is_array_tex)
3097 {
3098 struct ac_image_args fmask_load = {0};
3099 fmask_load.opcode = ac_image_load;
3100 fmask_load.resource = fmask;
3101 fmask_load.dmask = 0xf;
3102 fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
3103 fmask_load.attributes = AC_FUNC_ATTR_READNONE;
3104
3105 fmask_load.coords[0] = addr[0];
3106 fmask_load.coords[1] = addr[1];
3107 if (is_array_tex)
3108 fmask_load.coords[2] = addr[2];
3109 fmask_load.a16 = ac_get_elem_bits(ac, LLVMTypeOf(addr[0])) == 16;
3110
3111 LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
3112 fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, "");
3113
3114 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3115 * resource descriptor is 0 (invalid).
3116 */
3117 LLVMValueRef tmp;
3118 tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
3119 tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
3120 tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
3121 fmask_value =
3122 LLVMBuildSelect(ac->builder, tmp, fmask_value, LLVMConstInt(ac->i32, 0x76543210, false), "");
3123
3124 /* Apply the formula. */
3125 unsigned sample_chan = is_array_tex ? 3 : 2;
3126 LLVMValueRef final_sample;
3127 final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
3128 LLVMConstInt(LLVMTypeOf(addr[0]), 4, 0), "");
3129 final_sample = LLVMBuildLShr(ac->builder, fmask_value,
3130 LLVMBuildZExt(ac->builder, final_sample, ac->i32, ""), "");
3131 /* Mask the sample index by 0x7, because 0x8 means an unknown value
3132 * with EQAA, so those will map to 0. */
3133 addr[sample_chan] = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), "");
3134 if (fmask_load.a16)
3135 addr[sample_chan] = LLVMBuildTrunc(ac->builder, final_sample, ac->i16, "");
3136 }
3137
/* Emit a readlane/readfirstlane on a value of at most 32 bits.
 *
 * The source (and lane index, if any) is zero-extended to i32 for the
 * intrinsic, and the result is truncated back to the original type.
 * lane == NULL selects readfirstlane.
 */
static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
                                       LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef result;

   /* Optionally hide src behind an optimization barrier first (see the
    * comment on ac_build_readlane_no_opt_barrier for when it is needed). */
   if (with_opt_barrier)
      ac_build_optimization_barrier(ctx, &src, false);

   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
   if (lane)
      lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");

   result =
      ac_build_intrinsic(ctx, lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
                         ctx->i32, (LLVMValueRef[]){src, lane}, lane == NULL ? 1 : 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, result, type, "");
}
3158
/* Read a lane of a value of any width.
 *
 * Values wider than 32 bits are bitcast to an <N x i32> vector and read one
 * dword at a time; pointer sources are round-tripped through integers.
 */
static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
                                             LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
   LLVMValueRef ret;

   if (bits > 32) {
      assert(bits % 32 == 0);
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
      LLVMValueRef src_vector = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      ret = LLVMGetUndef(vec_type);
      /* Read each dword separately and reassemble the vector. */
      for (unsigned i = 0; i < bits / 32; i++) {
         LLVMValueRef ret_comp;

         src = LLVMBuildExtractElement(ctx->builder, src_vector, LLVMConstInt(ctx->i32, i, 0), "");

         ret_comp = _ac_build_readlane(ctx, src, lane, with_opt_barrier);

         ret =
            LLVMBuildInsertElement(ctx->builder, ret, ret_comp, LLVMConstInt(ctx->i32, i, 0), "");
      }
   } else {
      ret = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
   }

   /* Restore the caller's original type. */
   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildIntToPtr(ctx->builder, ret, src_type, "");
   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
3190
3191 /**
3192 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
3193 *
3194 * The optimization barrier is not needed if the value is the same in all lanes
3195 * or if this is called in the outermost block.
3196 *
3197 * @param ctx
3198 * @param src
3199 * @param lane - id of the lane or NULL for the first active lane
3200 * @return value of the lane
3201 */
/* Like ac_build_readlane, but without the optimization barrier — safe when
 * the value is the same in all lanes or when called in the outermost block
 * (see the comment above). */
LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,
                                              LLVMValueRef lane)
{
   return ac_build_readlane_common(ctx, src, lane, false);
}
3207
/* Read \p src from lane \p lane (or the first active lane if \p lane is
 * NULL), with an optimization barrier applied to \p src. */
LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
{
   return ac_build_readlane_common(ctx, src, lane, true);
}
3212
/* Build llvm.amdgcn.writelane(value, lane, src): per the intrinsic's
 * semantics, returns \p src with \p value written into lane \p lane.
 * Operates on 32-bit values only (result type is i32). */
LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
                                LLVMValueRef lane)
{
   return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32,
                             (LLVMValueRef[]){value, lane, src}, 3,
                             AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}
3220
/* Build mbcnt (count the bits of \p mask corresponding to lanes below the
 * current one), plus an optional addend.
 *
 * \param mask    i32 (wave32) or i64 (wave64) lane mask
 * \param add_src value added to the count, or NULL / i32 0 for none
 */
LLVMValueRef ac_build_mbcnt_add(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef add_src)
{
   LLVMValueRef count;

   if (ctx->wave_size == 32) {
      /* Wave32: a single mbcnt_lo covers the whole mask. */
      count = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                                 (LLVMValueRef[]){mask, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
   } else {
      /* Wave64: chain mbcnt_lo into mbcnt_hi over the two halves of the mask. */
      LLVMValueRef halves = LLVMBuildBitCast(ctx->builder, mask, ctx->v2i32, "");
      LLVMValueRef lo = LLVMBuildExtractElement(ctx->builder, halves, ctx->i32_0, "");
      LLVMValueRef hi = LLVMBuildExtractElement(ctx->builder, halves, ctx->i32_1, "");
      count = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                                 (LLVMValueRef[]){lo, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
      count = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
                                 (LLVMValueRef[]){hi, count}, 2, AC_FUNC_ATTR_READNONE);
   }

   /* Bug workaround. LLVM always believes the upper bound of mbcnt to be the wave size,
    * regardless of ac_set_range_metadata. Use an extra add instruction to work around it.
    */
   if (add_src != NULL && add_src != ctx->i32_0)
      return LLVMBuildAdd(ctx->builder, count, add_src, "");

   return count;
}
3247
/* mbcnt without an addend: count of set bits in \p mask below the current lane. */
LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
{
   return ac_build_mbcnt_add(ctx, mask, ctx->i32_0);
}
3252
/* DPP (Data Parallel Primitives) control values.
 *
 * Entries with a leading underscore are base values that still need an
 * operand OR'ed into the low bits — use the dpp_quad_perm()/dpp_row_sr()
 * helpers below instead of using them directly.
 */
enum dpp_ctrl
{
   _dpp_quad_perm = 0x000,      /* + 8-bit lane-selector (2 bits per lane) */
   _dpp_row_sl = 0x100,         /* row shift left + amount */
   _dpp_row_sr = 0x110,         /* row shift right + amount */
   _dpp_row_rr = 0x120,         /* row rotate right + amount */
   dpp_wf_sl1 = 0x130,          /* wavefront shift left by 1 */
   dpp_wf_rl1 = 0x134,          /* wavefront rotate left by 1 */
   dpp_wf_sr1 = 0x138,          /* wavefront shift right by 1 */
   dpp_wf_rr1 = 0x13C,          /* wavefront rotate right by 1 */
   dpp_row_mirror = 0x140,      /* mirror lanes within a row */
   dpp_row_half_mirror = 0x141, /* mirror lanes within a half-row */
   dpp_row_bcast15 = 0x142,     /* broadcast lane 15 of the row */
   dpp_row_bcast31 = 0x143      /* broadcast lane 31 */
};
3268
dpp_quad_perm(unsigned lane0,unsigned lane1,unsigned lane2,unsigned lane3)3269 static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
3270 unsigned lane3)
3271 {
3272 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3273 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3274 }
3275
/* Build a row_shift_right DPP control for a shift of 1..15 lanes. */
static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
{
   assert(amount > 0 && amount < 16);
   return _dpp_row_sr | amount;
}
3281
/* Emit a single llvm.amdgcn.update.dpp.i32 for a value of at most 32 bits.
 *
 * \param old        value kept in lanes where the DPP move does not write
 * \param dpp_ctrl   DPP control word (see enum dpp_ctrl)
 * \param row_mask   4-bit mask selecting which rows participate
 * \param bank_mask  4-bit mask selecting which banks participate
 * \param bound_ctrl DPP bound_ctrl bit (controls handling of invalid lanes)
 */
static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                  enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                  bool bound_ctrl)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef res;

   /* The intrinsic only operates on i32; widen smaller values. */
   old = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   res = ac_build_intrinsic(
      ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32,
      (LLVMValueRef[]){old, src, LLVMConstInt(ctx->i32, dpp_ctrl, 0),
                       LLVMConstInt(ctx->i32, row_mask, 0), LLVMConstInt(ctx->i32, bank_mask, 0),
                       LLVMConstInt(ctx->i1, bound_ctrl, 0)},
      6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   /* Narrow back to the caller's type. */
   return LLVMBuildTrunc(ctx->builder, res, type, "");
}
3301
/* DPP move handling arbitrarily wide values by splitting them into i32
 * components; \p old and \p src are bitcast to integers first. */
static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                 bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef result;

   src = ac_to_integer(ctx, src);
   old = ac_to_integer(ctx, old);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));

   if (bits <= 32) {
      result = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   } else {
      /* Apply the DPP move component-wise on i32 pieces. */
      assert(bits % 32 == 0);
      unsigned num_comps = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_comps);
      LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      LLVMValueRef old_vec = LLVMBuildBitCast(ctx->builder, old, vec_type, "");

      result = LLVMGetUndef(vec_type);
      for (unsigned c = 0; c < num_comps; c++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, c, 0);
         LLVMValueRef s = LLVMBuildExtractElement(ctx->builder, src_vec, idx, "");
         LLVMValueRef o = LLVMBuildExtractElement(ctx->builder, old_vec, idx, "");
         LLVMValueRef comp = _ac_build_dpp(ctx, o, s, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
         result = LLVMBuildInsertElement(ctx->builder, result, comp, idx, "");
      }
   }
   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3330
/* Emit a single permlane16/permlanex16 for a value of at most 32 bits.
 *
 * \param sel           lane-selector: low 32 bits feed the first select
 *                      operand, high 32 bits the second
 * \param exchange_rows use permlanex16 instead of permlane16
 * \param bound_ctrl    value of the intrinsic's bound_ctrl argument
 */
static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         uint64_t sel, bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef result;

   /* The intrinsic only operates on i32; widen smaller values. */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[6] = {
      src, /* old value (also used as the data source here) */
      src,
      LLVMConstInt(ctx->i32, sel, false),
      LLVMConstInt(ctx->i32, sel >> 32, false),
      ctx->i1true, /* fi */
      bound_ctrl ? ctx->i1true : ctx->i1false,
   };

   result =
      ac_build_intrinsic(ctx, exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16",
                         ctx->i32, args, 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   /* Narrow back to the caller's type. */
   return LLVMBuildTrunc(ctx->builder, result, type, "");
}
3354
/* permlane16/permlanex16 handling arbitrarily wide values by splitting them
 * into i32 components. */
static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
                                        bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef result;

   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));

   if (bits <= 32) {
      result = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
   } else {
      assert(bits % 32 == 0);
      unsigned num_comps = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_comps);
      LLVMValueRef vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");

      result = LLVMGetUndef(vec_type);
      for (unsigned c = 0; c < num_comps; c++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, c, 0);
         LLVMValueRef comp = LLVMBuildExtractElement(ctx->builder, vec, idx, "");
         comp = _ac_build_permlane16(ctx, comp, sel, exchange_rows, bound_ctrl);
         result = LLVMBuildInsertElement(ctx->builder, result, comp, idx, "");
      }
   }
   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3378
/* Build a ds_swizzle bit-mode pattern: bits [4:0] = AND mask, [9:5] = OR
 * mask, [14:10] = XOR mask applied to the lane index within a 32-lane group. */
static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
   assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
   return (xor_mask << 10) | (or_mask << 5) | and_mask;
}
3384
/* Emit a single llvm.amdgcn.ds.swizzle for a value of at most 32 bits.
 * \param mask swizzle pattern (see ds_pattern_bitmode / quad-perm encoding). */
static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         unsigned mask)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   LLVMValueRef ret;

   /* The intrinsic only operates on i32; widen smaller values. */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   ret = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32,
                            (LLVMValueRef[]){src, LLVMConstInt(ctx->i32, mask, 0)}, 2,
                            AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   /* Narrow back to the caller's type. */
   return LLVMBuildTrunc(ctx->builder, ret, src_type, "");
}
3399
/* ds_swizzle handling arbitrarily wide values by splitting them into i32
 * components. */
LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef result;

   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));

   if (bits <= 32) {
      result = _ac_build_ds_swizzle(ctx, src, mask);
   } else {
      assert(bits % 32 == 0);
      unsigned num_comps = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_comps);
      LLVMValueRef vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");

      result = LLVMGetUndef(vec_type);
      for (unsigned c = 0; c < num_comps; c++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, c, 0);
         LLVMValueRef comp = LLVMBuildExtractElement(ctx->builder, vec, idx, "");
         comp = _ac_build_ds_swizzle(ctx, comp, mask);
         result = LLVMBuildInsertElement(ctx->builder, result, comp, idx, "");
      }
   }
   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3422
/* Wrap \p src in the type-mangled llvm.amdgcn.wwm intrinsic (e.g.
 * "llvm.amdgcn.wwm.i32"), returning the result in src's original type.
 * Values narrower than 32 bits are widened to i32 around the call. */
static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
   char name[32], type[8];
   LLVMValueRef ret;

   src = ac_to_integer(ctx, src);

   if (bitsize < 32)
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   /* Build the intrinsic name from the (possibly widened) operand type. */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
   ret = ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src}, 1,
                            AC_FUNC_ATTR_READNONE);

   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, ac_to_integer_type(ctx, src_type), "");

   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
}
3445
/* Build the type-mangled llvm.amdgcn.set.inactive intrinsic: per its
 * semantics, inactive lanes receive \p inactive as their value for later
 * whole-wave (WWM) computation. Values narrower than 32 bits are widened
 * to i32 around the call. */
static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
                                          LLVMValueRef inactive)
{
   char name[33], type[8];
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
   src = ac_to_integer(ctx, src);
   inactive = ac_to_integer(ctx, inactive);

   if (bitsize < 32) {
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
   }

   /* Build the intrinsic name from the (possibly widened) operand type. */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
   LLVMValueRef ret =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, src_type, "");

   return ret;
}
3470
/* Return the identity element of the reduction operator \p op for a value of
 * \p type_size bytes (0 = i1, 1/2/4 = 8/16/32-bit, anything else = 64-bit):
 * the constant e such that op(e, x) == x. Used to fill inactive lanes and
 * lanes shifted in during scans/reductions. */
static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
                                           unsigned type_size)
{

   if (type_size == 0) {
      /* Booleans (i1): only bitwise reductions are meaningful. */
      switch (op) {
      case nir_op_ior:
      case nir_op_ixor:
         return LLVMConstInt(ctx->i1, 0, 0);
      case nir_op_iand:
         return LLVMConstInt(ctx->i1, 1, 0);
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 1) {
      /* 8-bit integers. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i8_0;
      case nir_op_imul:
         return ctx->i8_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i8, INT8_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
      case nir_op_imax:
         return LLVMConstInt(ctx->i8, INT8_MIN, 0);
      case nir_op_umax:
         return ctx->i8_0;
      case nir_op_iand:
         return LLVMConstInt(ctx->i8, -1, 0);
      case nir_op_ior:
         return ctx->i8_0;
      case nir_op_ixor:
         return ctx->i8_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 2) {
      /* 16-bit integers and half floats. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i16_0;
      case nir_op_fadd:
         return ctx->f16_0;
      case nir_op_imul:
         return ctx->i16_1;
      case nir_op_fmul:
         return ctx->f16_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i16, INT16_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f16, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i16, INT16_MIN, 0);
      case nir_op_umax:
         return ctx->i16_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f16, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i16, -1, 0);
      case nir_op_ior:
         return ctx->i16_0;
      case nir_op_ixor:
         return ctx->i16_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 4) {
      /* 32-bit integers and floats. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i32_0;
      case nir_op_fadd:
         return ctx->f32_0;
      case nir_op_imul:
         return ctx->i32_1;
      case nir_op_fmul:
         return ctx->f32_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i32, INT32_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f32, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i32, INT32_MIN, 0);
      case nir_op_umax:
         return ctx->i32_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f32, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i32, -1, 0);
      case nir_op_ior:
         return ctx->i32_0;
      case nir_op_ixor:
         return ctx->i32_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else { /* type_size == 64bit */
      switch (op) {
      case nir_op_iadd:
         return ctx->i64_0;
      case nir_op_fadd:
         return ctx->f64_0;
      case nir_op_imul:
         return ctx->i64_1;
      case nir_op_fmul:
         return ctx->f64_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i64, INT64_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f64, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i64, INT64_MIN, 0);
      case nir_op_umax:
         return ctx->i64_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f64, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i64, -1, 0);
      case nir_op_ior:
         return ctx->i64_0;
      case nir_op_ixor:
         return ctx->i64_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   }
}
3603
/* Apply the binary reduction operator \p op to \p lhs and \p rhs.
 * Integer min/max are expanded to icmp+select; float min/max use
 * llvm.minnum/llvm.maxnum with the width chosen from lhs's type size
 * (64-bit, 32-bit, else 16-bit). Both operands must have the same type. */
static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
                                    nir_op op)
{
   bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
   bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4;
   switch (op) {
   case nir_op_iadd:
      return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
   case nir_op_fadd:
      return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
   case nir_op_imul:
      return LLVMBuildMul(ctx->builder, lhs, rhs, "");
   case nir_op_fmul:
      return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
   case nir_op_imin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmin:
      return ac_build_intrinsic(
         ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",
         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
         AC_FUNC_ATTR_READNONE);
   case nir_op_imax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmax:
      return ac_build_intrinsic(
         ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",
         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
         AC_FUNC_ATTR_READNONE);
   case nir_op_iand:
      return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
   case nir_op_ior:
      return LLVMBuildOr(ctx->builder, lhs, rhs, "");
   case nir_op_ixor:
      return LLVMBuildXor(ctx->builder, lhs, rhs, "");
   default:
      unreachable("bad reduction intrinsic");
   }
}
3650
3651 /**
3652 * \param src The value to shift.
3653 * \param identity The value to use the first lane.
3654 * \param maxprefix specifies that the result only needs to be correct for a
3655 * prefix of this many threads
3656 * \return src, shifted 1 lane up, and identity shifted into lane 0.
3657 */
static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
                                               LLVMValueRef identity, unsigned maxprefix)
{
   if (ctx->gfx_level >= GFX10) {
      /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
      LLVMValueRef active, tmp1, tmp2;
      LLVMValueRef tid = ac_get_thread_id(ctx);

      /* Shift right by 1 within each row of 16 lanes. */
      tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);

      /* Cross-row data via permlanex16, used to fix up the first lane of
       * each row below. */
      tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);

      if (maxprefix > 32) {
         /* Lane 32 needs lane 31's value, which permlanex16 does not
          * deliver across the 32-lane halves; read it explicitly. */
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");

         tmp2 = LLVMBuildSelect(ctx->builder, active,
                                ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
                                tmp2, "");

         /* Use the fixed-up value on lane 32 and on lanes with
          * tid % 32 == 16 (first lane of the second row in each half). */
         active = LLVMBuildOr(
            ctx->builder, active,
            LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
                          LLVMConstInt(ctx->i32, 0x10, false), ""),
            "");
         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      } else if (maxprefix > 16) {
         /* Only lane 16 crosses a row boundary when the prefix fits in 32. */
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");

         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      }
   } else if (ctx->gfx_level >= GFX8) {
      /* GFX8-9 have a native wavefront shift-right-by-1 DPP control. */
      return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
   }

   /* wavefront shift_right by 1 on SI/CI: compose ds_swizzle shifts of
    * increasing span and patch the lanes at each span boundary. */
   LLVMValueRef active, tmp1, tmp2;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   /* Shift right by 1 within each quad (lane 0 keeps its own value for now). */
   tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
   /* Fix up lanes with tid % 8 == 4, which need data from the previous quad. */
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
                          LLVMConstInt(ctx->i32, 0x4, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* Same fix-up across 8-lane boundaries (tid % 16 == 8)... */
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
                          LLVMConstInt(ctx->i32, 0x8, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* ...and across 16-lane boundaries (tid % 32 == 16). */
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
                          LLVMConstInt(ctx->i32, 0x10, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* Lane 32 gets lane 31's value via readlane (ds_swizzle operates within
    * 32-lane groups only). */
   tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   /* Finally, shift the identity into lane 0. */
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");
   return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
}
3720
3721 /**
3722 * \param maxprefix specifies that the result only needs to be correct for a
3723 * prefix of this many threads
3724 */
static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
                                  LLVMValueRef identity, unsigned maxprefix, bool inclusive)
{
   LLVMValueRef result, tmp;

   /* An exclusive scan is an inclusive scan of the value shifted right by
    * one lane, with the identity shifted into lane 0. */
   if (!inclusive)
      src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);

   result = src;

   if (ctx->gfx_level <= GFX7) {
      /* SI/CI have no DPP: do a log-step scan with ds_swizzle, masking each
       * step's contribution with the identity for lanes the step must not
       * affect. */
      assert(maxprefix == 64);
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;
      /* Step 1: combine with the neighbor 1 lane down (tid & 1 lanes only). */
      tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step 2: distance 2. */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step 3: distance 4. */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step 4: distance 8. */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step 5: distance 16. */
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Step 6: the upper 32 lanes add in lane 31's partial result
       * (ds_swizzle cannot cross the 32-lane boundary). */
      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   /* GFX8+: DPP-based scan. Combine with row-shifted copies at increasing
    * distances, stopping as soon as maxprefix is covered. The bank masks
    * keep later steps from corrupting the first lanes of each span. */
   if (maxprefix <= 1)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 2)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 3)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 4)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 8)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 16)
      return result;

   if (ctx->gfx_level >= GFX10) {
      /* GFX10 has no row_bcast: use permlanex16 + readlane to propagate the
       * partial results across row and half-wave boundaries. */
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;

      tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);

      /* Only lanes in the second row of each half (tid & 16) accumulate. */
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
                             ctx->i32_0, "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);

      if (maxprefix <= 32)
         return result;

      /* Lanes 32..63 accumulate lane 31's partial result. */
      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));

      active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   /* GFX8-9: broadcast lane 15 / lane 31 into the following rows. */
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 32)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   return result;
}
3835
/* Compute an inclusive (prefix) scan of \p op over the whole wave. */
LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   /* Boolean add scan: count the set lanes up to and including this one
    * via ballot + mbcnt. */
   if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
      LLVMValueRef bit = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      LLVMValueRef scan = ac_build_mbcnt(ctx, ac_build_ballot(ctx, bit));
      return LLVMBuildAdd(ctx->builder, scan, bit, "");
   }

   ac_build_optimization_barrier(ctx, &src, false);

   /* Fill inactive lanes with the identity so they don't disturb the scan,
    * run the scan in whole-wave mode, and wrap the result in WWM. */
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef value = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                                         LLVMTypeOf(identity), "");
   value = ac_build_scan(ctx, op, value, identity, ctx->wave_size, true);

   return ac_build_wwm(ctx, value);
}
3858
/* Compute an exclusive (prefix) scan of \p op over the whole wave. */
LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   LLVMValueRef result;

   /* Boolean add scan: mbcnt(ballot(src)) is exactly the number of set
    * lanes strictly below the current one. */
   if (LLVMTypeOf(src) == ctx->i1 && op == nir_op_iadd) {
      LLVMBuilderRef builder = ctx->builder;
      src = LLVMBuildZExt(builder, src, ctx->i32, "");
      result = ac_build_ballot(ctx, src);
      result = ac_build_mbcnt(ctx, result);
      return result;
   }

   ac_build_optimization_barrier(ctx, &src, false);

   /* Fill inactive lanes with the identity, then run the scan in
    * whole-wave mode (inclusive = false). */
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   result = ac_build_scan(ctx, op, result, identity, ctx->wave_size, false);

   return ac_build_wwm(ctx, result);
}
3880
/* Reduce \p op over clusters of \p cluster_size lanes (1, 2, 4, ..., wave
 * size); every lane of a cluster receives the cluster's reduced value.
 * Built from swizzles/DPP moves of doubling distance, returning through
 * ac_build_wwm as soon as the requested cluster size is covered. */
LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
                             unsigned cluster_size)
{
   if (cluster_size == 1)
      return src;
   ac_build_optimization_barrier(ctx, &src, false);
   LLVMValueRef result, swap;
   /* Fill inactive lanes with the identity so they don't affect the result. */
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   /* Distance 1: swap adjacent lanes within each quad. */
   swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 2)
      return ac_build_wwm(ctx, result);

   /* Distance 2: swap lane pairs within each quad. */
   swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 4)
      return ac_build_wwm(ctx, result);

   /* Distance 4: half-row mirror (DPP on GFX8+, ds_swizzle XOR otherwise). */
   if (ctx->gfx_level >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 8)
      return ac_build_wwm(ctx, result);

   /* Distance 8: row mirror. */
   if (ctx->gfx_level >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 16)
      return ac_build_wwm(ctx, result);

   /* Distance 16: exchange rows (permlanex16 on GFX10, row_bcast15 on
    * GFX8-9 when finishing at 32, ds_swizzle XOR on SI/CI). */
   if (ctx->gfx_level >= GFX10)
      swap = ac_build_permlane16(ctx, result, 0, true, false);
   else if (ctx->gfx_level >= GFX8 && cluster_size != 32)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 32)
      return ac_build_wwm(ctx, result);

   if (ctx->gfx_level >= GFX8) {
      /* Distance 32 (wave64 only): combine the two halves, then broadcast
       * the final value from lane 63 via readlane. */
      if (ctx->wave_size == 64) {
         if (ctx->gfx_level >= GFX10)
            swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
         else
            swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
         result = ac_build_alu_op(ctx, result, swap, op);
         result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
      }

      return ac_build_wwm(ctx, result);
   } else {
      /* SI/CI: combine the last lane of each 32-lane half via readlane. */
      swap = ac_build_readlane(ctx, result, ctx->i32_0);
      result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
      result = ac_build_alu_op(ctx, result, swap, op);
      return ac_build_wwm(ctx, result);
   }
}
3945
3946 /**
3947 * "Top half" of a scan that reduces per-wave values across an entire
3948 * workgroup.
3949 *
3950 * The source value must be present in the highest lane of the wave, and the
3951 * highest lane must be live.
3952 */
ac_build_wg_wavescan_top(struct ac_llvm_context * ctx,struct ac_wg_scan * ws)3953 void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
3954 {
3955 if (ws->maxwaves <= 1)
3956 return;
3957
3958 const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
3959 LLVMBuilderRef builder = ctx->builder;
3960 LLVMValueRef tid = ac_get_thread_id(ctx);
3961 LLVMValueRef tmp;
3962
3963 tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
3964 ac_build_ifcc(ctx, tmp, 1000);
3965 LLVMBuildStore(builder, ws->src,
3966 LLVMBuildGEP2(builder, LLVMTypeOf(ws->src), ws->scratch, &ws->waveidx, 1, ""));
3967 ac_build_endif(ctx, 1000);
3968 }
3969
/**
 * "Bottom half" of a scan that reduces per-wave values across an entire
 * workgroup.
 *
 * The caller must place a barrier between the top and bottom halves.
 *
 * Reads back the per-wave values that the top half stored to ws->scratch,
 * scans them within a single wave, and broadcasts the requested
 * reduce/inclusive/exclusive results via readlane.
 */
void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   const LLVMTypeRef type = LLVMTypeOf(ws->src);
   const LLVMValueRef identity = get_reduction_identity(ctx, ws->op, ac_get_type_size(type));

   /* Single wave: the results are just the wave's own value. */
   if (ws->maxwaves <= 1) {
      ws->result_reduce = ws->src;
      ws->result_inclusive = ws->src;
      ws->result_exclusive = identity;
      return;
   }
   /* The per-wave values are scanned within one wave via ac_build_scan. */
   assert(ws->maxwaves <= 32);

   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   LLVMBasicBlockRef bbs[2];
   LLVMValueRef phivalues_scan[2];
   LLVMValueRef tmp, tmp2;

   /* Phi input 0: lanes that skip the conditional region contribute undef. */
   bbs[0] = LLVMGetInsertBlock(builder);
   phivalues_scan[0] = LLVMGetUndef(type);

   /* Select which lanes load a per-wave value from scratch:
    * all waves for a reduction, waves <= ours for an inclusive result,
    * waves < ours for an exclusive-only result. */
   if (ws->enable_reduce)
      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
   else if (ws->enable_inclusive)
      tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
   else
      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
   ac_build_ifcc(ctx, tmp, 1001);
   {
      tmp = LLVMBuildLoad2(builder, LLVMTypeOf(ws->src),
                           LLVMBuildGEP2(builder, LLVMTypeOf(ws->src), ws->scratch, &tid, 1, ""), "");

      /* Keep the optimizer from moving the load across the scan. */
      ac_build_optimization_barrier(ctx, &tmp, false);

      /* Phi input 1: the per-wave values scanned within this wave. */
      bbs[1] = LLVMGetInsertBlock(builder);
      phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
   }
   ac_build_endif(ctx, 1001);

   const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);

   /* Broadcast the requested results from the lanes that hold them. */
   if (ws->enable_reduce) {
      tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
      ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
   }
   if (ws->enable_inclusive)
      ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
   if (ws->enable_exclusive) {
      tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
      tmp = ac_build_readlane(ctx, scan, tmp);
      tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
      /* Wave 0 has no predecessor; its exclusive result is the identity. */
      ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
   }
}
4031
/**
 * Inclusive scan of a per-wave value across an entire workgroup.
 *
 * This implies an s_barrier instruction.
 *
 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
 * of the workgroup are live. (This requirement cannot easily be relaxed in a
 * useful manner because of the barrier in the algorithm.)
 */
void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   ac_build_wg_wavescan_top(ctx, ws);
   /* Wait for the top half's scratch store (LGKM counter) before the
    * barrier, so the bottom half sees it. */
   ac_build_waitcnt(ctx, AC_WAIT_LGKM);
   ac_build_s_barrier(ctx, ws->stage);
   ac_build_wg_wavescan_bottom(ctx, ws);
}
4048
4049 /**
4050 * "Top half" of a scan that reduces per-thread values across an entire
4051 * workgroup.
4052 *
4053 * All lanes must be active when this code runs.
4054 */
ac_build_wg_scan_top(struct ac_llvm_context * ctx,struct ac_wg_scan * ws)4055 void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4056 {
4057 if (ws->enable_exclusive) {
4058 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4059 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4060 ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4061 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4062 } else {
4063 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4064 }
4065
4066 bool enable_inclusive = ws->enable_inclusive;
4067 bool enable_exclusive = ws->enable_exclusive;
4068 ws->enable_inclusive = false;
4069 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4070 ac_build_wg_wavescan_top(ctx, ws);
4071 ws->enable_inclusive = enable_inclusive;
4072 ws->enable_exclusive = enable_exclusive;
4073 }
4074
4075 /**
4076 * "Bottom half" of a scan that reduces per-thread values across an entire
4077 * workgroup.
4078 *
4079 * The caller must place a barrier between the top and bottom halves.
4080 */
ac_build_wg_scan_bottom(struct ac_llvm_context * ctx,struct ac_wg_scan * ws)4081 void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4082 {
4083 bool enable_inclusive = ws->enable_inclusive;
4084 bool enable_exclusive = ws->enable_exclusive;
4085 ws->enable_inclusive = false;
4086 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4087 ac_build_wg_wavescan_bottom(ctx, ws);
4088 ws->enable_inclusive = enable_inclusive;
4089 ws->enable_exclusive = enable_exclusive;
4090
4091 /* ws->result_reduce is already the correct value */
4092 if (ws->enable_inclusive)
4093 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4094 if (ws->enable_exclusive)
4095 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4096 }
4097
/**
 * A scan that reduces per-thread values across an entire workgroup.
 *
 * The caller must ensure that all lanes are active when this code runs
 * (WWM is insufficient!), because there is an implied barrier.
 */
void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   ac_build_wg_scan_top(ctx, ws);
   /* Wait for the top half's scratch store (LGKM counter) before the
    * barrier, so the bottom half sees it. */
   ac_build_waitcnt(ctx, AC_WAIT_LGKM);
   ac_build_s_barrier(ctx, ws->stage);
   ac_build_wg_scan_bottom(ctx, ws);
}
4111
/* Swizzle one channel pair of the two dual-source-blend outputs so that
 * even lanes end up carrying arg1's value in arg0 (and vice versa),
 * using a DPP8 even/odd lane swap around a lane-parity select. */
static void _ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
                                             LLVMValueRef *arg0, LLVMValueRef *arg1)
{
   LLVMBuilderRef builder = ctx->builder;

   LLVMValueRef a = LLVMBuildBitCast(builder, *arg0, ctx->i32, "");
   LLVMValueRef b = LLVMBuildBitCast(builder, *arg1, ctx->i32, "");

   /* DPP8 selector that swaps each even lane with the following odd lane. */
   LLVMValueRef swap_sel = LLVMConstInt(ctx->i32, 0xde54c1, 0);

   /* Swap odd,even lanes of arg_0. */
   LLVMValueRef dpp_args[2] = {a, swap_sel};
   a = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32",
                          ctx->i32, dpp_args, 2, AC_FUNC_ATTR_CONVERGENT);

   /* Swap even lanes between arg_0 and arg_1. */
   LLVMValueRef tid = ac_get_thread_id(ctx);
   LLVMValueRef parity = LLVMBuildAnd(builder, tid, ctx->i32_1, "");
   LLVMValueRef is_even = LLVMBuildICmp(builder, LLVMIntEQ, parity, ctx->i32_0, "");
   LLVMValueRef old_a = a;
   a = LLVMBuildSelect(builder, is_even, b, a, "");
   b = LLVMBuildSelect(builder, is_even, old_a, b, "");

   /* Swap odd,even lanes again to restore arg_0's lane order. */
   dpp_args[0] = a;
   dpp_args[1] = swap_sel;
   a = ac_build_intrinsic(ctx, "llvm.amdgcn.mov.dpp8.i32",
                          ctx->i32, dpp_args, 2, AC_FUNC_ATTR_CONVERGENT);

   *arg0 = a;
   *arg1 = b;
}
4148
ac_build_dual_src_blend_swizzle(struct ac_llvm_context * ctx,struct ac_export_args * mrt0,struct ac_export_args * mrt1)4149 void ac_build_dual_src_blend_swizzle(struct ac_llvm_context *ctx,
4150 struct ac_export_args *mrt0,
4151 struct ac_export_args *mrt1)
4152 {
4153 assert(ctx->gfx_level >= GFX11);
4154 assert(mrt0->enabled_channels == mrt1->enabled_channels);
4155
4156 for (int i = 0; i < 4; i++) {
4157 if (mrt0->enabled_channels & (1 << i) && mrt1->enabled_channels & (1 << i))
4158 _ac_build_dual_src_blend_swizzle(ctx, &mrt0->out[i], &mrt1->out[i]);
4159 }
4160 }
4161
/* Swizzle lanes within each quad according to the given lane mapping. */
LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
                                   unsigned lane1, unsigned lane2, unsigned lane3)
{
   const unsigned perm = dpp_quad_perm(lane0, lane1, lane2, lane3);

   /* Pre-GFX8 chips have no DPP; emulate it with an LDS swizzle. */
   if (ctx->gfx_level < GFX8)
      return ac_build_ds_swizzle(ctx, src, (1 << 15) | perm);

   return ac_build_dpp(ctx, src, src, perm, 0xf, 0xf, false);
}
4172
/* Read src from the lane selected by index (per-lane), via ds_bpermute. */
LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   /* ds_bpermute addresses lanes in bytes, so scale the lane index by 4. */
   LLVMValueRef byte_index =
      LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
   LLVMValueRef value = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef shuffled =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32,
                         (LLVMValueRef[]){byte_index, value}, 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
   return LLVMBuildTrunc(ctx->builder, shuffled, orig_type, "");
}
4186
/**
 * Extract the binary exponent of a floating-point value (frexp-style).
 *
 * \param src0    the float value (f16/f32/f64 according to \p bitsize)
 * \param bitsize bit width of \p src0: 16, 32 or 64
 * \return the exponent, as i16 for f16 inputs and i32 for f32/f64 inputs
 */
LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* const: these point at string literals. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.exp.i16.f16";
      type = ctx->i16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.exp.i32.f32";
      type = ctx->i32;
   } else {
      /* Catch unsupported widths instead of silently treating them as f64. */
      assert(bitsize == 64);
      intr = "llvm.amdgcn.frexp.exp.i32.f64";
      type = ctx->i32;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
/**
 * Extract the normalized mantissa of a floating-point value (frexp-style).
 *
 * \param src0    the float value (f16/f32/f64 according to \p bitsize)
 * \param bitsize bit width of \p src0: 16, 32 or 64
 * \return the mantissa, with the same float type as the input
 */
LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* const: these point at string literals. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.frexp.mant.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.frexp.mant.f32";
      type = ctx->f32;
   } else {
      /* Catch unsupported widths instead of silently treating them as f64. */
      assert(bitsize == 64);
      intr = "llvm.amdgcn.frexp.mant.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
4229
/**
 * Canonicalize a floating-point value (llvm.canonicalize), e.g. to flush
 * denormals / quiet NaNs per the current FP mode.
 *
 * \param src0    the float value (f16/f32/f64 according to \p bitsize)
 * \param bitsize bit width of \p src0: 16, 32 or 64
 * \return the canonicalized value, same type as the input
 */
LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   /* const: these point at string literals. */
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.canonicalize.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.canonicalize.f32";
      type = ctx->f32;
   } else {
      /* Catch unsupported widths instead of silently treating them as f64. */
      assert(bitsize == 64);
      intr = "llvm.canonicalize.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
4251
4252 /*
4253 * this takes an I,J coordinate pair,
4254 * and works out the X and Y derivatives.
4255 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4256 */
/* Takes an (I, J) coordinate pair and works out the X and Y derivatives;
 * returns DDX(I), DDX(J), DDY(I), DDY(J) gathered into one vector. */
LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
{
   LLVMValueRef derivs[4];

   for (unsigned comp = 0; comp < 2; comp++) {
      LLVMValueRef val = LLVMBuildExtractElement(ctx->builder, interp_ij,
                                                 LLVMConstInt(ctx->i32, comp, false), "");
      derivs[comp] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, val);     /* DDX */
      derivs[2 + comp] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, val); /* DDY */
   }
   return ac_build_gather_values(ctx, derivs, 4);
}
4269
ac_build_load_helper_invocation(struct ac_llvm_context * ctx)4270 LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4271 {
4272 LLVMValueRef result;
4273
4274 if (LLVM_VERSION_MAJOR >= 13) {
4275 result = ac_build_intrinsic(ctx, "llvm.amdgcn.live.mask", ctx->i1, NULL, 0,
4276 AC_FUNC_ATTR_READONLY | AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
4277 } else {
4278 result = ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0,
4279 AC_FUNC_ATTR_READNONE);
4280 }
4281 return LLVMBuildNot(ctx->builder, result, "");
4282 }
4283
ac_build_is_helper_invocation(struct ac_llvm_context * ctx)4284 LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx)
4285 {
4286 if (!ctx->postponed_kill)
4287 return ac_build_load_helper_invocation(ctx);
4288
4289 /* postponed_kill should be NULL on LLVM 13+ */
4290 assert(LLVM_VERSION_MAJOR < 13);
4291
4292 /* !(exact && postponed) */
4293 LLVMValueRef exact =
4294 ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);
4295
4296 LLVMValueRef postponed = LLVMBuildLoad2(ctx->builder, ctx->i1, ctx->postponed_kill, "");
4297 return LLVMBuildNot(ctx->builder, LLVMBuildAnd(ctx->builder, exact, postponed, ""), "");
4298 }
4299
/* Build a call to \p func and propagate the callee's calling convention to
 * the call instruction.
 *
 * NOTE(review): this uses the deprecated LLVMBuildCall (removed together
 * with typed pointers in newer LLVM); migrating to LLVMBuildCall2 would
 * require passing the function type here — confirm against the minimum
 * LLVM version this file supports.
 */
LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args,
                           unsigned num_args)
{
   LLVMValueRef ret = LLVMBuildCall(ctx->builder, func, args, num_args, "");
   LLVMSetInstructionCallConv(ret, LLVMGetFunctionCallConv(func));
   return ret;
}
4307
/**
 * Fill in the export arguments for a pixel shader MRTZ
 * (depth/stencil/samplemask/alpha) export.
 *
 * \param depth       depth value to export, or NULL
 * \param stencil     stencil value, or NULL
 * \param samplemask  sample mask, or NULL
 * \param mrt0_alpha  MRT0 alpha value, or NULL
 * \param is_last     whether this is the shader's last export (sets the
 *                    DONE bit and marks the EXEC mask valid)
 * \param args        output: the export arguments to pass to ac_build_export
 */
void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
                     LLVMValueRef samplemask, LLVMValueRef mrt0_alpha, bool is_last,
                     struct ac_export_args *args)
{
   unsigned mask = 0;
   /* The SPI Z export format depends on which of the four values exist. */
   unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL,
                                                mrt0_alpha != NULL);

   assert(depth || stencil || samplemask);

   memset(args, 0, sizeof(*args));

   if (is_last) {
      args->valid_mask = 1; /* whether the EXEC mask is valid */
      args->done = 1; /* DONE bit */
   }

   /* Specify the target we are exporting */
   args->target = V_008DFC_SQ_EXP_MRTZ;

   args->compr = 0; /* COMP flag */
   args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
   args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
   args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
   args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */

   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      assert(!depth);
      args->compr = ctx->gfx_level < GFX11; /* COMPR flag */

      if (stencil) {
         /* Stencil should be in X[23:16]. */
         stencil = ac_to_integer(ctx, stencil);
         stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");
         args->out[0] = ac_to_float(ctx, stencil);
         /* Channel masks differ between GFX11 and older chips. */
         mask |= ctx->gfx_level >= GFX11 ? 0x1 : 0x3;
      }
      if (samplemask) {
         /* SampleMask should be in Y[15:0]. */
         args->out[1] = samplemask;
         mask |= ctx->gfx_level >= GFX11 ? 0x2 : 0xc;
      }
   } else {
      /* Non-compressed formats: one value per channel. */
      if (depth) {
         args->out[0] = depth;
         mask |= 0x1;
      }
      if (stencil) {
         args->out[1] = stencil;
         mask |= 0x2;
      }
      if (samplemask) {
         args->out[2] = samplemask;
         mask |= 0x4;
      }
      if (mrt0_alpha) {
         args->out[3] = mrt0_alpha;
         mask |= 0x8;
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks
    * at the X writemask component. */
   if (ctx->gfx_level == GFX6 && ctx->family != CHIP_OLAND && ctx->family != CHIP_HAINAN)
      mask |= 0x1;

   /* Specify which components to enable */
   args->enabled_channels = mask;
}
4377
4378 /* Send GS Alloc Req message from the first wave of the group to SPI.
4379 * Message payload is:
4380 * - bits 0..10: vertices in group
4381 * - bits 12..22: primitives in group
4382 */
void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,
                                   LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tmp;
   bool export_dummy_prim = false;

   /* HW workaround for a GPU hang with 100% culling.
    * We always have to export at least 1 primitive.
    * Export a degenerate triangle using vertex 0 for all 3 vertices.
    */
   if (prim_cnt == ctx->i32_0 && ctx->gfx_level == GFX10) {
      assert(vtx_cnt == ctx->i32_0);
      prim_cnt = ctx->i32_1;
      vtx_cnt = ctx->i32_1;
      export_dummy_prim = true;
   }

   /* Only wave 0 sends the message; when wave_id is NULL the check is
    * skipped (the message is sent unconditionally). */
   if (wave_id)
      ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);

   /* Pack the payload: vertex count in bits 0..10, primitive count shifted
    * into bits 12..22 (see the message layout in the comment above). */
   tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), "");
   tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
   ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp);

   /* Export the degenerate workaround primitive from a single thread. */
   if (export_dummy_prim) {
      struct ac_ngg_prim prim = {0};
      /* The vertex indices are 0,0,0. */
      prim.passthrough = ctx->i32_0;

      struct ac_export_args pos = {0};
      /* The hw culls primitives with NaN. */
      pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = LLVMConstReal(ctx->f32, NAN);
      pos.target = V_008DFC_SQ_EXP_POS;
      pos.enabled_channels = 0xf;
      pos.done = true;

      ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""),
                    5021);
      ac_build_export_prim(ctx, &prim);
      ac_build_export(ctx, &pos);
      ac_build_endif(ctx, 5021);
   }

   if (wave_id)
      ac_build_endif(ctx, 5020);
}
4430
4431
ac_pack_edgeflags_for_export(struct ac_llvm_context * ctx,const struct ac_shader_args * args)4432 LLVMValueRef ac_pack_edgeflags_for_export(struct ac_llvm_context *ctx,
4433 const struct ac_shader_args *args)
4434 {
4435 /* Use the following trick to extract the edge flags:
4436 * extracted = v_and_b32 gs_invocation_id, 0x700 ; get edge flags at bits 8, 9, 10
4437 * shifted = v_mul_u32_u24 extracted, 0x80402u ; shift the bits: 8->9, 9->19, 10->29
4438 * result = v_and_b32 shifted, 0x20080200 ; remove garbage
4439 */
4440 LLVMValueRef tmp = LLVMBuildAnd(ctx->builder,
4441 ac_get_arg(ctx, args->gs_invocation_id),
4442 LLVMConstInt(ctx->i32, 0x700, 0), "");
4443 tmp = LLVMBuildMul(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x80402u, 0), "");
4444 return LLVMBuildAnd(ctx->builder, tmp, LLVMConstInt(ctx->i32, 0x20080200, 0), "");
4445 }
4446
ac_pack_prim_export(struct ac_llvm_context * ctx,const struct ac_ngg_prim * prim)4447 LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4448 {
4449 /* The prim export format is:
4450 * - bits 0..8: index 0
4451 * - bit 9: edge flag 0
4452 * - bits 10..18: index 1
4453 * - bit 19: edge flag 1
4454 * - bits 20..28: index 2
4455 * - bit 29: edge flag 2
4456 * - bit 31: null primitive (skip)
4457 */
4458 LLVMBuilderRef builder = ctx->builder;
4459 LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, "");
4460 LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), "");
4461 result = LLVMBuildOr(ctx->builder, result, prim->edgeflags, "");
4462
4463 for (unsigned i = 0; i < prim->num_vertices; ++i) {
4464 tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), "");
4465 result = LLVMBuildOr(builder, result, tmp, "");
4466 }
4467 return result;
4468 }
4469
ac_build_export_prim(struct ac_llvm_context * ctx,const struct ac_ngg_prim * prim)4470 void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4471 {
4472 struct ac_export_args args;
4473
4474 if (prim->passthrough) {
4475 args.out[0] = prim->passthrough;
4476 } else {
4477 args.out[0] = ac_pack_prim_export(ctx, prim);
4478 }
4479
4480 args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, "");
4481 args.out[1] = LLVMGetUndef(ctx->f32);
4482 args.out[2] = LLVMGetUndef(ctx->f32);
4483 args.out[3] = LLVMGetUndef(ctx->f32);
4484
4485 args.target = V_008DFC_SQ_EXP_PRIM;
4486 args.enabled_channels = 1;
4487 args.done = true;
4488 args.valid_mask = false;
4489 args.compr = false;
4490
4491 ac_build_export(ctx, &args);
4492 }
4493
arg_llvm_type(enum ac_arg_type type,unsigned size,struct ac_llvm_context * ctx)4494 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
4495 {
4496 if (type == AC_ARG_FLOAT) {
4497 return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
4498 } else if (type == AC_ARG_INT) {
4499 return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
4500 } else {
4501 LLVMTypeRef ptr_type;
4502 switch (type) {
4503 case AC_ARG_CONST_PTR:
4504 ptr_type = ctx->i8;
4505 break;
4506 case AC_ARG_CONST_FLOAT_PTR:
4507 ptr_type = ctx->f32;
4508 break;
4509 case AC_ARG_CONST_PTR_PTR:
4510 ptr_type = ac_array_in_const32_addr_space(ctx->i8);
4511 break;
4512 case AC_ARG_CONST_DESC_PTR:
4513 ptr_type = ctx->v4i32;
4514 break;
4515 case AC_ARG_CONST_IMAGE_PTR:
4516 ptr_type = ctx->v8i32;
4517 break;
4518 default:
4519 unreachable("unknown arg type");
4520 }
4521 if (size == 1) {
4522 return ac_array_in_const32_addr_space(ptr_type);
4523 } else {
4524 assert(size == 2);
4525 return ac_array_in_const_addr_space(ptr_type);
4526 }
4527 }
4528 }
4529
/**
 * Create the shader's main function from the ABI argument list and position
 * the builder at the start of its body.
 *
 * SGPR arguments are marked inreg; SGPR pointer arguments additionally get
 * noalias, dereferenceable and align attributes. Also sets the denormal
 * mode via function attributes (IEEE for f16/f64, flush for f32).
 *
 * \return the new LLVM function (also stored in ctx->main_function)
 */
LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
                           enum ac_llvm_calling_convention convention, const char *name,
                           LLVMTypeRef ret_type, LLVMModuleRef module)
{
   LLVMTypeRef arg_types[AC_MAX_ARGS];

   /* Translate each ABI argument descriptor to an LLVM type. */
   for (unsigned i = 0; i < args->arg_count; i++) {
      arg_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
   }

   LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, args->arg_count, 0);

   LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);
   LLVMBasicBlockRef main_function_body =
      LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");
   LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);

   LLVMSetFunctionCallConv(main_function, convention);
   for (unsigned i = 0; i < args->arg_count; ++i) {
      LLVMValueRef P = LLVMGetParam(main_function, i);

      /* Only SGPR arguments get attributes; VGPR arguments are left as-is. */
      if (args->args[i].file != AC_ARG_SGPR)
         continue;

      /* Parameter attribute indices are 1-based here (i + 1). */
      ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG);

      if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
         ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
         ac_add_attr_dereferenceable(P, UINT64_MAX);
         ac_add_attr_alignment(P, 4);
      }
   }

   ctx->main_function = main_function;

   /* Enable denormals for FP16 and FP64: */
   LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");
   /* Disable denormals for FP32: */
   LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
                                      "preserve-sign,preserve-sign");
   return main_function;
}
4572
ac_build_s_endpgm(struct ac_llvm_context * ctx)4573 void ac_build_s_endpgm(struct ac_llvm_context *ctx)
4574 {
4575 LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
4576 LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);
4577 LLVMBuildCall2(ctx->builder, calltype, code, NULL, 0, "");
4578 }
4579
4580 /**
4581 * Convert triangle strip indices to triangle indices. This is used to decompose
4582 * triangle strips into triangles.
4583 */
void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd,
                                                 LLVMValueRef flatshade_first,
                                                 LLVMValueRef index[3])
{
   LLVMBuilderRef builder = ctx->builder;

   /* We need to change the vertex order for odd triangles to get correct
    * front/back facing by swapping 2 vertex indices, but we also have to
    * keep the provoking vertex in the same place.
    *
    * If the first vertex is provoking, swap index 1 and 2.
    * If the last vertex is provoking, swap index 0 and 1.
    */
   /* Candidates for each slot under the "odd triangle" swap: */
   LLVMValueRef odd_first_0 = LLVMBuildSelect(builder, is_odd, index[1], index[2], "");
   LLVMValueRef odd_first_1 = LLVMBuildSelect(builder, is_odd, index[2], index[1], "");
   LLVMValueRef odd_last_0 = LLVMBuildSelect(builder, is_odd, index[1], index[0], "");
   LLVMValueRef odd_last_1 = LLVMBuildSelect(builder, is_odd, index[0], index[1], "");

   LLVMValueRef out[3];
   out[0] = LLVMBuildSelect(builder, flatshade_first, index[0], odd_last_0, "");
   out[1] = LLVMBuildSelect(builder, flatshade_first, odd_first_1, odd_last_1, "");
   out[2] = LLVMBuildSelect(builder, flatshade_first, odd_first_0, index[2], "");
   memcpy(index, out, sizeof(out));
}
4607
/* Return an i1 that is true when the f32 value is any NaN or infinity. */
LLVMValueRef ac_build_is_inf_or_nan(struct ac_llvm_context *ctx, LLVMValueRef a)
{
   /* amdgcn.class with all NaN and infinity class bits selected. */
   const unsigned fp_classes = S_NAN | Q_NAN | N_INFINITY | P_INFINITY;
   LLVMValueRef args[2] = {
      a,
      LLVMConstInt(ctx->i32, fp_classes, 0),
   };
   return ac_build_intrinsic(ctx, "llvm.amdgcn.class.f32", ctx->i1, args, 2,
                             AC_FUNC_ATTR_READNONE);
}
4617