1 /*
2 * Copyright 2014 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sub license, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
15 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
16 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
17 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
18 * USE OR OTHER DEALINGS IN THE SOFTWARE.
19 *
20 * The above copyright notice and this permission notice (including the
21 * next paragraph) shall be included in all copies or substantial portions
22 * of the Software.
23 *
24 */
25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
26 #include "ac_llvm_build.h"
27
28 #include "ac_exp_param.h"
29 #include "ac_llvm_util.h"
30 #include "ac_shader_util.h"
31 #include "c11/threads.h"
32 #include "shader_enums.h"
33 #include "sid.h"
34 #include "util/bitscan.h"
35 #include "util/macros.h"
36 #include "util/u_atomic.h"
37 #include "util/u_math.h"
38 #include <llvm-c/Core.h>
39 #include <llvm/Config/llvm-config.h>
40
41 #include <assert.h>
42 #include <stdio.h>
43
44 #define AC_LLVM_INITIAL_CF_DEPTH 4
45
46 /* Data for if/else/endif and bgnloop/endloop control flow structures.
47 */
struct ac_llvm_flow {
   /* Loop exit or next part of if/else/endif. */
   LLVMBasicBlockRef next_block;
   /* Block where the loop body begins (for bgnloop/endloop). */
   LLVMBasicBlockRef loop_entry_block;
};
53
54 /* Initialize module-independent parts of the context.
55 *
56 * The caller is responsible for initializing ctx::module and ctx::builder.
57 */
void ac_llvm_context_init(struct ac_llvm_context *ctx, struct ac_llvm_compiler *compiler,
                          enum chip_class chip_class, enum radeon_family family,
                          enum ac_float_mode float_mode, unsigned wave_size,
                          unsigned ballot_mask_bits)
{
   /* Each context owns a private LLVMContext; module and builder are
    * created below from it. */
   ctx->context = LLVMContextCreate();

   ctx->chip_class = chip_class;
   ctx->family = family;
   ctx->wave_size = wave_size;
   ctx->ballot_mask_bits = ballot_mask_bits;
   ctx->float_mode = float_mode;
   /* Wave32 shaders use a dedicated target machine. */
   ctx->module =
      ac_create_module(wave_size == 32 ? compiler->tm_wave32 : compiler->tm, ctx->context);
   ctx->builder = ac_create_builder(ctx->context, float_mode);

   /* Cache commonly used scalar and vector types. */
   ctx->voidt = LLVMVoidTypeInContext(ctx->context);
   ctx->i1 = LLVMInt1TypeInContext(ctx->context);
   ctx->i8 = LLVMInt8TypeInContext(ctx->context);
   ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
   ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
   ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
   ctx->i128 = LLVMIntTypeInContext(ctx->context, 128);
   ctx->intptr = ctx->i32;
   ctx->f16 = LLVMHalfTypeInContext(ctx->context);
   ctx->f32 = LLVMFloatTypeInContext(ctx->context);
   ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
   ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
   ctx->v4i16 = LLVMVectorType(ctx->i16, 4);
   ctx->v2f16 = LLVMVectorType(ctx->f16, 2);
   ctx->v4f16 = LLVMVectorType(ctx->f16, 4);
   ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
   ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
   ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
   ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
   ctx->v3f32 = LLVMVectorType(ctx->f32, 3);
   ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
   ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
   /* Mask types sized by the wave / ballot width passed in. */
   ctx->iN_wavemask = LLVMIntTypeInContext(ctx->context, ctx->wave_size);
   ctx->iN_ballotmask = LLVMIntTypeInContext(ctx->context, ballot_mask_bits);

   /* Cache commonly used constants. */
   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
   ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
   ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
   ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
   ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
   ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
   ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
   ctx->i128_0 = LLVMConstInt(ctx->i128, 0, false);
   ctx->i128_1 = LLVMConstInt(ctx->i128, 1, false);
   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
   ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
   ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
   ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
   ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);

   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);

   /* Metadata kind IDs used when annotating loads/ranges. */
   ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context, "range", 5);

   ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context, "invariant.load", 14);

   ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);

   ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
   /* Control-flow stack; freed in ac_llvm_context_dispose.
    * NOTE(review): the calloc result is not checked here — dispose must
    * tolerate a NULL flow pointer. */
   ctx->flow = calloc(1, sizeof(*ctx->flow));
}
128
ac_llvm_context_dispose(struct ac_llvm_context * ctx)129 void ac_llvm_context_dispose(struct ac_llvm_context *ctx)
130 {
131 free(ctx->flow->stack);
132 free(ctx->flow);
133 ctx->flow = NULL;
134 }
135
/* Return the number of vector components of a value; scalars count as 1. */
int ac_get_llvm_num_components(LLVMValueRef value)
{
   LLVMTypeRef t = LLVMTypeOf(value);

   if (LLVMGetTypeKind(t) == LLVMVectorTypeKind)
      return LLVMGetVectorSize(t);
   return 1;
}
143
/* Extract element \p index from \p value; a scalar is passed through
 * unchanged (only index 0 is valid then). */
LLVMValueRef ac_llvm_extract_elem(struct ac_llvm_context *ac, LLVMValueRef value, int index)
{
   LLVMTypeRef type = LLVMTypeOf(value);

   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
      LLVMValueRef idx = LLVMConstInt(ac->i32, index, false);
      return LLVMBuildExtractElement(ac->builder, value, idx, "");
   }

   assert(index == 0);
   return value;
}
153
/* Return the bit width of a scalar type, or of the element type of a
 * vector. LDS pointers count as 32 bits; other pointers are unhandled. */
int ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
{
   LLVMTypeRef elem = type;

   if (LLVMGetTypeKind(elem) == LLVMVectorTypeKind)
      elem = LLVMGetElementType(elem);

   switch (LLVMGetTypeKind(elem)) {
   case LLVMIntegerTypeKind:
      return LLVMGetIntTypeWidth(elem);
   case LLVMPointerTypeKind:
      if (LLVMGetPointerAddressSpace(elem) == AC_ADDR_SPACE_LDS)
         return 32;
      break;
   default:
      break;
   }

   if (elem == ctx->f16)
      return 16;
   if (elem == ctx->f32)
      return 32;
   if (elem == ctx->f64)
      return 64;

   unreachable("Unhandled type kind in get_elem_bits");
}
176
/* Return the size in bytes of a type, recursing into vectors and arrays.
 * Pointers are 8 bytes except 32-bit const pointers (4 bytes). */
unsigned ac_get_type_size(LLVMTypeRef type)
{
   const LLVMTypeKind kind = LLVMGetTypeKind(type);

   if (kind == LLVMIntegerTypeKind)
      return LLVMGetIntTypeWidth(type) / 8;
   if (kind == LLVMHalfTypeKind)
      return 2;
   if (kind == LLVMFloatTypeKind)
      return 4;
   if (kind == LLVMDoubleTypeKind)
      return 8;
   if (kind == LLVMPointerTypeKind)
      return LLVMGetPointerAddressSpace(type) == AC_ADDR_SPACE_CONST_32BIT ? 4 : 8;
   if (kind == LLVMVectorTypeKind)
      return LLVMGetVectorSize(type) * ac_get_type_size(LLVMGetElementType(type));
   if (kind == LLVMArrayTypeKind)
      return LLVMGetArrayLength(type) * ac_get_type_size(LLVMGetElementType(type));

   assert(0);
   return 0;
}
203
/* Map a scalar type to the integer type of the same bit width.
 * i1 and i8 map to themselves (no float counterpart). */
static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i1 || t == ctx->i8)
      return t;
   if (t == ctx->f16 || t == ctx->i16)
      return ctx->i16;
   if (t == ctx->f32 || t == ctx->i32)
      return ctx->i32;
   if (t == ctx->f64 || t == ctx->i64)
      return ctx->i64;
   unreachable("Unhandled integer size");
}
219
/* Return the integer type with the same layout as \p t: element-wise for
 * vectors, and the pointer-sized integer for supported address spaces. */
LLVMTypeRef ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   LLVMTypeKind kind = LLVMGetTypeKind(t);

   if (kind == LLVMVectorTypeKind) {
      LLVMTypeRef scalar = to_integer_type_scalar(ctx, LLVMGetElementType(t));
      return LLVMVectorType(scalar, LLVMGetVectorSize(t));
   }

   if (kind == LLVMPointerTypeKind) {
      switch (LLVMGetPointerAddressSpace(t)) {
      case AC_ADDR_SPACE_GLOBAL:
         return ctx->i64;
      case AC_ADDR_SPACE_CONST_32BIT:
      case AC_ADDR_SPACE_LDS:
         return ctx->i32;
      default:
         unreachable("unhandled address space");
      }
   }

   return to_integer_type_scalar(ctx, t);
}
239
/* Reinterpret \p v as an integer of the same width: ptrtoint for pointers,
 * bitcast for everything else. */
LLVMValueRef ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef src_type = LLVMTypeOf(v);
   LLVMTypeRef dst_type = ac_to_integer_type(ctx, src_type);

   if (LLVMGetTypeKind(src_type) == LLVMPointerTypeKind)
      return LLVMBuildPtrToInt(ctx->builder, v, dst_type, "");
   return LLVMBuildBitCast(ctx->builder, v, dst_type, "");
}
248
/* Like ac_to_integer, but pointers are returned unchanged. */
LLVMValueRef ac_to_integer_or_pointer(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   bool is_pointer = LLVMGetTypeKind(LLVMTypeOf(v)) == LLVMPointerTypeKind;
   return is_pointer ? v : ac_to_integer(ctx, v);
}
256
/* Map a scalar type to the float type of the same bit width.
 * There is no 8-bit float, so i8 maps to itself. */
static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (t == ctx->i8)
      return ctx->i8;
   if (t == ctx->i16 || t == ctx->f16)
      return ctx->f16;
   if (t == ctx->i32 || t == ctx->f32)
      return ctx->f32;
   if (t == ctx->i64 || t == ctx->f64)
      return ctx->f64;
   unreachable("Unhandled float size");
}
270
/* Return the float type with the same layout as \p t, element-wise for
 * vectors. */
LLVMTypeRef ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
{
   if (LLVMGetTypeKind(t) != LLVMVectorTypeKind)
      return to_float_type_scalar(ctx, t);

   LLVMTypeRef scalar = to_float_type_scalar(ctx, LLVMGetElementType(t));
   return LLVMVectorType(scalar, LLVMGetVectorSize(t));
}
279
/* Bitcast \p v to the float type of the same total width. */
LLVMValueRef ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
{
   LLVMTypeRef float_type = ac_to_float_type(ctx, LLVMTypeOf(v));
   return LLVMBuildBitCast(ctx->builder, v, float_type, "");
}
285
/* Emit a call to the named intrinsic, declaring it on first use.
 *
 * The declaration's parameter types are derived from the actual argument
 * values, so all params must be non-NULL. attrib_mask is a set of
 * AC_FUNC_ATTR_* flags; with AC_FUNC_ATTR_LEGACY the attributes go on the
 * function declaration, otherwise on the call site.
 */
LLVMValueRef ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
                                LLVMTypeRef return_type, LLVMValueRef *params, unsigned param_count,
                                unsigned attrib_mask)
{
   LLVMValueRef function, call;
   bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);

   function = LLVMGetNamedFunction(ctx->module, name);
   if (!function) {
      LLVMTypeRef param_types[32], function_type;
      unsigned i;

      assert(param_count <= 32);

      /* Build the signature from the argument types. */
      for (i = 0; i < param_count; ++i) {
         assert(params[i]);
         param_types[i] = LLVMTypeOf(params[i]);
      }
      function_type = LLVMFunctionType(return_type, param_types, param_count, 0);
      function = LLVMAddFunction(ctx->module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      LLVMSetLinkage(function, LLVMExternalLinkage);

      /* Legacy mode: attributes belong on the declaration. */
      if (!set_callsite_attrs)
         ac_add_func_attributes(ctx->context, function, attrib_mask);
   }

   call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
   if (set_callsite_attrs)
      ac_add_func_attributes(ctx->context, call, attrib_mask);
   return call;
}
319
320 /**
321 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
322 * intrinsic names).
323 */
/**
 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
 * intrinsic names), writing at most \p bufsize bytes into \p buf.
 */
void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
{
   LLVMTypeRef elem_type = type;

   assert(bufsize >= 8);

   if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
      int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
      /* snprintf reports truncation by returning >= bufsize; the original
       * code only caught ret < 0, so a truncated prefix would advance buf
       * past the end of the buffer and wrap the unsigned bufsize. Treat
       * truncation as an error too. */
      if (ret < 0 || (unsigned)ret >= bufsize) {
         char *type_name = LLVMPrintTypeToString(type);
         fprintf(stderr, "Error building type name for: %s\n", type_name);
         LLVMDisposeMessage(type_name);
         return;
      }
      elem_type = LLVMGetElementType(type);
      buf += ret;
      bufsize -= ret;
   }
   /* Append the element-type suffix; unhandled kinds leave buf as-is. */
   switch (LLVMGetTypeKind(elem_type)) {
   default:
      break;
   case LLVMIntegerTypeKind:
      snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
      break;
   case LLVMHalfTypeKind:
      snprintf(buf, bufsize, "f16");
      break;
   case LLVMFloatTypeKind:
      snprintf(buf, bufsize, "f32");
      break;
   case LLVMDoubleTypeKind:
      snprintf(buf, bufsize, "f64");
      break;
   }
}
359
360 /**
361 * Helper function that builds an LLVM IR PHI node and immediately adds
362 * incoming edges.
363 */
/**
 * Helper function that builds an LLVM IR PHI node and immediately adds
 * incoming edges.
 */
LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
                          LLVMValueRef *values, LLVMBasicBlockRef *blocks)
{
   LLVMValueRef result = LLVMBuildPhi(ctx->builder, type, "");
   /* Wire up all (value, predecessor-block) pairs at once. */
   LLVMAddIncoming(result, values, blocks, count_incoming);
   return result;
}
371
/* Emit the llvm.amdgcn.s.barrier intrinsic (marked convergent). */
void ac_build_s_barrier(struct ac_llvm_context *ctx)
{
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
}
376
377 /* Prevent optimizations (at least of memory accesses) across the current
378 * point in the program by emitting empty inline assembly that is marked as
379 * having side effects.
380 *
381 * Optionally, a value can be passed through the inline assembly to prevent
382 * LLVM from hoisting calls to ReadNone functions.
383 */
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pvgpr)
{
   /* Monotonically increasing counter making each inline-asm string unique,
    * so LLVM cannot merge or CSE two barriers. */
   static int counter = 0;

   LLVMBuilderRef builder = ctx->builder;
   char code[16];

   snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));

   if (!pvgpr) {
      /* Pure barrier: empty asm marked as having side effects. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
      LLVMBuildCall(builder, inlineasm, NULL, 0, "");
   } else {
      /* Pass the first dword of *pvgpr through the asm; the "=v,0"
       * constraint ties the output register to input operand 0. */
      LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
      LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
      LLVMTypeRef type = LLVMTypeOf(*pvgpr);
      unsigned bitsize = ac_get_elem_bits(ctx, type);
      LLVMValueRef vgpr = *pvgpr;
      LLVMTypeRef vgpr_type;
      unsigned vgpr_size;
      LLVMValueRef vgpr0;

      /* Widen sub-dword values so the i32-based plumbing below works. */
      if (bitsize < 32)
         vgpr = LLVMBuildZExt(ctx->builder, vgpr, ctx->i32, "");

      vgpr_type = LLVMTypeOf(vgpr);
      vgpr_size = ac_get_type_size(vgpr_type);

      assert(vgpr_size % 4 == 0);

      /* View the value as a vector of dwords, route element 0 through the
       * asm, then reassemble and cast back to the original type. */
      vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
      vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
      vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
      vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
      vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");

      /* Undo the widening from above. */
      if (bitsize < 32)
         vgpr = LLVMBuildTrunc(builder, vgpr, type, "");

      *pvgpr = vgpr;
   }
}
427
ac_build_shader_clock(struct ac_llvm_context * ctx,nir_scope scope)428 LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope)
429 {
430 const char *subgroup = LLVM_VERSION_MAJOR >= 9 ? "llvm.readcyclecounter" : "llvm.amdgcn.s.memtime";
431 const char *name = scope == NIR_SCOPE_DEVICE ? "llvm.amdgcn.s.memrealtime" : subgroup;
432
433 LLVMValueRef tmp = ac_build_intrinsic(ctx, name, ctx->i64, NULL, 0, 0);
434 return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
435 }
436
/* Return a wave-wide lane mask (iN_wavemask) with a bit set for every
 * active lane on which \p value is non-zero, via amdgcn.icmp(value != 0). */
LLVMValueRef ac_build_ballot(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   /* The icmp intrinsic below compares i32 operands; widen i1 predicates. */
   if (LLVMTypeOf(value) == ctx->i1)
      value = LLVMBuildZExt(ctx->builder, value, ctx->i32, "");

   /* LLVM 9 renamed the intrinsic to encode both result and operand type. */
   if (LLVM_VERSION_MAJOR >= 9) {
      if (ctx->wave_size == 64)
         name = "llvm.amdgcn.icmp.i64.i32";
      else
         name = "llvm.amdgcn.icmp.i32.i32";
   } else {
      name = "llvm.amdgcn.icmp.i32";
   }
   /* args = (value, 0, NE) -> "value != 0" evaluated per lane. */
   LLVMValueRef args[3] = {value, ctx->i32_0, LLVMConstInt(ctx->i32, LLVMIntNE, 0)};

   /* We currently have no other way to prevent LLVM from lifting the icmp
    * calls to a dominating basic block.
    */
   ac_build_optimization_barrier(ctx, &args[0]);

   args[0] = ac_to_integer(ctx, args[0]);

   return ac_build_intrinsic(
      ctx, name, ctx->iN_wavemask, args, 3,
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}
465
/* Turn an i1 predicate into a wave-wide lane mask (iN_wavemask) using
 * amdgcn.icmp(value != false). */
LLVMValueRef ac_get_i1_sgpr_mask(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   /* LLVM 9 encodes result and operand types in the intrinsic name. */
   if (LLVM_VERSION_MAJOR >= 9)
      name = ctx->wave_size == 64 ? "llvm.amdgcn.icmp.i64.i1" : "llvm.amdgcn.icmp.i32.i1";
   else
      name = "llvm.amdgcn.icmp.i1";

   LLVMValueRef args[3] = {
      value,
      ctx->i1false,
      LLVMConstInt(ctx->i32, LLVMIntNE, 0),
   };

   return ac_build_intrinsic(
      ctx, name, ctx->iN_wavemask, args, 3,
      AC_FUNC_ATTR_NOUNWIND | AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}
488
/* True iff \p value holds on every active lane: the ballot of value must
 * equal the ballot of an always-true predicate. */
LLVMValueRef ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef all_lanes = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef true_lanes = ac_build_ballot(ctx, value);
   return LLVMBuildICmp(ctx->builder, LLVMIntEQ, true_lanes, all_lanes, "");
}
495
/* True iff \p value holds on at least one active lane (ballot != 0). */
LLVMValueRef ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef true_lanes = ac_build_ballot(ctx, value);
   LLVMValueRef zero_mask = LLVMConstInt(ctx->iN_wavemask, 0, 0);
   return LLVMBuildICmp(ctx->builder, LLVMIntNE, true_lanes, zero_mask, "");
}
502
/* True iff \p value is uniform across the active lanes: either every lane
 * voted true, or no lane did. */
LLVMValueRef ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMValueRef all_lanes = ac_build_ballot(ctx, ctx->i32_1);
   LLVMValueRef true_lanes = ac_build_ballot(ctx, value);
   LLVMValueRef zero_mask = LLVMConstInt(ctx->iN_wavemask, 0, 0);

   LLVMValueRef every = LLVMBuildICmp(ctx->builder, LLVMIntEQ, true_lanes, all_lanes, "");
   LLVMValueRef no_one = LLVMBuildICmp(ctx->builder, LLVMIntEQ, true_lanes, zero_mask, "");
   return LLVMBuildOr(ctx->builder, every, no_one, "");
}
513
/* Gather values[component .. component+value_count) into a vector; with a
 * single value, return values[component] as a scalar. */
LLVMValueRef ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                            unsigned value_count, unsigned component)
{
   if (!value_count)
      unreachable("value_count is 0");
   if (value_count == 1)
      return values[component];

   LLVMValueRef vec =
      LLVMGetUndef(LLVMVectorType(LLVMTypeOf(values[component]), value_count));

   for (unsigned i = 0; i < value_count; i++) {
      LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
      vec = LLVMBuildInsertElement(ctx->builder, vec, values[component + i], index, "");
   }
   return vec;
}
534
/* Gather value_count values (stepping by value_stride through \p values)
 * into a vector. With load set, each element is loaded through its pointer
 * first. A single value is returned as a scalar unless always_vector. */
LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                             unsigned value_count, unsigned value_stride, bool load,
                                             bool always_vector)
{
   LLVMBuilderRef builder = ctx->builder;

   if (value_count == 1 && !always_vector)
      return load ? LLVMBuildLoad(builder, values[0], "") : values[0];
   if (!value_count)
      unreachable("value_count is 0");

   LLVMValueRef vec = NULL;
   for (unsigned i = 0; i < value_count; i++) {
      LLVMValueRef elem = values[i * value_stride];
      if (load)
         elem = LLVMBuildLoad(builder, elem, "");

      /* The element type is only known once the first value is in hand. */
      if (!vec)
         vec = LLVMGetUndef(LLVMVectorType(LLVMTypeOf(elem), value_count));
      vec = LLVMBuildInsertElement(builder, vec, elem, LLVMConstInt(ctx->i32, i, false), "");
   }
   return vec;
}
562
/* Convenience wrapper around ac_build_gather_values_extended: contiguous
 * values (stride 1), no load, scalar returned unchanged for a count of 1. */
LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
                                    unsigned value_count)
{
   return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
}
568
569 /* Expand a scalar or vector to <dst_channels x type> by filling the remaining
570 * channels with undef. Extract at most src_channels components from the input.
571 */
/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
 * channels with undef. Extract at most src_channels components from the input.
 */
static LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
                                    unsigned src_channels, unsigned dst_channels)
{
   LLVMTypeRef value_type = LLVMTypeOf(value);
   LLVMTypeRef elemtype;
   LLVMValueRef *const components = alloca(dst_channels * sizeof(LLVMValueRef));

   if (LLVMGetTypeKind(value_type) == LLVMVectorTypeKind) {
      unsigned vec_size = LLVMGetVectorSize(value_type);

      /* Fast path: the input already has the requested shape. */
      if (src_channels == dst_channels && vec_size == dst_channels)
         return value;

      /* Never read past the end of the input vector. */
      src_channels = MIN2(src_channels, vec_size);

      for (unsigned c = 0; c < src_channels; c++)
         components[c] = ac_llvm_extract_elem(ctx, value, c);

      elemtype = LLVMGetElementType(value_type);
   } else {
      if (src_channels) {
         assert(src_channels == 1);
         components[0] = value;
      }
      elemtype = value_type;
   }

   /* Pad the remaining channels with undef. */
   for (unsigned c = src_channels; c < dst_channels; c++)
      components[c] = LLVMGetUndef(elemtype);

   return ac_build_gather_values(ctx, components, dst_channels);
}
603
604 /* Extract components [start, start + channels) from a vector.
605 */
/* Extract components [start, start + channels) from a vector.
 */
LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
                                   unsigned channels)
{
   LLVMValueRef *const comps = alloca(channels * sizeof(LLVMValueRef));

   for (unsigned c = 0; c < channels; c++)
      comps[c] = ac_llvm_extract_elem(ctx, value, start + c);

   return ac_build_gather_values(ctx, comps, channels);
}
616
617 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
618 * with undef. Extract at most num_channels components from the input.
619 */
/* Expand a scalar or vector to <4 x type> via ac_build_expand: missing
 * channels become undef, and at most num_channels are read from the input. */
LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
                                     unsigned num_channels)
{
   return ac_build_expand(ctx, value, num_channels, 4);
}
625
/* Round \p value via llvm.rint, selecting the overload by operand size. */
LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   const char *name;

   switch (ac_get_type_size(LLVMTypeOf(value))) {
   case 2:
      name = "llvm.rint.f16";
      break;
   case 4:
      name = "llvm.rint.f32";
      break;
   default:
      name = "llvm.rint.f64";
      break;
   }

   return ac_build_intrinsic(ctx, name, LLVMTypeOf(value), &value, 1, AC_FUNC_ATTR_READNONE);
}
640
/* Build num / den, normally lowered to num * rcp(den). */
LLVMValueRef ac_build_fdiv(struct ac_llvm_context *ctx, LLVMValueRef num, LLVMValueRef den)
{
   unsigned type_size = ac_get_type_size(LLVMTypeOf(den));

   /* For doubles, we need precise division to pass GLCTS. */
   if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL && type_size == 8)
      return LLVMBuildFDiv(ctx->builder, num, den, "");

   const char *name;
   switch (type_size) {
   case 2:
      name = "llvm.amdgcn.rcp.f16";
      break;
   case 4:
      name = "llvm.amdgcn.rcp.f32";
      break;
   default:
      name = "llvm.amdgcn.rcp.f64";
      break;
   }

   LLVMValueRef rcp =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(den), &den, 1, AC_FUNC_ATTR_READNONE);
   return LLVMBuildFMul(ctx->builder, num, rcp, "");
}
662
663 /* See fast_idiv_by_const.h. */
664 /* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
/* See fast_idiv_by_const.h. */
/* Set: increment = util_fast_udiv_info::increment ? multiplier : 0; */
LLVMValueRef ac_build_fast_udiv(struct ac_llvm_context *ctx, LLVMValueRef num,
                                LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   /* ((num >> pre_shift) * multiplier + increment) computed in 64 bits,
    * then take the high dword and apply the final shift. */
   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildAdd(builder, num, LLVMBuildZExt(builder, increment, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}
679
680 /* See fast_idiv_by_const.h. */
681 /* If num != UINT_MAX, this more efficient version can be used. */
682 /* Set: increment = util_fast_udiv_info::increment; */
/* See fast_idiv_by_const.h. */
/* If num != UINT_MAX, this more efficient version can be used. */
/* Set: increment = util_fast_udiv_info::increment; */
LLVMValueRef ac_build_fast_udiv_nuw(struct ac_llvm_context *ctx, LLVMValueRef num,
                                    LLVMValueRef multiplier, LLVMValueRef pre_shift,
                                    LLVMValueRef post_shift, LLVMValueRef increment)
{
   LLVMBuilderRef builder = ctx->builder;

   /* The increment is added (no-unsigned-wrap) before the widening
    * multiply, then the high dword is taken and shifted. */
   num = LLVMBuildLShr(builder, num, pre_shift, "");
   num = LLVMBuildNUWAdd(builder, num, increment, "");
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}
697
698 /* See fast_idiv_by_const.h. */
699 /* Both operands must fit in 31 bits and the divisor must not be 1. */
/* See fast_idiv_by_const.h. */
/* Both operands must fit in 31 bits and the divisor must not be 1. */
LLVMValueRef ac_build_fast_udiv_u31_d_not_one(struct ac_llvm_context *ctx, LLVMValueRef num,
                                              LLVMValueRef multiplier, LLVMValueRef post_shift)
{
   LLVMBuilderRef builder = ctx->builder;

   /* High dword of (num * multiplier), shifted by post_shift — no
    * pre-shift or increment needed under the 31-bit precondition. */
   num = LLVMBuildMul(builder, LLVMBuildZExt(builder, num, ctx->i64, ""),
                      LLVMBuildZExt(builder, multiplier, ctx->i64, ""), "");
   num = LLVMBuildLShr(builder, num, LLVMConstInt(ctx->i64, 32, 0), "");
   num = LLVMBuildTrunc(builder, num, ctx->i32, "");
   return LLVMBuildLShr(builder, num, post_shift, "");
}
711
712 /* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
713 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
714 * already multiplied by two. id is the cube face number.
715 */
struct cube_selection_coords {
   LLVMValueRef stc[2]; /* sc and tc coordinates for the selected face */
   LLVMValueRef ma;     /* major axis, already multiplied by two (see above) */
   LLVMValueRef id;     /* cube face number */
};
721
/* Compute the cube-face selection coordinates of the vec3 \p in using the
 * llvm.amdgcn.cube{tc,sc,ma,id} intrinsics. */
static void build_cube_intrinsic(struct ac_llvm_context *ctx, LLVMValueRef in[3],
                                 struct cube_selection_coords *out)
{
   LLVMValueRef *const dst[4] = {&out->stc[1], &out->stc[0], &out->ma, &out->id};
   const char *const intr[4] = {"llvm.amdgcn.cubetc", "llvm.amdgcn.cubesc",
                                "llvm.amdgcn.cubema", "llvm.amdgcn.cubeid"};

   for (unsigned i = 0; i < 4; i++)
      *dst[i] = ac_build_intrinsic(ctx, intr[i], ctx->f32, in, 3, AC_FUNC_ATTR_READNONE);
}
732
733 /**
734 * Build a manual selection sequence for cube face sc/tc coordinates and
735 * major axis vector (multiplied by 2 for consistency) for the given
736 * vec3 \p coords, for the face implied by \p selcoords.
737 *
738 * For the major axis, we always adjust the sign to be in the direction of
739 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
740 * the selcoords major axis.
741 */
static void build_cube_select(struct ac_llvm_context *ctx,
                              const struct cube_selection_coords *selcoords,
                              const LLVMValueRef *coords, LLVMValueRef *out_st,
                              LLVMValueRef *out_ma)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
   LLVMValueRef is_ma_positive;
   LLVMValueRef sgn_ma;
   LLVMValueRef is_ma_z, is_not_ma_z;
   LLVMValueRef is_ma_y;
   LLVMValueRef is_ma_x;
   LLVMValueRef sgn;
   LLVMValueRef tmp;

   /* sgn_ma = +1.0 or -1.0 depending on the sign of the major axis. */
   is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->ma, LLVMConstReal(f32, 0.0), "");
   sgn_ma = LLVMBuildSelect(builder, is_ma_positive, LLVMConstReal(f32, 1.0),
                            LLVMConstReal(f32, -1.0), "");

   /* Decode the major axis from the face id: faces 4-5 are Z, 2-3 are Y,
    * 0-1 are X. */
   is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
   is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
   is_ma_y = LLVMBuildAnd(
      builder, is_not_ma_z,
      LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
   is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");

   /* Select sc */
   tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
   sgn = LLVMBuildSelect(
      builder, is_ma_y, LLVMConstReal(f32, 1.0),
      LLVMBuildSelect(builder, is_ma_z, sgn_ma, LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
   out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select tc */
   tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
   sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma, LLVMConstReal(f32, -1.0), "");
   out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");

   /* Select ma: |major-axis coordinate| * 2, per the function comment. */
   tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
                         LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
   tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
   *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
}
786
/* Convert cube map texture coordinates (and, optionally, derivatives) from
 * the 3D direction-vector form into the 2D face coordinate + face id form
 * expected by the image instructions.
 *
 * \param is_deriv    derivatives are supplied in derivs_arg and must be
 *                    converted to 2D derivatives
 * \param is_array    cube array: coords_arg[3] holds the array layer
 * \param is_lod      explicit-LOD operation; the early layer clamp is skipped
 * \param coords_arg  in/out: cube direction (+ layer) in, (s, t, id) out;
 *                    for arrays, id becomes layer * 8 + face
 * \param derivs_arg  in/out: cube derivatives in, 2D derivatives out
 *                    (only touched when is_deriv and derivs_arg != NULL)
 */
void ac_prepare_cube_coords(struct ac_llvm_context *ctx, bool is_deriv, bool is_array, bool is_lod,
                            LLVMValueRef *coords_arg, LLVMValueRef *derivs_arg)
{

   LLVMBuilderRef builder = ctx->builder;
   struct cube_selection_coords selcoords;
   LLVMValueRef coords[3];
   LLVMValueRef invma;

   if (is_array && !is_lod) {
      /* Round the layer component to the nearest integer before use. */
      LLVMValueRef tmp = ac_build_round(ctx, coords_arg[3]);

      /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
       *
       *    "For Array forms, the array layer used will be
       *
       *       max(0, min(d−1, floor(layer+0.5)))
       *
       *     where d is the depth of the texture array and layer
       *     comes from the component indicated in the tables below."
       *
       * This also acts as a workaround for an issue where the layer is
       * taken from a helper invocation which happens to fall on a
       * different layer due to extrapolation.
       *
       * GFX8 and earlier attempt to implement this in hardware by
       * clamping the value of coords[2] = (8 * layer) + face.
       * Unfortunately, this means that we end up with the wrong
       * face when clamping occurs.
       *
       * Clamp the layer earlier to work around the issue.
       */
      if (ctx->chip_class <= GFX8) {
         /* Only the lower bound (layer >= 0) is applied here; the upper
          * bound is still handled by the hardware clamp. */
         LLVMValueRef ge0;
         ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
         tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
      }

      coords_arg[3] = tmp;
   }

   build_cube_intrinsic(ctx, coords_arg, &selcoords);

   /* invma = 1 / |ma|, used to project both coordinates and derivatives
    * onto the selected face. */
   invma =
      ac_build_intrinsic(ctx, "llvm.fabs.f32", ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
   invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);

   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");

   coords[2] = selcoords.id;

   if (is_deriv && derivs_arg) {
      LLVMValueRef derivs[4];
      int axis;

      /* Convert cube derivatives to 2D derivatives. */
      for (axis = 0; axis < 2; axis++) {
         LLVMValueRef deriv_st[2];
         LLVMValueRef deriv_ma;

         /* Transform the derivative alongside the texture
          * coordinate. Mathematically, the correct formula is
          * as follows. Assume we're projecting onto the +Z face
          * and denote by dx/dh the derivative of the (original)
          * X texture coordinate with respect to horizontal
          * window coordinates. The projection onto the +Z face
          * plane is:
          *
          *   f(x,z) = x/z
          *
          * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
          *            = 1/z * dx/dh - x/z * 1/z * dz/dh.
          *
          * This motivates the implementation below.
          *
          * Whether this actually gives the expected results for
          * apps that might feed in derivatives obtained via
          * finite differences is anyone's guess. The OpenGL spec
          * seems awfully quiet about how textureGrad for cube
          * maps should be handled.
          */
         build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3], deriv_st, &deriv_ma);

         deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");

         for (int i = 0; i < 2; ++i)
            derivs[axis * 2 + i] =
               LLVMBuildFSub(builder, LLVMBuildFMul(builder, deriv_st[i], invma, ""),
                             LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
      }

      memcpy(derivs_arg, derivs, sizeof(derivs));
   }

   /* Shift the texture coordinate. This must be applied after the
    * derivative calculation.
    */
   for (int i = 0; i < 2; ++i)
      coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");

   if (is_array) {
      /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
      /* coords_arg.w component - array_index for cube arrays */
      coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
   }

   memcpy(coords_arg, coords, sizeof(coords));
}
895
/* Interpolate a fragment shader input at the barycentric coordinates (i, j)
 * using the two-step interp.p1 / interp.p2 intrinsic sequence.
 *
 * \param llvm_chan    component within the attribute
 * \param attr_number  attribute index
 * \param params       the parameter argument passed through to the intrinsics
 * \param i,j          barycentric coordinates
 * \return the interpolated f32 value
 */
LLVMValueRef ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                LLVMValueRef j)
{
   /* First interpolation step. */
   LLVMValueRef p1_args[4] = {i, llvm_chan, attr_number, params};
   LLVMValueRef p1 =
      ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1", ctx->f32, p1_args, 4, AC_FUNC_ATTR_READNONE);

   /* Second step consumes the result of the first. */
   LLVMValueRef p2_args[5] = {p1, j, llvm_chan, attr_number, params};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2", ctx->f32, p2_args, 5,
                             AC_FUNC_ATTR_READNONE);
}
919
/* 16-bit variant of ac_build_fs_interp: same two-step interpolation, but via
 * the interp.p1.f16 / interp.p2.f16 intrinsics (extra i1 argument is always
 * ctx->i1false here). Returns an f16 value.
 */
LLVMValueRef ac_build_fs_interp_f16(struct ac_llvm_context *ctx, LLVMValueRef llvm_chan,
                                    LLVMValueRef attr_number, LLVMValueRef params, LLVMValueRef i,
                                    LLVMValueRef j)
{
   /* First interpolation step (intermediate result is f32). */
   LLVMValueRef p1_args[5] = {i, llvm_chan, attr_number, ctx->i1false, params};
   LLVMValueRef p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", ctx->f32, p1_args, 5,
                                        AC_FUNC_ATTR_READNONE);

   /* Second step produces the final f16 value. */
   LLVMValueRef p2_args[6] = {p1, j, llvm_chan, attr_number, ctx->i1false, params};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", ctx->f16, p2_args, 6,
                             AC_FUNC_ATTR_READNONE);
}
946
/* Read a fragment shader input without barycentric interpolation via the
 * interp.mov intrinsic (e.g. for flat-shaded attributes).
 */
LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter,
                                    LLVMValueRef llvm_chan, LLVMValueRef attr_number,
                                    LLVMValueRef params)
{
   LLVMValueRef mov_args[4] = {parameter, llvm_chan, attr_number, params};

   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov", ctx->f32, mov_args, 4,
                             AC_FUNC_ATTR_READNONE);
}
961
/* Compute &base_ptr[index] with a single-index GEP. */
LLVMValueRef ac_build_gep_ptr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                              LLVMValueRef index)
{
   LLVMValueRef gep_index = index;

   return LLVMBuildGEP(ctx->builder, base_ptr, &gep_index, 1, "");
}
967
/* Compute &(*base_ptr)[index]: a GEP with a leading 0 index, for pointers to
 * arrays/aggregates.
 */
LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   LLVMValueRef gep_indices[] = {ctx->i32_0, index};

   return LLVMBuildGEP(ctx->builder, base_ptr, gep_indices, 2, "");
}
976
/* Advance a pointer by index elements, preserving the original pointer type
 * via an explicit pointer cast.
 */
LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMValueRef index)
{
   LLVMValueRef advanced = LLVMBuildGEP(ctx->builder, ptr, &index, 1, "");

   return LLVMBuildPointerCast(ctx->builder, advanced, LLVMTypeOf(ptr), "");
}
982
/* Store value to base_ptr[index] (GEP0 + store). */
void ac_build_indexed_store(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index,
                            LLVMValueRef value)
{
   LLVMValueRef slot = ac_build_gep0(ctx, base_ptr, index);

   LLVMBuildStore(ctx->builder, value, slot);
}
988
989 /**
990 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
991 * It's equivalent to doing a load from &base_ptr[index].
992 *
993 * \param base_ptr Where the array starts.
994 * \param index The element index into the array.
995 * \param uniform Whether the base_ptr and index can be assumed to be
996 * dynamically uniform (i.e. load to an SGPR)
997 * \param invariant Whether the load is invariant (no other opcodes affect it)
998 * \param no_unsigned_wraparound
999 * For all possible re-associations and re-distributions of an expression
1000 * "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
1001 * without inbounds in base_ptr), this parameter is true if "addr + offset"
1002 * does not result in an unsigned integer wraparound. This is used for
1003 * optimal code generation of 32-bit pointer arithmetic.
1004 *
1005 * For example, a 32-bit immediate offset that causes a 32-bit unsigned
1006 * integer wraparound can't be an imm offset in s_load_dword, because
1007 * the instruction performs "addr + offset" in 64 bits.
1008 *
1009 * Expected usage for bindless textures by chaining GEPs:
1010 * // possible unsigned wraparound, don't use InBounds:
1011 * ptr1 = LLVMBuildGEP(base_ptr, index);
1012 * image = load(ptr1); // becomes "s_load ptr1, 0"
1013 *
1014 * ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
1015 * sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
1016 */
static LLVMValueRef ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                         LLVMValueRef index, bool uniform, bool invariant,
                                         bool no_unsigned_wraparound)
{
   /* InBounds is only used for 32-bit const address space pointers when the
    * caller guarantees no unsigned wraparound (see comment above). */
   bool use_inbounds =
      no_unsigned_wraparound &&
      LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_ADDR_SPACE_CONST_32BIT;

   LLVMValueRef pointer = use_inbounds
                             ? LLVMBuildInBoundsGEP(ctx->builder, base_ptr, &index, 1, "")
                             : LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, "");

   if (uniform)
      LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);

   LLVMValueRef result = LLVMBuildLoad(ctx->builder, pointer, "");
   if (invariant)
      LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
   return result;
}
1036
/* Plain indexed load: not uniform, not invariant, no wraparound guarantee. */
LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, /*uniform=*/false, /*invariant=*/false,
                               /*no_unsigned_wraparound=*/false);
}
1041
/* Indexed load marked invariant (nothing else writes the loaded memory). */
LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                     LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, /*uniform=*/false, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/false);
}
1047
1048 /* This assumes that there is no unsigned integer wraparound during the address
1049 * computation, excluding all GEPs within base_ptr. */
/* Uniform + invariant load, additionally assuming no unsigned integer
 * wraparound during the address computation (excluding GEPs within
 * base_ptr). */
LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
                                   LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, /*uniform=*/true, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/true);
}
1055
1056 /* See ac_build_load_custom() documentation. */
/* Like ac_build_load_to_sgpr, but without the no-wraparound guarantee.
 * See ac_build_load_custom() documentation. */
LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
                                                   LLVMValueRef base_ptr, LLVMValueRef index)
{
   return ac_build_load_custom(ctx, base_ptr, index, /*uniform=*/true, /*invariant=*/true,
                               /*no_unsigned_wraparound=*/false);
}
1062
get_load_cache_policy(struct ac_llvm_context * ctx,unsigned cache_policy)1063 static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
1064 {
1065 return cache_policy | (ctx->chip_class >= GFX10 && cache_policy & ac_glc ? ac_dlc : 0);
1066 }
1067
/* Emit a raw/struct buffer store intrinsic. The intrinsic name is derived
 * from the indexing kind (raw vs struct), whether it is a format store, and
 * the type of data.
 *
 * \param vindex  only used when structurized (NULL -> 0)
 * \param voffset NULL -> 0
 * \param soffset NULL -> 0
 */
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef data, LLVMValueRef vindex,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         unsigned cache_policy, bool use_format, bool structurized)
{
   LLVMValueRef args[6];
   unsigned num_args = 0;

   args[num_args++] = data;
   args[num_args++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[num_args++] = vindex ? vindex : ctx->i32_0;
   args[num_args++] = voffset ? voffset : ctx->i32_0;
   args[num_args++] = soffset ? soffset : ctx->i32_0;
   args[num_args++] = LLVMConstInt(ctx->i32, cache_policy, 0);

   char type_name[8];
   ac_build_type_name_for_intr(LLVMTypeOf(data), type_name, sizeof(type_name));

   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256];

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.store.%s", indexing_kind, type_name);
   }

   ac_build_intrinsic(ctx, name, ctx->voidt, args, num_args, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
1096
/* Format (typed) buffer store using the structurized intrinsic; soffset is
 * always 0. */
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
                                  LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
{
   ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, /*soffset=*/NULL, cache_policy,
                                /*use_format=*/true, /*structurized=*/true);
}
1102
/* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
 * or v4i32 (num_channels=3,4).
 *
 * \param voffset      swizzled offset (see the ac_swizzled note below)
 * \param soffset      non-swizzled offset
 * \param inst_offset  constant offset added on top of voffset/soffset
 */
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 unsigned num_channels, LLVMValueRef voffset, LLVMValueRef soffset,
                                 unsigned inst_offset, unsigned cache_policy)
{
   /* Split 3 channel stores, because only LLVM 9+ support 3-channel
    * intrinsics. Recurse as a 2-channel store plus a 1-channel store at
    * inst_offset + 8 bytes. */
   if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false)) {
      LLVMValueRef v[3], v01;

      for (int i = 0; i < 3; i++) {
         v[i] = LLVMBuildExtractElement(ctx->builder, vdata, LLVMConstInt(ctx->i32, i, 0), "");
      }
      v01 = ac_build_gather_values(ctx, v, 2);

      ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset, soffset, inst_offset, cache_policy);
      ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset, soffset, inst_offset + 8,
                                  cache_policy);
      return;
   }

   /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
    * (voffset is swizzled, but soffset isn't swizzled).
    * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
    */
   if (!(cache_policy & ac_swizzled)) {
      /* Non-swizzled: fold inst_offset into soffset and use a plain
       * buffer store. */
      LLVMValueRef offset = soffset;

      if (inst_offset)
         offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, inst_offset, 0), "");

      ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), ctx->i32_0, voffset, offset,
                                   cache_policy, false, false);
      return;
   }

   /* Swizzled path: use a raw tbuffer store with an explicit data format so
    * voffset and soffset stay separate. */
   static const unsigned dfmts[] = {V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
                                    V_008F0C_BUF_DATA_FORMAT_32_32_32,
                                    V_008F0C_BUF_DATA_FORMAT_32_32_32_32};
   unsigned dfmt = dfmts[num_channels - 1];
   unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
   LLVMValueRef immoffset = LLVMConstInt(ctx->i32, inst_offset, 0);

   ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, immoffset, num_channels, dfmt,
                              nfmt, cache_policy);
}
1152
/* Emit a raw/struct buffer load intrinsic and return its result.
 *
 * \param vindex       only used when structurized (NULL -> 0)
 * \param voffset      NULL -> 0
 * \param soffset      NULL -> 0
 * \param channel_type element type of the result (e.g. f32, i16)
 */
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                                LLVMValueRef vindex, LLVMValueRef voffset,
                                                LLVMValueRef soffset, unsigned num_channels,
                                                LLVMTypeRef channel_type, unsigned cache_policy,
                                                bool can_speculate, bool use_format,
                                                bool structurized)
{
   LLVMValueRef args[5];
   unsigned num_args = 0;

   args[num_args++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[num_args++] = vindex ? vindex : ctx->i32_0;
   args[num_args++] = voffset ? voffset : ctx->i32_0;
   args[num_args++] = soffset ? soffset : ctx->i32_0;
   args[num_args++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);

   /* Without vec3 support, a 3-channel load is widened to 4 channels. */
   unsigned func = num_channels;
   if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, use_format))
      func = 4;

   /* D16 is only supported on gfx8+ */
   assert(!use_format || (channel_type != ctx->f16 && channel_type != ctx->i16) ||
          ctx->chip_class >= GFX8);

   LLVMTypeRef type = func > 1 ? LLVMVectorType(channel_type, func) : channel_type;

   char type_name[8];
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256];

   if (use_format) {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.format.%s", indexing_kind,
               type_name);
   } else {
      snprintf(name, sizeof(name), "llvm.amdgcn.%s.buffer.load.%s", indexing_kind, type_name);
   }

   return ac_build_intrinsic(ctx, name, type, args, num_args,
                             ac_get_load_intr_attribs(can_speculate));
}
1189
/* Load num_channels dwords from a buffer resource.
 *
 * Folds inst_offset, voffset and soffset into a single offset, then either
 * emits scalar (SMEM) loads when allow_smem permits it, or falls back to a
 * vector (VMEM) buffer load.
 *
 * \param num_channels 1..8 for the SMEM path (one s.buffer.load.f32 each)
 * \param vindex       must be NULL when the SMEM path can be taken
 * \param allow_smem   permit scalar loads (caller guarantees uniformity)
 */
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
                                  LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
                                  unsigned inst_offset, unsigned cache_policy, bool can_speculate,
                                  bool allow_smem)
{
   /* Combine all offset sources into one value. */
   LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
   if (voffset)
      offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
   if (soffset)
      offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");

   /* SMEM path: no SLC, and GLC only on GFX8+. */
   if (allow_smem && !(cache_policy & ac_slc) &&
       (!(cache_policy & ac_glc) || ctx->chip_class >= GFX8)) {
      assert(vindex == NULL);

      LLVMValueRef result[8];

      /* One scalar dword load per channel, advancing the offset by 4 bytes
       * each iteration. */
      for (int i = 0; i < num_channels; i++) {
         if (i) {
            offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, 4, 0), "");
         }
         LLVMValueRef args[3] = {
            rsrc,
            offset,
            LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
         };
         result[i] = ac_build_intrinsic(ctx, "llvm.amdgcn.s.buffer.load.f32", ctx->f32, args, 3,
                                        AC_FUNC_ATTR_READNONE);
      }
      if (num_channels == 1)
         return result[0];

      /* Pad 3-channel results to vec4 when vec3 isn't supported. */
      if (num_channels == 3 && !ac_has_vec3_support(ctx->chip_class, false))
         result[num_channels++] = LLVMGetUndef(ctx->f32);
      return ac_build_gather_values(ctx, result, num_channels);
   }

   /* VMEM fallback: a single raw buffer load. */
   return ac_build_buffer_load_common(ctx, rsrc, vindex, offset, ctx->i32_0, num_channels, ctx->f32,
                                      cache_policy, can_speculate, false, false);
}
1230
/* Format (typed) buffer load using the structurized intrinsic; soffset is
 * always 0. With d16, the result channels are f16 instead of f32. */
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef vindex, LLVMValueRef voffset,
                                         unsigned num_channels, unsigned cache_policy,
                                         bool can_speculate, bool d16)
{
   LLVMTypeRef channel_type = d16 ? ctx->f16 : ctx->f32;

   return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,
                                      channel_type, cache_policy, can_speculate,
                                      /*use_format=*/true, /*structurized=*/true);
}
1240
/* Common code for raw/struct tbuffer loads (typed buffer loads with an
 * explicit data/number format).
 *
 * NOTE(review): the LLVMBuildAdd below dereferences voffset unconditionally,
 * while the argument list later NULL-checks it — callers appear to always
 * pass a non-NULL voffset here; verify.
 */
static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, LLVMValueRef immoffset,
                                          unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                          unsigned cache_policy, bool can_speculate,
                                          bool structurized)
{
   /* Fold the immediate offset into voffset. */
   voffset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");

   LLVMValueRef args[6];
   int idx = 0;
   args[idx++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[idx++] = vindex ? vindex : ctx->i32_0;
   args[idx++] = voffset ? voffset : ctx->i32_0;
   args[idx++] = soffset ? soffset : ctx->i32_0;
   /* Combined data format + number format, encoded per chip generation. */
   args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
   args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
   /* Without vec3 support, a 3-channel load is widened to 4 channels. */
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
   const char *indexing_kind = structurized ? "struct" : "raw";
   char name[256], type_name[8];

   LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.load.%s", indexing_kind, type_name);

   return ac_build_intrinsic(ctx, name, type, args, idx, ac_get_load_intr_attribs(can_speculate));
}
1271
/* Structurized (vindex-based) tbuffer load. */
LLVMValueRef ac_build_struct_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                          LLVMValueRef vindex, LLVMValueRef voffset,
                                          LLVMValueRef soffset, LLVMValueRef immoffset,
                                          unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                          unsigned cache_policy, bool can_speculate)
{
   return ac_build_tbuffer_load(ctx, rsrc, vindex, voffset, soffset, immoffset, num_channels, dfmt,
                                nfmt, cache_policy, can_speculate, /*structurized=*/true);
}
1281
/* Raw (no vindex) tbuffer load. */
LLVMValueRef ac_build_raw_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                       LLVMValueRef voffset, LLVMValueRef soffset,
                                       LLVMValueRef immoffset, unsigned num_channels, unsigned dfmt,
                                       unsigned nfmt, unsigned cache_policy, bool can_speculate)
{
   return ac_build_tbuffer_load(ctx, rsrc, /*vindex=*/NULL, voffset, soffset, immoffset,
                                num_channels, dfmt, nfmt, cache_policy, can_speculate,
                                /*structurized=*/false);
}
1290
/* Load a single 16-bit value (result type i16) from a buffer. */
LLVMValueRef ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                         LLVMValueRef voffset, LLVMValueRef soffset,
                                         LLVMValueRef immoffset, unsigned cache_policy)
{
   if (LLVM_VERSION_MAJOR >= 9) {
      /* LLVM 9+ supports i8/i16 with struct/raw intrinsics, so fold the
       * immediate offset into voffset and load i16 directly. */
      LLVMValueRef offset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
      return ac_build_buffer_load_common(ctx, rsrc, NULL, offset, soffset, 1, ctx->i16,
                                         cache_policy, false, false, false);
   }

   /* Older LLVM: use a 16-bit tbuffer load (returns i32) and truncate. */
   LLVMValueRef loaded =
      ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, immoffset, 1,
                                V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_NUM_FORMAT_UINT,
                                cache_policy, false);

   return LLVMBuildTrunc(ctx->builder, loaded, ctx->i16, "");
}
1315
/* Load a single byte (result type i8) from a buffer. */
LLVMValueRef ac_build_tbuffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                        LLVMValueRef voffset, LLVMValueRef soffset,
                                        LLVMValueRef immoffset, unsigned cache_policy)
{
   if (LLVM_VERSION_MAJOR >= 9) {
      /* LLVM 9+ supports i8/i16 with struct/raw intrinsics, so fold the
       * immediate offset into voffset and load i8 directly. */
      LLVMValueRef offset = LLVMBuildAdd(ctx->builder, voffset, immoffset, "");
      return ac_build_buffer_load_common(ctx, rsrc, NULL, offset, soffset, 1, ctx->i8, cache_policy,
                                         false, false, false);
   }

   /* Older LLVM: use an 8-bit tbuffer load (returns i32) and truncate. */
   LLVMValueRef loaded =
      ac_build_raw_tbuffer_load(ctx, rsrc, voffset, soffset, immoffset, 1,
                                V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_NUM_FORMAT_UINT,
                                cache_policy, false);

   return LLVMBuildTrunc(ctx->builder, loaded, ctx->i8, "");
}
1340
1341 /**
1342 * Convert an 11- or 10-bit unsigned floating point number to an f32.
1343 *
1344 * The input exponent is expected to be biased analogous to IEEE-754, i.e. by
1345 * 2^(exp_bits-1) - 1 (as defined in OpenGL and other graphics APIs).
1346 */
ac_ufN_to_float(struct ac_llvm_context * ctx,LLVMValueRef src,unsigned exp_bits,unsigned mant_bits)1347 static LLVMValueRef ac_ufN_to_float(struct ac_llvm_context *ctx, LLVMValueRef src,
1348 unsigned exp_bits, unsigned mant_bits)
1349 {
1350 assert(LLVMTypeOf(src) == ctx->i32);
1351
1352 LLVMValueRef tmp;
1353 LLVMValueRef mantissa;
1354 mantissa =
1355 LLVMBuildAnd(ctx->builder, src, LLVMConstInt(ctx->i32, (1 << mant_bits) - 1, false), "");
1356
1357 /* Converting normal numbers is just a shift + correcting the exponent bias */
1358 unsigned normal_shift = 23 - mant_bits;
1359 unsigned bias_shift = 127 - ((1 << (exp_bits - 1)) - 1);
1360 LLVMValueRef shifted, normal;
1361
1362 shifted = LLVMBuildShl(ctx->builder, src, LLVMConstInt(ctx->i32, normal_shift, false), "");
1363 normal =
1364 LLVMBuildAdd(ctx->builder, shifted, LLVMConstInt(ctx->i32, bias_shift << 23, false), "");
1365
1366 /* Converting nan/inf numbers is the same, but with a different exponent update */
1367 LLVMValueRef naninf;
1368 naninf = LLVMBuildOr(ctx->builder, normal, LLVMConstInt(ctx->i32, 0xff << 23, false), "");
1369
1370 /* Converting denormals is the complex case: determine the leading zeros of the
1371 * mantissa to obtain the correct shift for the mantissa and exponent correction.
1372 */
1373 LLVMValueRef denormal;
1374 LLVMValueRef params[2] = {
1375 mantissa, ctx->i1true, /* result can be undef when arg is 0 */
1376 };
1377 LLVMValueRef ctlz =
1378 ac_build_intrinsic(ctx, "llvm.ctlz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);
1379
1380 /* Shift such that the leading 1 ends up as the LSB of the exponent field. */
1381 tmp = LLVMBuildSub(ctx->builder, ctlz, LLVMConstInt(ctx->i32, 8, false), "");
1382 denormal = LLVMBuildShl(ctx->builder, mantissa, tmp, "");
1383
1384 unsigned denormal_exp = bias_shift + (32 - mant_bits) - 1;
1385 tmp = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, denormal_exp, false), ctlz, "");
1386 tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(ctx->i32, 23, false), "");
1387 denormal = LLVMBuildAdd(ctx->builder, denormal, tmp, "");
1388
1389 /* Select the final result. */
1390 LLVMValueRef result;
1391
1392 tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src,
1393 LLVMConstInt(ctx->i32, ((1 << exp_bits) - 1) << mant_bits, false), "");
1394 result = LLVMBuildSelect(ctx->builder, tmp, naninf, normal, "");
1395
1396 tmp = LLVMBuildICmp(ctx->builder, LLVMIntUGE, src, LLVMConstInt(ctx->i32, 1 << mant_bits, false),
1397 "");
1398 result = LLVMBuildSelect(ctx->builder, tmp, result, denormal, "");
1399
1400 tmp = LLVMBuildICmp(ctx->builder, LLVMIntNE, src, ctx->i32_0, "");
1401 result = LLVMBuildSelect(ctx->builder, tmp, result, ctx->i32_0, "");
1402
1403 return ac_to_float(ctx, result);
1404 }
1405
1406 /**
1407 * Generate a fully general open coded buffer format fetch with all required
1408 * fixups suitable for vertex fetch, using non-format buffer loads.
1409 *
1410 * Some combinations of argument values have special interpretations:
1411 * - size = 8 bytes, format = fixed indicates PIPE_FORMAT_R11G11B10_FLOAT
1412 * - size = 8 bytes, format != {float,fixed} indicates a 2_10_10_10 data format
1413 *
1414 * \param log_size log(size of channel in bytes)
1415 * \param num_channels number of channels (1 to 4)
1416 * \param format AC_FETCH_FORMAT_xxx value
1417 * \param reverse whether XYZ channels are reversed
1418 * \param known_aligned whether the source is known to be aligned to hardware's
1419 * effective element size for loading the given format
1420 * (note: this means dword alignment for 8_8_8_8, 16_16, etc.)
1421 * \param rsrc buffer resource descriptor
1422 * \return the resulting vector of floats or integers bitcast to <4 x i32>
1423 */
ac_build_opencoded_load_format(struct ac_llvm_context * ctx,unsigned log_size,unsigned num_channels,unsigned format,bool reverse,bool known_aligned,LLVMValueRef rsrc,LLVMValueRef vindex,LLVMValueRef voffset,LLVMValueRef soffset,unsigned cache_policy,bool can_speculate)1424 LLVMValueRef ac_build_opencoded_load_format(struct ac_llvm_context *ctx, unsigned log_size,
1425 unsigned num_channels, unsigned format, bool reverse,
1426 bool known_aligned, LLVMValueRef rsrc,
1427 LLVMValueRef vindex, LLVMValueRef voffset,
1428 LLVMValueRef soffset, unsigned cache_policy,
1429 bool can_speculate)
1430 {
1431 LLVMValueRef tmp;
1432 unsigned load_log_size = log_size;
1433 unsigned load_num_channels = num_channels;
1434 if (log_size == 3) {
1435 load_log_size = 2;
1436 if (format == AC_FETCH_FORMAT_FLOAT) {
1437 load_num_channels = 2 * num_channels;
1438 } else {
1439 load_num_channels = 1; /* 10_11_11 or 2_10_10_10 */
1440 }
1441 }
1442
1443 int log_recombine = 0;
1444 if ((ctx->chip_class == GFX6 || ctx->chip_class >= GFX10) && !known_aligned) {
1445 /* Avoid alignment restrictions by loading one byte at a time. */
1446 load_num_channels <<= load_log_size;
1447 log_recombine = load_log_size;
1448 load_log_size = 0;
1449 } else if (load_num_channels == 2 || load_num_channels == 4) {
1450 log_recombine = -util_logbase2(load_num_channels);
1451 load_num_channels = 1;
1452 load_log_size += -log_recombine;
1453 }
1454
1455 assert(load_log_size >= 2 || LLVM_VERSION_MAJOR >= 9);
1456
1457 LLVMValueRef loads[32]; /* up to 32 bytes */
1458 for (unsigned i = 0; i < load_num_channels; ++i) {
1459 tmp =
1460 LLVMBuildAdd(ctx->builder, soffset, LLVMConstInt(ctx->i32, i << load_log_size, false), "");
1461 LLVMTypeRef channel_type =
1462 load_log_size == 0 ? ctx->i8 : load_log_size == 1 ? ctx->i16 : ctx->i32;
1463 unsigned num_channels = 1 << (MAX2(load_log_size, 2) - 2);
1464 loads[i] =
1465 ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, tmp, num_channels, channel_type,
1466 cache_policy, can_speculate, false, true);
1467 if (load_log_size >= 2)
1468 loads[i] = ac_to_integer(ctx, loads[i]);
1469 }
1470
1471 if (log_recombine > 0) {
1472 /* Recombine bytes if necessary (GFX6 only) */
1473 LLVMTypeRef dst_type = log_recombine == 2 ? ctx->i32 : ctx->i16;
1474
1475 for (unsigned src = 0, dst = 0; src < load_num_channels; ++dst) {
1476 LLVMValueRef accum = NULL;
1477 for (unsigned i = 0; i < (1 << log_recombine); ++i, ++src) {
1478 tmp = LLVMBuildZExt(ctx->builder, loads[src], dst_type, "");
1479 if (i == 0) {
1480 accum = tmp;
1481 } else {
1482 tmp = LLVMBuildShl(ctx->builder, tmp, LLVMConstInt(dst_type, 8 * i, false), "");
1483 accum = LLVMBuildOr(ctx->builder, accum, tmp, "");
1484 }
1485 }
1486 loads[dst] = accum;
1487 }
1488 } else if (log_recombine < 0) {
1489 /* Split vectors of dwords */
1490 if (load_log_size > 2) {
1491 assert(load_num_channels == 1);
1492 LLVMValueRef loaded = loads[0];
1493 unsigned log_split = load_log_size - 2;
1494 log_recombine += log_split;
1495 load_num_channels = 1 << log_split;
1496 load_log_size = 2;
1497 for (unsigned i = 0; i < load_num_channels; ++i) {
1498 tmp = LLVMConstInt(ctx->i32, i, false);
1499 loads[i] = LLVMBuildExtractElement(ctx->builder, loaded, tmp, "");
1500 }
1501 }
1502
1503 /* Further split dwords and shorts if required */
1504 if (log_recombine < 0) {
1505 for (unsigned src = load_num_channels, dst = load_num_channels << -log_recombine; src > 0;
1506 --src) {
1507 unsigned dst_bits = 1 << (3 + load_log_size + log_recombine);
1508 LLVMTypeRef dst_type = LLVMIntTypeInContext(ctx->context, dst_bits);
1509 LLVMValueRef loaded = loads[src - 1];
1510 LLVMTypeRef loaded_type = LLVMTypeOf(loaded);
1511 for (unsigned i = 1 << -log_recombine; i > 0; --i, --dst) {
1512 tmp = LLVMConstInt(loaded_type, dst_bits * (i - 1), false);
1513 tmp = LLVMBuildLShr(ctx->builder, loaded, tmp, "");
1514 loads[dst - 1] = LLVMBuildTrunc(ctx->builder, tmp, dst_type, "");
1515 }
1516 }
1517 }
1518 }
1519
1520 if (log_size == 3) {
1521 if (format == AC_FETCH_FORMAT_FLOAT) {
1522 for (unsigned i = 0; i < num_channels; ++i) {
1523 tmp = ac_build_gather_values(ctx, &loads[2 * i], 2);
1524 loads[i] = LLVMBuildBitCast(ctx->builder, tmp, ctx->f64, "");
1525 }
1526 } else if (format == AC_FETCH_FORMAT_FIXED) {
1527 /* 10_11_11_FLOAT */
1528 LLVMValueRef data = loads[0];
1529 LLVMValueRef i32_2047 = LLVMConstInt(ctx->i32, 2047, false);
1530 LLVMValueRef r = LLVMBuildAnd(ctx->builder, data, i32_2047, "");
1531 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 11, false), "");
1532 LLVMValueRef g = LLVMBuildAnd(ctx->builder, tmp, i32_2047, "");
1533 LLVMValueRef b = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 22, false), "");
1534
1535 loads[0] = ac_to_integer(ctx, ac_ufN_to_float(ctx, r, 5, 6));
1536 loads[1] = ac_to_integer(ctx, ac_ufN_to_float(ctx, g, 5, 6));
1537 loads[2] = ac_to_integer(ctx, ac_ufN_to_float(ctx, b, 5, 5));
1538
1539 num_channels = 3;
1540 log_size = 2;
1541 format = AC_FETCH_FORMAT_FLOAT;
1542 } else {
1543 /* 2_10_10_10 data formats */
1544 LLVMValueRef data = loads[0];
1545 LLVMTypeRef i10 = LLVMIntTypeInContext(ctx->context, 10);
1546 LLVMTypeRef i2 = LLVMIntTypeInContext(ctx->context, 2);
1547 loads[0] = LLVMBuildTrunc(ctx->builder, data, i10, "");
1548 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 10, false), "");
1549 loads[1] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1550 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 20, false), "");
1551 loads[2] = LLVMBuildTrunc(ctx->builder, tmp, i10, "");
1552 tmp = LLVMBuildLShr(ctx->builder, data, LLVMConstInt(ctx->i32, 30, false), "");
1553 loads[3] = LLVMBuildTrunc(ctx->builder, tmp, i2, "");
1554
1555 num_channels = 4;
1556 }
1557 }
1558
1559 if (format == AC_FETCH_FORMAT_FLOAT) {
1560 if (log_size != 2) {
1561 for (unsigned chan = 0; chan < num_channels; ++chan) {
1562 tmp = ac_to_float(ctx, loads[chan]);
1563 if (log_size == 3)
1564 tmp = LLVMBuildFPTrunc(ctx->builder, tmp, ctx->f32, "");
1565 else if (log_size == 1)
1566 tmp = LLVMBuildFPExt(ctx->builder, tmp, ctx->f32, "");
1567 loads[chan] = ac_to_integer(ctx, tmp);
1568 }
1569 }
1570 } else if (format == AC_FETCH_FORMAT_UINT) {
1571 if (log_size != 2) {
1572 for (unsigned chan = 0; chan < num_channels; ++chan)
1573 loads[chan] = LLVMBuildZExt(ctx->builder, loads[chan], ctx->i32, "");
1574 }
1575 } else if (format == AC_FETCH_FORMAT_SINT) {
1576 if (log_size != 2) {
1577 for (unsigned chan = 0; chan < num_channels; ++chan)
1578 loads[chan] = LLVMBuildSExt(ctx->builder, loads[chan], ctx->i32, "");
1579 }
1580 } else {
1581 bool unsign = format == AC_FETCH_FORMAT_UNORM || format == AC_FETCH_FORMAT_USCALED ||
1582 format == AC_FETCH_FORMAT_UINT;
1583
1584 for (unsigned chan = 0; chan < num_channels; ++chan) {
1585 if (unsign) {
1586 tmp = LLVMBuildUIToFP(ctx->builder, loads[chan], ctx->f32, "");
1587 } else {
1588 tmp = LLVMBuildSIToFP(ctx->builder, loads[chan], ctx->f32, "");
1589 }
1590
1591 LLVMValueRef scale = NULL;
1592 if (format == AC_FETCH_FORMAT_FIXED) {
1593 assert(log_size == 2);
1594 scale = LLVMConstReal(ctx->f32, 1.0 / 0x10000);
1595 } else if (format == AC_FETCH_FORMAT_UNORM) {
1596 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1597 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << bits) - 1));
1598 } else if (format == AC_FETCH_FORMAT_SNORM) {
1599 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(loads[chan]));
1600 scale = LLVMConstReal(ctx->f32, 1.0 / (((uint64_t)1 << (bits - 1)) - 1));
1601 }
1602 if (scale)
1603 tmp = LLVMBuildFMul(ctx->builder, tmp, scale, "");
1604
1605 if (format == AC_FETCH_FORMAT_SNORM) {
1606 /* Clamp to [-1, 1] */
1607 LLVMValueRef neg_one = LLVMConstReal(ctx->f32, -1.0);
1608 LLVMValueRef clamp = LLVMBuildFCmp(ctx->builder, LLVMRealULT, tmp, neg_one, "");
1609 tmp = LLVMBuildSelect(ctx->builder, clamp, neg_one, tmp, "");
1610 }
1611
1612 loads[chan] = ac_to_integer(ctx, tmp);
1613 }
1614 }
1615
1616 while (num_channels < 4) {
1617 if (format == AC_FETCH_FORMAT_UINT || format == AC_FETCH_FORMAT_SINT) {
1618 loads[num_channels] = num_channels == 3 ? ctx->i32_1 : ctx->i32_0;
1619 } else {
1620 loads[num_channels] = ac_to_integer(ctx, num_channels == 3 ? ctx->f32_1 : ctx->f32_0);
1621 }
1622 num_channels++;
1623 }
1624
1625 if (reverse) {
1626 tmp = loads[0];
1627 loads[0] = loads[2];
1628 loads[2] = tmp;
1629 }
1630
1631 return ac_build_gather_values(ctx, loads, 4);
1632 }
1633
/* Emit an llvm.amdgcn.{struct,raw}.tbuffer.store intrinsic storing "vdata"
 * through the buffer descriptor "rsrc" with the given typed-buffer format.
 *
 * structurized selects struct (per-element vindex) vs raw addressing.
 */
static void ac_build_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                   LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
                                   LLVMValueRef soffset, LLVMValueRef immoffset,
                                   unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                   unsigned cache_policy, bool structurized)
{
   /* Fold the immediate offset into the vector offset operand. */
   voffset = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0, immoffset, "");

   LLVMValueRef args[7];
   int num_args = 0;
   args[num_args++] = vdata;
   args[num_args++] = LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "");
   if (structurized)
      args[num_args++] = vindex ? vindex : ctx->i32_0;
   args[num_args++] = voffset ? voffset : ctx->i32_0;
   args[num_args++] = soffset ? soffset : ctx->i32_0;
   args[num_args++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx->chip_class, dfmt, nfmt), 0);
   args[num_args++] = LLVMConstInt(ctx->i32, cache_policy, 0);

   /* Chips without vec3 support round a 3-channel store up to vec4. */
   unsigned func =
      !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
   LLVMTypeRef type = func > 1 ? LLVMVectorType(ctx->i32, func) : ctx->i32;

   char name[256], type_name[8];
   ac_build_type_name_for_intr(type, type_name, sizeof(type_name));

   const char *indexing_kind = structurized ? "struct" : "raw";
   snprintf(name, sizeof(name), "llvm.amdgcn.%s.tbuffer.store.%s", indexing_kind, type_name);

   ac_build_intrinsic(ctx, name, ctx->voidt, args, num_args, AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY);
}
1664
/* Typed buffer store with struct (per-element vindex) addressing. */
void ac_build_struct_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                   LLVMValueRef vdata, LLVMValueRef vindex, LLVMValueRef voffset,
                                   LLVMValueRef soffset, LLVMValueRef immoffset,
                                   unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                   unsigned cache_policy)
{
   ac_build_tbuffer_store(ctx, rsrc, vdata, vindex, voffset, soffset, immoffset, num_channels,
                          dfmt, nfmt, cache_policy, true);
}
1674
/* Typed buffer store with raw (no vindex) addressing. */
void ac_build_raw_tbuffer_store(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                LLVMValueRef voffset, LLVMValueRef soffset, LLVMValueRef immoffset,
                                unsigned num_channels, unsigned dfmt, unsigned nfmt,
                                unsigned cache_policy)
{
   ac_build_tbuffer_store(ctx, rsrc, vdata, /*vindex=*/NULL, voffset, soffset, immoffset,
                          num_channels, dfmt, nfmt, cache_policy, false);
}
1683
/* Store a single 16-bit value to a buffer. */
void ac_build_tbuffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
                                  LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
                                  unsigned cache_policy)
{
   vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");

   if (LLVM_VERSION_MAJOR < 9) {
      /* Older LLVM: emulate with a 16-bit-format tbuffer store of a
       * zero-extended i32. */
      vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
      ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, ctx->i32_0, 1,
                                 V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_NUM_FORMAT_UINT,
                                 cache_policy);
      return;
   }

   /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
   ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,
                                false);
}
1704
/* Store a single 8-bit value to a buffer. */
void ac_build_tbuffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
                                 LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)
{
   vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");

   if (LLVM_VERSION_MAJOR < 9) {
      /* Older LLVM: emulate with an 8-bit-format tbuffer store of a
       * zero-extended i32. */
      vdata = LLVMBuildZExt(ctx->builder, vdata, ctx->i32, "");
      ac_build_raw_tbuffer_store(ctx, rsrc, vdata, voffset, soffset, ctx->i32_0, 1,
                                 V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_NUM_FORMAT_UINT,
                                 cache_policy);
      return;
   }

   /* LLVM 9+ supports i8/i16 with struct/raw intrinsics. */
   ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false,
                                false);
}
1724 /**
1725 * Set range metadata on an instruction. This can only be used on load and
1726 * call instructions. If you know an instruction can only produce the values
1727 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1728 * \p lo is the minimum value inclusive.
1729 * \p hi is the maximum value exclusive.
1730 */
static void set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
                               unsigned hi)
{
   /* Attach !range [lo, hi) metadata; valid only on loads and calls. */
   LLVMTypeRef type = LLVMTypeOf(value);
   LLVMContextRef context = LLVMGetTypeContext(type);

   LLVMValueRef md_args[] = {
      LLVMConstInt(type, lo, false),
      LLVMConstInt(type, hi, false),
   };
   LLVMValueRef range_md = LLVMMDNodeInContext(context, md_args, 2);
   LLVMSetMetadata(value, ctx->range_md_kind, range_md);
}
1743
ac_get_thread_id(struct ac_llvm_context * ctx)1744 LLVMValueRef ac_get_thread_id(struct ac_llvm_context *ctx)
1745 {
1746 LLVMValueRef tid;
1747
1748 LLVMValueRef tid_args[2];
1749 tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
1750 tid_args[1] = ctx->i32_0;
1751 tid_args[1] =
1752 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32, tid_args, 2, AC_FUNC_ATTR_READNONE);
1753
1754 if (ctx->wave_size == 32) {
1755 tid = tid_args[1];
1756 } else {
1757 tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32, tid_args, 2,
1758 AC_FUNC_ATTR_READNONE);
1759 }
1760 set_range_metadata(ctx, tid, 0, ctx->wave_size);
1761 return tid;
1762 }
1763
1764 /*
1765 * AMD GCN implements derivatives using the local data store (LDS)
1766 * All writes to the LDS happen in all executing threads at
1767 * the same time. TID is the Thread ID for the current
1768 * thread and is a value between 0 and 63, representing
1769 * the thread's position in the wavefront.
1770 *
1771 * For the pixel shader threads are grouped into quads of four pixels.
1772 * The TIDs of the pixels of a quad are:
1773 *
1774 * +------+------+
1775 * |4n + 0|4n + 1|
1776 * +------+------+
1777 * |4n + 2|4n + 3|
1778 * +------+------+
1779 *
1780 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1781 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1782 * the current pixel's column, and masking with 0xfffffffe yields the TID
1783 * of the left pixel of the current pixel's row.
1784 *
1785 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1786 * adding 2 yields the TID of the pixel below the top pixel.
1787 */
/* Compute a screen-space derivative of "val" using quad swizzles; see the
 * quad-layout comment above for the meaning of "mask" and "idx".
 */
LLVMValueRef ac_build_ddxy(struct ac_llvm_context *ctx, uint32_t mask, int idx, LLVMValueRef val)
{
   LLVMTypeRef result_type = ac_to_float_type(ctx, LLVMTypeOf(val));

   /* Widen 16-bit inputs to 32 bits for the quad swizzle. */
   if (result_type == ctx->f16)
      val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
   else if (result_type == ctx->v2f16)
      val = LLVMBuildBitCast(ctx->builder, val, ctx->i32, "");

   /* tl selects the quad's reference pixel for each lane; trbl selects the
    * pixel "idx" lanes to the right/below it. */
   unsigned tl_lanes[4], trbl_lanes[4];
   for (unsigned lane = 0; lane < 4; ++lane) {
      tl_lanes[lane] = lane & mask;
      trbl_lanes[lane] = (lane & mask) + idx;
   }

   LLVMValueRef tl =
      ac_build_quad_swizzle(ctx, val, tl_lanes[0], tl_lanes[1], tl_lanes[2], tl_lanes[3]);
   LLVMValueRef trbl =
      ac_build_quad_swizzle(ctx, val, trbl_lanes[0], trbl_lanes[1], trbl_lanes[2], trbl_lanes[3]);

   /* Undo the earlier widening before computing the difference. */
   if (result_type == ctx->f16) {
      tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
      trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
   }

   tl = LLVMBuildBitCast(ctx->builder, tl, result_type, "");
   trbl = LLVMBuildBitCast(ctx->builder, trbl, result_type, "");
   LLVMValueRef result = LLVMBuildFSub(ctx->builder, trbl, tl, "");

   /* Feed the difference through llvm.amdgcn.wqm. */
   char name[32], type[8];
   ac_build_type_name_for_intr(result_type, type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.wqm.%s", type);

   return ac_build_intrinsic(ctx, name, result_type, &result, 1, 0);
}
1826
/* Emit s_sendmsg with the given message immediate and wave id operand. */
void ac_build_sendmsg(struct ac_llvm_context *ctx, uint32_t msg, LLVMValueRef wave_id)
{
   LLVMValueRef args[2] = {
      LLVMConstInt(ctx->i32, msg, false),
      wave_id,
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
}
1834
/* Find the most significant "signed bit" of arg via llvm.amdgcn.sffbh.i32,
 * returned as an LSB-based index, or -1 for inputs 0 and -1.
 */
LLVMValueRef ac_build_imsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   LLVMValueRef msb =
      ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32", dst_type, &arg, 1, AC_FUNC_ATTR_READNONE);

   /* The HW returns the last bit index from MSB, but NIR/TGSI wants
    * the index from LSB. Invert it by doing "31 - msb". */
   msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false), msb, "");

   /* 0 and -1 contain no sign-change bit; map both to -1. */
   LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, ctx->i32_0, "");
   LLVMValueRef is_all_ones = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, all_ones, "");
   LLVMValueRef cond = LLVMBuildOr(ctx->builder, is_zero, is_all_ones, "");

   return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
}
1851
/* Find the most significant set bit of arg (8/16/32/64-bit) as an LSB-based
 * i32 index, or -1 when arg is zero.
 */
LLVMValueRef ac_build_umsb(struct ac_llvm_context *ctx, LLVMValueRef arg, LLVMTypeRef dst_type)
{
   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef zero;

   switch (bitsize) {
   case 64:
      intrin_name = "llvm.ctlz.i64";
      type = ctx->i64;
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.ctlz.i32";
      type = ctx->i32;
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.ctlz.i16";
      type = ctx->i16;
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.ctlz.i8";
      type = ctx->i8;
      zero = ctx->i8_0;
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   /* Second ctlz operand (true) allows an undefined result for zero input;
    * the final select masks that case out. */
   LLVMValueRef params[2] = {arg, ctx->i1true};
   LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);

   /* The HW returns the last bit index from MSB, but TGSI/NIR wants
    * the index from LSB. Invert it by doing "(bitsize - 1) - msb". */
   LLVMValueRef highest_bit = LLVMConstInt(type, bitsize - 1, false);
   msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");

   /* Normalize the result to i32. */
   if (bitsize == 64)
      msb = LLVMBuildTrunc(ctx->builder, msb, ctx->i32, "");
   else if (bitsize < 32)
      msb = LLVMBuildSExt(ctx->builder, msb, ctx->i32, "");

   /* Zero input: no bit is set, return -1. */
   LLVMValueRef is_zero = LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, "");
   return LLVMBuildSelect(ctx->builder, is_zero, LLVMConstInt(ctx->i32, -1, true), msb, "");
}
1912
/* Floating-point minimum via llvm.minnum overloaded on the operand type. */
LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   char type[64];
   ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));

   char name[64];
   snprintf(name, sizeof(name), "llvm.minnum.%s", type);

   LLVMValueRef args[] = {a, b};
   return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
}
1922
/* Floating-point maximum via llvm.maxnum overloaded on the operand type. */
LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   char type[64];
   ac_build_type_name_for_intr(LLVMTypeOf(a), type, sizeof(type));

   char name[64];
   snprintf(name, sizeof(name), "llvm.maxnum.%s", type);

   LLVMValueRef args[] = {a, b};
   return ac_build_intrinsic(ctx, name, LLVMTypeOf(a), args, 2, AC_FUNC_ATTR_READNONE);
}
1932
/* Signed integer minimum: select a when a <= b. */
LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_le_b = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_le_b, a, b, "");
}
1938
/* Signed integer maximum: select a when a > b. */
LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_gt_b = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_gt_b, a, b, "");
}
1944
/* Unsigned integer minimum: select a when a <= b. */
LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_le_b = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_le_b, a, b, "");
}
1950
/* Unsigned integer maximum: select a when a >= b. */
LLVMValueRef ac_build_umax(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef a_ge_b = LLVMBuildICmp(ctx->builder, LLVMIntUGE, a, b, "");
   return LLVMBuildSelect(ctx->builder, a_ge_b, a, b, "");
}
1956
/* Clamp a floating-point value to [0, 1]. */
LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
{
   LLVMTypeRef t = LLVMTypeOf(value);
   LLVMValueRef zero = LLVMConstReal(t, 0.0);
   LLVMValueRef one = LLVMConstReal(t, 1.0);
   return ac_build_fmin(ctx, ac_build_fmax(ctx, value, zero), one);
}
1963
ac_build_export(struct ac_llvm_context * ctx,struct ac_export_args * a)1964 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
1965 {
1966 LLVMValueRef args[9];
1967
1968 args[0] = LLVMConstInt(ctx->i32, a->target, 0);
1969 args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
1970
1971 if (a->compr) {
1972 args[2] = LLVMBuildBitCast(ctx->builder, a->out[0], ctx->v2i16, "");
1973 args[3] = LLVMBuildBitCast(ctx->builder, a->out[1], ctx->v2i16, "");
1974 args[4] = LLVMConstInt(ctx->i1, a->done, 0);
1975 args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1976
1977 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16", ctx->voidt, args, 6, 0);
1978 } else {
1979 args[2] = a->out[0];
1980 args[3] = a->out[1];
1981 args[4] = a->out[2];
1982 args[5] = a->out[3];
1983 args[6] = LLVMConstInt(ctx->i1, a->done, 0);
1984 args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
1985
1986 ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32", ctx->voidt, args, 8, 0);
1987 }
1988 }
1989
ac_build_export_null(struct ac_llvm_context * ctx)1990 void ac_build_export_null(struct ac_llvm_context *ctx)
1991 {
1992 struct ac_export_args args;
1993
1994 args.enabled_channels = 0x0; /* enabled channels */
1995 args.valid_mask = 1; /* whether the EXEC mask is valid */
1996 args.done = 1; /* DONE bit */
1997 args.target = V_008DFC_SQ_EXP_NULL;
1998 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
1999 args.out[0] = LLVMGetUndef(ctx->f32); /* R */
2000 args.out[1] = LLVMGetUndef(ctx->f32); /* G */
2001 args.out[2] = LLVMGetUndef(ctx->f32); /* B */
2002 args.out[3] = LLVMGetUndef(ctx->f32); /* A */
2003
2004 ac_build_export(ctx, &args);
2005 }
2006
ac_num_coords(enum ac_image_dim dim)2007 static unsigned ac_num_coords(enum ac_image_dim dim)
2008 {
2009 switch (dim) {
2010 case ac_image_1d:
2011 return 1;
2012 case ac_image_2d:
2013 case ac_image_1darray:
2014 return 2;
2015 case ac_image_3d:
2016 case ac_image_cube:
2017 case ac_image_2darray:
2018 case ac_image_2dmsaa:
2019 return 3;
2020 case ac_image_2darraymsaa:
2021 return 4;
2022 default:
2023 unreachable("ac_num_coords: bad dim");
2024 }
2025 }
2026
ac_num_derivs(enum ac_image_dim dim)2027 static unsigned ac_num_derivs(enum ac_image_dim dim)
2028 {
2029 switch (dim) {
2030 case ac_image_1d:
2031 case ac_image_1darray:
2032 return 2;
2033 case ac_image_2d:
2034 case ac_image_2darray:
2035 case ac_image_cube:
2036 return 4;
2037 case ac_image_3d:
2038 return 6;
2039 case ac_image_2dmsaa:
2040 case ac_image_2darraymsaa:
2041 default:
2042 unreachable("derivatives not supported");
2043 }
2044 }
2045
get_atomic_name(enum ac_atomic_op op)2046 static const char *get_atomic_name(enum ac_atomic_op op)
2047 {
2048 switch (op) {
2049 case ac_atomic_swap:
2050 return "swap";
2051 case ac_atomic_add:
2052 return "add";
2053 case ac_atomic_sub:
2054 return "sub";
2055 case ac_atomic_smin:
2056 return "smin";
2057 case ac_atomic_umin:
2058 return "umin";
2059 case ac_atomic_smax:
2060 return "smax";
2061 case ac_atomic_umax:
2062 return "umax";
2063 case ac_atomic_and:
2064 return "and";
2065 case ac_atomic_or:
2066 return "or";
2067 case ac_atomic_xor:
2068 return "xor";
2069 case ac_atomic_inc_wrap:
2070 return "inc";
2071 case ac_atomic_dec_wrap:
2072 return "dec";
2073 }
2074 unreachable("bad atomic op");
2075 }
2076
/* Build one of the llvm.amdgcn.image.* intrinsics (sample, gather4, load,
 * store, atomic, getlod, getresinfo) from the description in "a".
 *
 * The intrinsic name is assembled from the opcode, its modifiers
 * (compare / bias / lod / derivatives / offset), the dimension and the
 * type overloads; the argument list is built in the operand order the
 * intrinsics expect, so the order of the blocks below matters.
 */
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_args *a)
{
   const char *overload[3] = {"", "", ""};
   unsigned num_overloads = 0;
   LLVMValueRef args[18];
   unsigned num_args = 0;
   enum ac_image_dim dim = a->dim;

   /* Validate opcode-dependent and mutually-exclusive fields. */
   assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 || !a->level_zero);
   assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
           a->opcode != ac_image_store_mip) ||
          a->lod);
   assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
          (!a->compare && !a->offset));
   assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
           a->opcode == ac_image_get_lod) ||
          !a->bias);
   /* At most one LOD-selection mechanism may be used. */
   assert((a->bias ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) + (a->derivs[0] ? 1 : 0) <=
          1);
   assert((a->min_lod ? 1 : 0) + (a->lod ? 1 : 0) + (a->level_zero ? 1 : 0) <= 1);
   assert(!a->d16 || (ctx->chip_class >= GFX8 && a->opcode != ac_image_atomic &&
                      a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod &&
                      a->opcode != ac_image_get_resinfo));

   /* For getlod, map array/cube dims to the corresponding base dims used in
    * the intrinsic name. */
   if (a->opcode == ac_image_get_lod) {
      switch (dim) {
      case ac_image_1darray:
         dim = ac_image_1d;
         break;
      case ac_image_2darray:
      case ac_image_cube:
         dim = ac_image_2d;
         break;
      default:
         break;
      }
   }

   bool sample = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
                 a->opcode == ac_image_get_lod;
   bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap;
   bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
               a->opcode == ac_image_load || a->opcode == ac_image_load_mip;
   /* Sampling ops take float coordinates; the rest take integers. */
   LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
   uint8_t dmask = a->dmask;
   LLVMTypeRef data_type;
   char data_type_str[8];

   if (atomic) {
      data_type = LLVMTypeOf(a->data[0]);
   } else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      /* Image stores might have been shrinked using the format. */
      data_type = LLVMTypeOf(a->data[0]);
      dmask = (1 << ac_get_llvm_num_components(a->data[0])) - 1;
   } else {
      data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
   }

   /* Data operands come first: store value, or atomic src (and cmp). */
   if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
      args[num_args++] = a->data[0];
      if (a->opcode == ac_image_atomic_cmpswap)
         args[num_args++] = a->data[1];
   }

   /* Atomics have no dmask operand. */
   if (!atomic)
      args[num_args++] = LLVMConstInt(ctx->i32, dmask, false);

   /* Optional modifier operands, in intrinsic operand order; float-typed
    * modifiers add an extra ".f32" name overload. */
   if (a->offset)
      args[num_args++] = ac_to_integer(ctx, a->offset);
   if (a->bias) {
      args[num_args++] = ac_to_float(ctx, a->bias);
      overload[num_overloads++] = ".f32";
   }
   if (a->compare)
      args[num_args++] = ac_to_float(ctx, a->compare);
   if (a->derivs[0]) {
      unsigned count = ac_num_derivs(dim);
      for (unsigned i = 0; i < count; ++i)
         args[num_args++] = ac_to_float(ctx, a->derivs[i]);
      overload[num_overloads++] = ".f32";
   }
   /* Coordinates (getresinfo takes none), then lod / min_lod. */
   unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
   for (unsigned i = 0; i < num_coords; ++i)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
   if (a->lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
   if (a->min_lod)
      args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, "");

   /* Overload for the coordinate type. */
   overload[num_overloads++] = sample ? ".f32" : ".i32";

   /* Descriptor operands: resource, and for sampling ops the sampler plus
    * the unorm flag. */
   args[num_args++] = a->resource;
   if (sample) {
      args[num_args++] = a->sampler;
      args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
   }

   args[num_args++] = ctx->i32_0; /* texfailctrl */
   args[num_args++] = LLVMConstInt(
      ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);

   /* Base intrinsic name (and atomic sub-operation suffix). */
   const char *name;
   const char *atomic_subop = "";
   switch (a->opcode) {
   case ac_image_sample:
      name = "sample";
      break;
   case ac_image_gather4:
      name = "gather4";
      break;
   case ac_image_load:
      name = "load";
      break;
   case ac_image_load_mip:
      name = "load.mip";
      break;
   case ac_image_store:
      name = "store";
      break;
   case ac_image_store_mip:
      name = "store.mip";
      break;
   case ac_image_atomic:
      name = "atomic.";
      atomic_subop = get_atomic_name(a->atomic);
      break;
   case ac_image_atomic_cmpswap:
      name = "atomic.";
      atomic_subop = "cmpswap";
      break;
   case ac_image_get_lod:
      name = "getlod";
      break;
   case ac_image_get_resinfo:
      name = "getresinfo";
      break;
   default:
      unreachable("invalid image opcode");
   }

   /* Dimension suffix used in the intrinsic name. */
   const char *dimname;
   switch (dim) {
   case ac_image_1d:
      dimname = "1d";
      break;
   case ac_image_2d:
      dimname = "2d";
      break;
   case ac_image_3d:
      dimname = "3d";
      break;
   case ac_image_cube:
      dimname = "cube";
      break;
   case ac_image_1darray:
      dimname = "1darray";
      break;
   case ac_image_2darray:
      dimname = "2darray";
      break;
   case ac_image_2dmsaa:
      dimname = "2dmsaa";
      break;
   case ac_image_2darraymsaa:
      dimname = "2darraymsaa";
      break;
   default:
      unreachable("invalid dim");
   }

   ac_build_type_name_for_intr(data_type, data_type_str, sizeof(data_type_str));

   /* ".l" only applies to sample/gather; other opcodes encode the LOD as an
    * operand (e.g. load.mip). */
   bool lod_suffix = a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
   char intr_name[96];
   snprintf(intr_name, sizeof(intr_name),
            "llvm.amdgcn.image.%s%s" /* base name */
            "%s%s%s%s" /* sample/gather modifiers */
            ".%s.%s%s%s%s", /* dimension and type overloads */
            name, atomic_subop, a->compare ? ".c" : "",
            a->bias ? ".b" : lod_suffix ? ".l" : a->derivs[0] ? ".d" : a->level_zero ? ".lz" : "",
            a->min_lod ? ".cl" : "", a->offset ? ".o" : "", dimname,
            data_type_str, overload[0], overload[1], overload[2]);

   /* Return type: the data type for atomics, void for stores, v4f16/v4f32
    * otherwise. */
   LLVMTypeRef retty;
   if (atomic)
      retty = data_type;
   else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
      retty = ctx->voidt;
   else
      retty = a->d16 ? ctx->v4f16 : ctx->v4f32;

   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
   if (!sample && !atomic && retty != ctx->voidt)
      result = ac_to_integer(ctx, result);

   return result;
}
2274
LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMValueRef rsrc)
{
   /* Read the samples from the descriptor directly.
    * Hardware doesn't have any instruction for this.
    * Bits [19:16] of dword 3 hold log2(samples); the result is 1 << log2. */
   LLVMValueRef word3 = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 3, 0), "");
   LLVMValueRef log_samples = LLVMBuildLShr(ctx->builder, word3, LLVMConstInt(ctx->i32, 16, 0), "");
   log_samples = LLVMBuildAnd(ctx->builder, log_samples, LLVMConstInt(ctx->i32, 0xf, 0), "");
   return LLVMBuildShl(ctx->builder, ctx->i32_1, log_samples, "");
}
2288
/* Pack two f32 values into a <2 x half> with round-toward-zero. */
LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   const char *name = "llvm.amdgcn.cvt.pkrtz";
   return ac_build_intrinsic(ctx, name, ctx->v2f16, args, 2, AC_FUNC_ATTR_READNONE);
}
2294
/* Pack two floats as signed-normalized 16-bit values, returned as i32. */
LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16", ctx->v2i16, args, 2,
                                            AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
2301
/* Pack two floats as unsigned-normalized 16-bit values, returned as i32. */
LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef packed = ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16", ctx->v2i16, args, 2,
                                            AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, packed, ctx->i32, "");
}
2308
/* v_cvt_pknorm_i16_f16 has no LLVM intrinsic; emit it via inline asm. */
LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
   LLVMValueRef code =
      LLVMConstInlineAsm(calltype, "v_cvt_pknorm_i16_f16 $0, $1, $2", "=v,v,v", false, false);
   return LLVMBuildCall(ctx->builder, code, args, 2, "");
}
2319
/* v_cvt_pknorm_u16_f16 has no LLVM intrinsic; emit it via inline asm. */
LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
                                         LLVMValueRef args[2])
{
   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false);
   LLVMValueRef code =
      LLVMConstInlineAsm(calltype, "v_cvt_pknorm_u16_f16 $0, $1, $2", "=v,v,v", false, false);
   return LLVMBuildCall(ctx->builder, code, args, 2, "");
}
2330
/* Pack two i32 values into signed 16-bit halves of an i32 (v_cvt_pk_i16).
 * The 8-bit and 10-bit clamping is for HW workarounds.
 *
 * bits: effective per-channel width (8, 10, or 16).
 * hi:   true when args[1] is the alpha channel, which uses a 2-bit range
 *       for the 10-10-10-2 format.
 */
LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
                                 bool hi)
{
   assert(bits == 8 || bits == 10 || bits == 16);

   /* Signed min/max for the RGB channels, and the narrower alpha range
    * (only 2 bits of alpha in the 10-bit case). */
   LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
   LLVMValueRef min_rgb = LLVMConstInt(ctx->i32, bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
   LLVMValueRef max_alpha = bits != 10 ? max_rgb : ctx->i32_1;
   LLVMValueRef min_alpha = bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);

   /* Clamp. */
   if (bits != 16) {
      for (int i = 0; i < 2; i++) {
         bool alpha = hi && i == 1;
         args[i] = ac_build_imin(ctx, args[i], alpha ? max_alpha : max_rgb);
         args[i] = ac_build_imax(ctx, args[i], alpha ? min_alpha : min_rgb);
      }
   }

   LLVMValueRef res =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
2355
/* Pack two u32 values into unsigned 16-bit halves of an i32 (v_cvt_pk_u16).
 * The 8-bit and 10-bit clamping is for HW workarounds.
 *
 * bits: effective per-channel width (8, 10, or 16).
 * hi:   true when args[1] is the alpha channel (2-bit range for 10-bit).
 */
LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
                                 bool hi)
{
   assert(bits == 8 || bits == 10 || bits == 16);

   LLVMValueRef max_rgb = LLVMConstInt(ctx->i32, bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
   LLVMValueRef max_alpha = bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);

   /* Clamp. Only an upper bound is needed for unsigned values. */
   if (bits != 16) {
      for (int i = 0; i < 2; i++) {
         bool alpha = hi && i == 1;
         args[i] = ac_build_umin(ctx, args[i], alpha ? max_alpha : max_rgb);
      }
   }

   LLVMValueRef res =
      ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16", ctx->v2i16, args, 2, AC_FUNC_ATTR_READNONE);
   return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
}
2377
/* Wrap the i1 value in llvm.amdgcn.wqm.vote (whole-quad-mode vote across
 * the four pixels of a quad). */
LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1, &i1, 1, AC_FUNC_ATTR_READNONE);
}
2382
/* Kill (discard) the invocation when the i1 condition is false, via
 * llvm.amdgcn.kill. */
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
{
   ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt, &i1, 1, 0);
}
2387
/* Bitfield extract: take "width" bits of "input" starting at bit "offset",
 * sign-extended (v_bfe_i32) or zero-extended (v_bfe_u32).
 */
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input, LLVMValueRef offset,
                          LLVMValueRef width, bool is_signed)
{
   const char *intr_name = is_signed ? "llvm.amdgcn.sbfe.i32" : "llvm.amdgcn.ubfe.i32";
   LLVMValueRef params[3] = {input, offset, width};

   return ac_build_intrinsic(ctx, intr_name, ctx->i32, params, 3, AC_FUNC_ATTR_READNONE);
}
2400
/* Integer multiply-add: s0 * s1 + s2. */
LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   LLVMValueRef product = LLVMBuildMul(ctx->builder, s0, s1, "");

   return LLVMBuildAdd(ctx->builder, product, s2, "");
}
2406
/* Float multiply-add: s0 * s1 + s2. */
LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0, LLVMValueRef s1,
                           LLVMValueRef s2)
{
   /* FMA is better on GFX10, because it has FMA units instead of MUL-ADD units. */
   if (ctx->chip_class >= GFX10) {
      LLVMValueRef params[3] = {s0, s1, s2};

      return ac_build_intrinsic(ctx, "llvm.fma.f32", ctx->f32, params, 3,
                                AC_FUNC_ATTR_READNONE);
   }

   LLVMValueRef product = LLVMBuildFMul(ctx->builder, s0, s1, "");
   return LLVMBuildFAdd(ctx->builder, product, s2, "");
}
2418
/* Emit a wait for outstanding memory operations.
 *
 * wait_flags is a mask of AC_WAIT_LGKM, AC_WAIT_VLOAD and AC_WAIT_VSTORE.
 * Depending on the flags and chip, this emits either s_waitcnt with a
 * packed immediate, or an LLVM release fence.
 */
void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
{
   if (!wait_flags)
      return;

   /* Counter semantics: "wait until the counter is <= N". The maxima below
    * mean "do not wait on this counter". vmcnt is 6 bits on GFX9+ and 4 bits
    * on older chips. */
   unsigned lgkmcnt = 63;
   unsigned vmcnt = ctx->chip_class >= GFX9 ? 63 : 15;
   unsigned vscnt = 63;

   if (wait_flags & AC_WAIT_LGKM)
      lgkmcnt = 0;
   if (wait_flags & AC_WAIT_VLOAD)
      vmcnt = 0;

   if (wait_flags & AC_WAIT_VSTORE) {
      /* GFX10 tracks VMEM stores in a separate vscnt counter; older chips
       * count them in vmcnt. */
      if (ctx->chip_class >= GFX10)
         vscnt = 0;
      else
         vmcnt = 0;
   }

   /* There is no intrinsic for vscnt(0), so use a fence. */
   if ((wait_flags & AC_WAIT_LGKM && wait_flags & AC_WAIT_VLOAD && wait_flags & AC_WAIT_VSTORE) ||
       vscnt == 0) {
      LLVMBuildFence(ctx->builder, LLVMAtomicOrderingRelease, false, "");
      return;
   }

   /* Pack the s_waitcnt immediate: lgkmcnt in bits 8..13, expcnt (7 = don't
    * wait) in bits 4..6, vmcnt low nibble in bits 0..3 and its high bits
    * starting at bit 14. */
   unsigned simm16 = (lgkmcnt << 8) | (7 << 4) | /* expcnt */
                     (vmcnt & 0xf) | ((vmcnt >> 4) << 14);

   LLVMValueRef args[1] = {
      LLVMConstInt(ctx->i32, simm16, false),
   };
   ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
}
2455
/* Floating-point saturate: clamp "src" (of float type "type") to [0, 1].
 *
 * Fixes over the previous version: the intrinsic-name pointer is
 * const-qualified (it points at string literals), and the inner LLVMTypeRef
 * no longer shadows the "type" parameter.
 */
LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
                           LLVMTypeRef type)
{
   unsigned bitsize = ac_get_elem_bits(ctx, type);
   LLVMValueRef zero = LLVMConstReal(type, 0.0);
   LLVMValueRef one = LLVMConstReal(type, 1.0);
   LLVMValueRef result;

   if (bitsize == 64 || (bitsize == 16 && ctx->chip_class <= GFX8)) {
      /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
       * doesn't expose an intrinsic.
       */
      result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
   } else {
      /* med3(0, 1, src) clamps to [0, 1] in a single instruction. */
      LLVMTypeRef intr_type;
      const char *intr;

      if (bitsize == 16) {
         intr = "llvm.amdgcn.fmed3.f16";
         intr_type = ctx->f16;
      } else {
         assert(bitsize == 32);
         intr = "llvm.amdgcn.fmed3.f32";
         intr_type = ctx->f32;
      }

      LLVMValueRef params[] = {
         zero,
         one,
         src,
      };

      result = ac_build_intrinsic(ctx, intr, intr_type, params, 3,
                                  AC_FUNC_ATTR_READNONE);
   }

   if (ctx->chip_class < GFX9 && bitsize == 32) {
      /* Only pre-GFX9 chips do not flush denorms. */
      result = ac_build_canonicalize(ctx, result, bitsize);
   }

   return result;
}
2499
/* Fractional part of src0 via llvm.amdgcn.fract (v_fract).
 *
 * bitsize selects the f16/f32/f64 variant. Fix over the previous version:
 * the intrinsic-name pointer is const-qualified since it points at string
 * literals (assigning a literal to plain "char *" invites UB on write).
 */
LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   LLVMTypeRef type;
   const char *intr;

   if (bitsize == 16) {
      intr = "llvm.amdgcn.fract.f16";
      type = ctx->f16;
   } else if (bitsize == 32) {
      intr = "llvm.amdgcn.fract.f32";
      type = ctx->f32;
   } else {
      intr = "llvm.amdgcn.fract.f64";
      type = ctx->f64;
   }

   LLVMValueRef params[] = {
      src0,
   };
   return ac_build_intrinsic(ctx, intr, type, params, 1, AC_FUNC_ATTR_READNONE);
}
2521
ac_const_uint_vec(struct ac_llvm_context * ctx,LLVMTypeRef type,uint64_t value)2522 LLVMValueRef ac_const_uint_vec(struct ac_llvm_context *ctx, LLVMTypeRef type, uint64_t value)
2523 {
2524
2525 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
2526 LLVMValueRef scalar = LLVMConstInt(LLVMGetElementType(type), value, 0);
2527 unsigned vec_size = LLVMGetVectorSize(type);
2528 LLVMValueRef *scalars = alloca(vec_size * sizeof(LLVMValueRef));
2529
2530 for (unsigned i = 0; i < vec_size; i++)
2531 scalars[i] = scalar;
2532 return LLVMConstVector(scalars, vec_size);
2533 }
2534 return LLVMConstInt(type, value, 0);
2535 }
2536
/* Integer sign: -1, 0 or 1 with the sign of src0 (works on scalars and
 * vectors). Implemented as clamp(src0, -1, 1), which maps to v_med3_i32. */
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMTypeRef type = LLVMTypeOf(src0);
   LLVMValueRef val;

   /* v_med3 is selected only when max is first. (LLVM bug?) */
   val = ac_build_imax(ctx, src0, ac_const_uint_vec(ctx, type, -1));
   return ac_build_imin(ctx, val, ac_const_uint_vec(ctx, type, 1));
}
2546
/* Turn -0.0 into +0.0; every other value passes through unchanged.
 * Signed-zero semantics are enabled around the add so LLVM does not fold
 * the "+ 0" away. */
static LLVMValueRef ac_eliminate_negative_zero(struct ac_llvm_context *ctx, LLVMValueRef val)
{
   ac_enable_signed_zeros(ctx);
   /* (val + 0) converts negative zero to positive zero. */
   val = LLVMBuildFAdd(ctx->builder, val, LLVMConstNull(LLVMTypeOf(val)), "");
   ac_disable_signed_zeros(ctx);
   return val;
}
2555
/* Floating-point sign: -1.0, 0.0 or 1.0 with the sign of "src".
 * f16/f32 use the cheaper integer-sign path; f64 builds the result's bit
 * pattern from two i32 halves. */
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef type = LLVMTypeOf(src);
   LLVMValueRef pos, neg, dw[2], val;
   unsigned bitsize = ac_get_elem_bits(ctx, type);

   /* The standard version leads to this:
    *   v_cmp_ngt_f32_e64 s[0:1], s4, 0                       ; D40B0000 00010004
    *   v_cndmask_b32_e64 v4, 1.0, s4, s[0:1]                 ; D5010004 000008F2
    *   v_cmp_le_f32_e32 vcc, 0, v4                           ; 7C060880
    *   v_cndmask_b32_e32 v4, -1.0, v4, vcc                   ; 020808F3
    *
    * The isign version:
    *   v_add_f32_e64 v4, s4, 0                               ; D5030004 00010004
    *   v_med3_i32 v4, v4, -1, 1                              ; D5580004 02058304
    *   v_cvt_f32_i32_e32 v4, v4                              ; 7E080B04
    *
    * (src0 + 0) converts negative zero to positive zero.
    * After that, int(fsign(x)) == isign(floatBitsToInt(x)).
    *
    * For FP64, use the standard version, which doesn't suffer from the huge DP rate
    * reduction. (FP64 comparisons are as fast as int64 comparisons)
    */
   if (bitsize == 16 || bitsize == 32) {
      val = ac_to_integer(ctx, ac_eliminate_negative_zero(ctx, src));
      val = ac_build_isign(ctx, val);
      return LLVMBuildSIToFP(ctx->builder, val, type, "");
   }

   assert(bitsize == 64);
   pos = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src, ctx->f64_0, "");
   neg = LLVMBuildFCmp(ctx->builder, LLVMRealOLT, src, ctx->f64_0, "");
   /* Assemble +1.0 / -1.0 / 0.0 from dwords: the low dword is always 0,
    * the high dword is the f64 exponent+sign pattern (0x3FF00000 == 1.0,
    * 0xBFF00000 == -1.0). */
   dw[0] = ctx->i32_0;
   dw[1] = LLVMBuildSelect(
      ctx->builder, pos, LLVMConstInt(ctx->i32, 0x3FF00000, 0),
      LLVMBuildSelect(ctx->builder, neg, LLVMConstInt(ctx->i32, 0xBFF00000, 0), ctx->i32_0, ""),
      "");
   return LLVMBuildBitCast(ctx->builder, ac_build_gather_values(ctx, dw, 2), ctx->f64, "");
}
2595
/* Population count of src0 (8/16/32/64/128-bit), always returned as i32. */
LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMValueRef result;
   unsigned bitsize;

   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));

   /* llvm.ctpop returns the same width as its input, so widths > 32 are
    * truncated and widths < 32 are zero-extended to i32. */
   switch (bitsize) {
   case 128:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i128", ctx->i128, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
      break;
   case 64:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);

      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
      break;
   case 32:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      break;
   case 16:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);

      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
      break;
   case 8:
      result = ac_build_intrinsic(ctx, "llvm.ctpop.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);

      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   return result;
}
2638
/* Reverse the bits of src0 (8/16/32/64-bit), always returned as i32. */
LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0)
{
   LLVMValueRef result;
   unsigned bitsize;

   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));

   /* llvm.bitreverse returns the input width; 64-bit is truncated and
    * 8/16-bit are zero-extended to i32. */
   switch (bitsize) {
   case 64:
      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i64", ctx->i64, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);

      result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
      break;
   case 32:
      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", ctx->i32, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);
      break;
   case 16:
      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", ctx->i16, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);

      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
      break;
   case 8:
      result = ac_build_intrinsic(ctx, "llvm.bitreverse.i8", ctx->i8, (LLVMValueRef[]){src0}, 1,
                                  AC_FUNC_ATTR_READNONE);

      result = LLVMBuildZExt(ctx->builder, result, ctx->i32, "");
      break;
   default:
      unreachable(!"invalid bitsize");
      break;
   }

   return result;
}
2676
/* Operand indices of export intrinsic call instructions, used by the
 * PARAM-export optimization below. */
#define AC_EXP_TARGET 0
#define AC_EXP_ENABLED_CHANNELS 1
#define AC_EXP_OUT0 2

/* Classification of one exported channel's value. */
enum ac_ir_type
{
   AC_IR_UNDEF,
   AC_IR_CONST,
   AC_IR_VALUE,
};

/* One channel (x/y/z/w) of a parsed export instruction. */
struct ac_vs_exp_chan {
   LLVMValueRef value;
   float const_float; /* only meaningful when type == AC_IR_CONST */
   enum ac_ir_type type;
};

/* A parsed PARAM export instruction. */
struct ac_vs_exp_inst {
   unsigned offset; /* PARAM index, i.e. target - V_008DFC_SQ_EXP_PARAM */
   LLVMValueRef inst;
   struct ac_vs_exp_chan chan[4];
};

/* All PARAM exports collected while scanning the shader. */
struct ac_vs_exports {
   unsigned num;
   struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
};
2704
/* Return true if the PARAM export has been eliminated.
 *
 * An export whose channels are all undef, 0 or 1 (in the combinations the
 * hardware supports) can be replaced by SPI_PS_INPUT_CNTL's DEFAULT_VAL,
 * so the export instruction is erased and the output's offset is rewritten
 * to an AC_EXP_PARAM_DEFAULT_VAL_* token. */
static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset, uint32_t num_outputs,
                                      struct ac_vs_exp_inst *exp)
{
   unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
   bool is_zero[4] = {0}, is_one[4] = {0};

   for (i = 0; i < 4; i++) {
      /* It's a constant expression. Undef outputs are eliminated too. */
      if (exp->chan[i].type == AC_IR_UNDEF) {
         /* Undef matches both 0 and 1 so either default can absorb it. */
         is_zero[i] = true;
         is_one[i] = true;
      } else if (exp->chan[i].type == AC_IR_CONST) {
         if (exp->chan[i].const_float == 0)
            is_zero[i] = true;
         else if (exp->chan[i].const_float == 1)
            is_one[i] = true;
         else
            return false; /* other constant */
      } else
         return false;
   }

   /* Only certain combinations of 0 and 1 can be eliminated.
    * DEFAULT_VAL encodes (x,y,z,w) = 0: (0,0,0,0), 1: (0,0,0,1),
    * 2: (1,1,1,0), 3: (1,1,1,1). */
   if (is_zero[0] && is_zero[1] && is_zero[2])
      default_val = is_zero[3] ? 0 : 1;
   else if (is_one[0] && is_one[1] && is_one[2])
      default_val = is_zero[3] ? 2 : 3;
   else
      return false;

   /* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
   LLVMInstructionEraseFromParent(exp->inst);

   /* Change OFFSET to DEFAULT_VAL. */
   for (i = 0; i < num_outputs; i++) {
      if (vs_output_param_offset[i] == exp->offset) {
         vs_output_param_offset[i] = AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
         break;
      }
   }
   return true;
}
2748
ac_eliminate_duplicated_output(struct ac_llvm_context * ctx,uint8_t * vs_output_param_offset,uint32_t num_outputs,struct ac_vs_exports * processed,struct ac_vs_exp_inst * exp)2749 static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
2750 uint8_t *vs_output_param_offset, uint32_t num_outputs,
2751 struct ac_vs_exports *processed,
2752 struct ac_vs_exp_inst *exp)
2753 {
2754 unsigned p, copy_back_channels = 0;
2755
2756 /* See if the output is already in the list of processed outputs.
2757 * The LLVMValueRef comparison relies on SSA.
2758 */
2759 for (p = 0; p < processed->num; p++) {
2760 bool different = false;
2761
2762 for (unsigned j = 0; j < 4; j++) {
2763 struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
2764 struct ac_vs_exp_chan *c2 = &exp->chan[j];
2765
2766 /* Treat undef as a match. */
2767 if (c2->type == AC_IR_UNDEF)
2768 continue;
2769
2770 /* If c1 is undef but c2 isn't, we can copy c2 to c1
2771 * and consider the instruction duplicated.
2772 */
2773 if (c1->type == AC_IR_UNDEF) {
2774 copy_back_channels |= 1 << j;
2775 continue;
2776 }
2777
2778 /* Test whether the channels are not equal. */
2779 if (c1->type != c2->type ||
2780 (c1->type == AC_IR_CONST && c1->const_float != c2->const_float) ||
2781 (c1->type == AC_IR_VALUE && c1->value != c2->value)) {
2782 different = true;
2783 break;
2784 }
2785 }
2786 if (!different)
2787 break;
2788
2789 copy_back_channels = 0;
2790 }
2791 if (p == processed->num)
2792 return false;
2793
2794 /* If a match was found, but the matching export has undef where the new
2795 * one has a normal value, copy the normal value to the undef channel.
2796 */
2797 struct ac_vs_exp_inst *match = &processed->exp[p];
2798
2799 /* Get current enabled channels mask. */
2800 LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
2801 unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);
2802
2803 while (copy_back_channels) {
2804 unsigned chan = u_bit_scan(©_back_channels);
2805
2806 assert(match->chan[chan].type == AC_IR_UNDEF);
2807 LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan, exp->chan[chan].value);
2808 match->chan[chan] = exp->chan[chan];
2809
2810 /* Update number of enabled channels because the original mask
2811 * is not always 0xf.
2812 */
2813 enabled_channels |= (1 << chan);
2814 LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
2815 LLVMConstInt(ctx->i32, enabled_channels, 0));
2816 }
2817
2818 /* The PARAM export is duplicated. Kill it. */
2819 LLVMInstructionEraseFromParent(exp->inst);
2820
2821 /* Change OFFSET to the matching export. */
2822 for (unsigned i = 0; i < num_outputs; i++) {
2823 if (vs_output_param_offset[i] == exp->offset) {
2824 vs_output_param_offset[i] = match->offset;
2825 break;
2826 }
2827 }
2828 return true;
2829 }
2830
/* Scan the vertex shader for PARAM exports, eliminate constant and
 * duplicated ones, and renumber the survivors so export memory has no
 * holes.
 *
 * vs_output_param_offset maps shader outputs to PARAM indices and is
 * rewritten in place; num_param_exports receives the final export count.
 * Outputs whose bit is set in skip_output_mask are never eliminated.
 */
void ac_optimize_vs_outputs(struct ac_llvm_context *ctx, LLVMValueRef main_fn,
                            uint8_t *vs_output_param_offset, uint32_t num_outputs,
                            uint32_t skip_output_mask, uint8_t *num_param_exports)
{
   LLVMBasicBlockRef bb;
   bool removed_any = false;
   struct ac_vs_exports exports;

   exports.num = 0;

   /* Process all LLVM instructions. */
   bb = LLVMGetFirstBasicBlock(main_fn);
   while (bb) {
      LLVMValueRef inst = LLVMGetFirstInstruction(bb);

      while (inst) {
         /* Advance before possibly erasing "cur" below. */
         LLVMValueRef cur = inst;
         inst = LLVMGetNextInstruction(inst);
         struct ac_vs_exp_inst exp;

         if (LLVMGetInstructionOpcode(cur) != LLVMCall)
            continue;

         LLVMValueRef callee = ac_llvm_get_called_value(cur);

         if (!ac_llvm_is_function(callee))
            continue;

         const char *name = LLVMGetValueName(callee);
         unsigned num_args = LLVMCountParams(callee);

         /* Check if this is an export instruction. */
         if ((num_args != 9 && num_args != 8) ||
             (strcmp(name, "llvm.SI.export") && strcmp(name, "llvm.amdgcn.exp.f32")))
            continue;

         LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
         unsigned target = LLVMConstIntGetZExtValue(arg);

         /* Only PARAM exports are optimized; POS/MRT exports pass through. */
         if (target < V_008DFC_SQ_EXP_PARAM)
            continue;

         target -= V_008DFC_SQ_EXP_PARAM;

         /* Parse the instruction. */
         memset(&exp, 0, sizeof(exp));
         exp.offset = target;
         exp.inst = cur;

         for (unsigned i = 0; i < 4; i++) {
            LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);

            exp.chan[i].value = v;

            if (LLVMIsUndef(v)) {
               exp.chan[i].type = AC_IR_UNDEF;
            } else if (LLVMIsAConstantFP(v)) {
               LLVMBool loses_info;
               exp.chan[i].type = AC_IR_CONST;
               exp.chan[i].const_float = LLVMConstRealGetDouble(v, &loses_info);
            } else {
               exp.chan[i].type = AC_IR_VALUE;
            }
         }

         /* Eliminate constant and duplicated PARAM exports. */
         if (!((1u << target) & skip_output_mask) &&
             (ac_eliminate_const_output(vs_output_param_offset, num_outputs, &exp) ||
              ac_eliminate_duplicated_output(ctx, vs_output_param_offset, num_outputs, &exports,
                                             &exp))) {
            removed_any = true;
         } else {
            exports.exp[exports.num++] = exp;
         }
      }
      bb = LLVMGetNextBasicBlock(bb);
   }

   /* Remove holes in export memory due to removed PARAM exports.
    * This is done by renumbering all PARAM exports.
    */
   if (removed_any) {
      uint8_t old_offset[VARYING_SLOT_MAX];
      unsigned out, i;

      /* Make a copy of the offsets. We need the old version while
       * we are modifying some of them. */
      memcpy(old_offset, vs_output_param_offset, sizeof(old_offset));

      for (i = 0; i < exports.num; i++) {
         unsigned offset = exports.exp[i].offset;

         /* Update vs_output_param_offset. Multiple outputs can
          * have the same offset.
          */
         for (out = 0; out < num_outputs; out++) {
            if (old_offset[out] == offset)
               vs_output_param_offset[out] = i;
         }

         /* Change the PARAM offset in the instruction. */
         LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
                        LLVMConstInt(ctx->i32, V_008DFC_SQ_EXP_PARAM + i, 0));
      }
      *num_param_exports = exports.num;
   }
}
2938
ac_init_exec_full_mask(struct ac_llvm_context * ctx)2939 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2940 {
2941 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2942 ac_build_intrinsic(ctx, "llvm.amdgcn.init.exec", ctx->voidt, &full_mask, 1,
2943 AC_FUNC_ATTR_CONVERGENT);
2944 }
2945
/* Declare all of LDS as an i32-array pointer at LDS address 0 and store it
 * in ctx->lds. LDS is 64 KiB on GFX7+ and 32 KiB before that. */
void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
{
   unsigned lds_size = ctx->chip_class >= GFX7 ? 65536 : 32768;
   ctx->lds = LLVMBuildIntToPtr(
      ctx->builder, ctx->i32_0,
      LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_ADDR_SPACE_LDS), "lds");
}
2953
/* Load one dword from LDS at dword index "dw_addr" (requires
 * ac_declare_lds_as_pointer to have set up ctx->lds). */
LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx, LLVMValueRef dw_addr)
{
   LLVMValueRef slot = ac_build_gep0(ctx, ctx->lds, dw_addr);

   return LLVMBuildLoad(ctx->builder, slot, "");
}
2958
/* Store one dword to LDS at dword index "dw_addr". The value is bitcast to
 * an integer first so float and int payloads share one path. */
void ac_lds_store(struct ac_llvm_context *ctx, LLVMValueRef dw_addr, LLVMValueRef value)
{
   value = ac_to_integer(ctx, value);
   ac_build_indexed_store(ctx, ctx->lds, dw_addr, value);
}
2964
/* Find the least significant set bit of src0 (GLSL findLSB semantics):
 * returns the bit index as i32, or -1 when src0 == 0.
 *
 * dst_type is accepted for API symmetry; the result is always built as i32
 * here.
 */
LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx, LLVMTypeRef dst_type, LLVMValueRef src0)
{
   unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
   const char *intrin_name;
   LLVMTypeRef type;
   LLVMValueRef zero;

   /* Pick the llvm.cttz variant matching the source width. */
   switch (src0_bitsize) {
   case 64:
      intrin_name = "llvm.cttz.i64";
      type = ctx->i64;
      zero = ctx->i64_0;
      break;
   case 32:
      intrin_name = "llvm.cttz.i32";
      type = ctx->i32;
      zero = ctx->i32_0;
      break;
   case 16:
      intrin_name = "llvm.cttz.i16";
      type = ctx->i16;
      zero = ctx->i16_0;
      break;
   case 8:
      intrin_name = "llvm.cttz.i8";
      type = ctx->i8;
      zero = ctx->i8_0;
      break;
   default:
      unreachable(!"invalid bitsize");
   }

   LLVMValueRef params[2] = {
      src0,

      /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
       * add special code to check for x=0. The reason is that
       * the LLVM behavior for x=0 is different from what we
       * need here. However, LLVM also assumes that ffs(x) is
       * in [0, 31], but GLSL expects that ffs(0) = -1, so
       * a conditional assignment to handle 0 is still required.
       *
       * The hardware already implements the correct behavior.
       */
      ctx->i1true,
   };

   LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type, params, 2, AC_FUNC_ATTR_READNONE);

   /* Normalize the result width to i32. */
   if (src0_bitsize == 64) {
      lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
   } else if (src0_bitsize < 32) {
      lsb = LLVMBuildSExt(ctx->builder, lsb, ctx->i32, "");
   }

   /* TODO: We need an intrinsic to skip this conditional. */
   /* Check for zero: */
   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntEQ, src0, zero, ""),
                          LLVMConstInt(ctx->i32, -1, 0), lsb, "");
}
3025
/* Pointer to elem_type in the 64-bit constant address space. */
LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
{
   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST);
}
3030
/* Pointer to elem_type in the 32-bit constant address space. */
LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
{
   return LLVMPointerType(elem_type, AC_ADDR_SPACE_CONST_32BIT);
}
3035
get_current_flow(struct ac_llvm_context * ctx)3036 static struct ac_llvm_flow *get_current_flow(struct ac_llvm_context *ctx)
3037 {
3038 if (ctx->flow->depth > 0)
3039 return &ctx->flow->stack[ctx->flow->depth - 1];
3040 return NULL;
3041 }
3042
get_innermost_loop(struct ac_llvm_context * ctx)3043 static struct ac_llvm_flow *get_innermost_loop(struct ac_llvm_context *ctx)
3044 {
3045 for (unsigned i = ctx->flow->depth; i > 0; --i) {
3046 if (ctx->flow->stack[i - 1].loop_entry_block)
3047 return &ctx->flow->stack[i - 1];
3048 }
3049 return NULL;
3050 }
3051
/* Push a fresh entry onto the control-flow stack, growing the stack
 * (doubling, min AC_LLVM_INITIAL_CF_DEPTH) when full. */
static struct ac_llvm_flow *push_flow(struct ac_llvm_context *ctx)
{
   struct ac_llvm_flow *flow;

   if (ctx->flow->depth >= ctx->flow->depth_max) {
      unsigned new_max = MAX2(ctx->flow->depth << 1, AC_LLVM_INITIAL_CF_DEPTH);

      /* NOTE(review): realloc result is unchecked; on OOM this stores NULL
       * and the write below crashes immediately. Presumably a deliberate
       * abort-on-OOM policy — confirm. */
      ctx->flow->stack = realloc(ctx->flow->stack, new_max * sizeof(*ctx->flow->stack));
      ctx->flow->depth_max = new_max;
   }

   flow = &ctx->flow->stack[ctx->flow->depth];
   ctx->flow->depth++;

   flow->next_block = NULL;
   flow->loop_entry_block = NULL;
   return flow;
}
3070
/* Name a basic block "<base><label_id>", e.g. "loop12". */
static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base, int label_id)
{
   char name[32];

   snprintf(name, sizeof(name), "%s%d", base, label_id);
   LLVMSetValueName(LLVMBasicBlockAsValue(bb), name);
}
3077
/* Append a basic block at the level of the parent flow.
 *
 * Inside a nested construct the block is inserted before the parent's
 * next_block so blocks stay in source order; at top level it is appended
 * to the end of the current function.
 */
static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx, const char *name)
{
   assert(ctx->flow->depth >= 1);

   if (ctx->flow->depth >= 2) {
      struct ac_llvm_flow *flow = &ctx->flow->stack[ctx->flow->depth - 2];

      return LLVMInsertBasicBlockInContext(ctx->context, flow->next_block, name);
   }

   LLVMValueRef main_fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
   return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
}
3093
/* Emit a branch to the given default target for the current block if
 * applicable -- that is, if the current block does not already contain a
 * branch from a break or continue.
 */
static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
{
   /* A terminator already present means break/continue got there first. */
   if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
      LLVMBuildBr(builder, target);
}
3103
/* Open a loop: push a flow entry, create the LOOP and ENDLOOP blocks,
 * branch into the loop and continue emitting inside it. Must be matched by
 * ac_build_endloop with the same label_id. */
void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
{
   struct ac_llvm_flow *flow = push_flow(ctx);
   flow->loop_entry_block = append_basic_block(ctx, "LOOP");
   flow->next_block = append_basic_block(ctx, "ENDLOOP");
   set_basicblock_name(flow->loop_entry_block, "loop", label_id);
   LLVMBuildBr(ctx->builder, flow->loop_entry_block);
   LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
}
3113
/* Branch to the exit block of the innermost loop.
 * NOTE(review): assumes an enclosing loop exists; get_innermost_loop
 * returns NULL otherwise and this would crash. */
void ac_build_break(struct ac_llvm_context *ctx)
{
   struct ac_llvm_flow *flow = get_innermost_loop(ctx);
   LLVMBuildBr(ctx->builder, flow->next_block);
}
3119
/* Branch back to the entry block of the innermost loop.
 * NOTE(review): assumes an enclosing loop exists (see ac_build_break). */
void ac_build_continue(struct ac_llvm_context *ctx)
{
   struct ac_llvm_flow *flow = get_innermost_loop(ctx);
   LLVMBuildBr(ctx->builder, flow->loop_entry_block);
}
3125
/* Start the else branch of the current if: close the then-block with a
 * branch to a new ENDIF block, move the builder to the else block created
 * by ac_build_ifcc, and make ENDIF the construct's next_block. */
void ac_build_else(struct ac_llvm_context *ctx, int label_id)
{
   struct ac_llvm_flow *current_branch = get_current_flow(ctx);
   LLVMBasicBlockRef endif_block;

   /* Must be inside an if, not a loop. */
   assert(!current_branch->loop_entry_block);

   endif_block = append_basic_block(ctx, "ENDIF");
   emit_default_branch(ctx->builder, endif_block);

   LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
   set_basicblock_name(current_branch->next_block, "else", label_id);

   current_branch->next_block = endif_block;
}
3141
/* Close the current if/else: branch to the merge block, continue emitting
 * there and pop the flow stack. */
void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
{
   struct ac_llvm_flow *current_branch = get_current_flow(ctx);

   /* Must be inside an if, not a loop. */
   assert(!current_branch->loop_entry_block);

   emit_default_branch(ctx->builder, current_branch->next_block);
   LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
   set_basicblock_name(current_branch->next_block, "endif", label_id);

   ctx->flow->depth--;
}
3154
/* Close the current loop: emit the back-edge to the loop entry (unless a
 * break/continue already terminated the block), continue emitting in the
 * exit block and pop the flow stack. */
void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
{
   struct ac_llvm_flow *current_loop = get_current_flow(ctx);

   /* Must be inside a loop, not an if. */
   assert(current_loop->loop_entry_block);

   emit_default_branch(ctx->builder, current_loop->loop_entry_block);

   LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
   set_basicblock_name(current_loop->next_block, "endloop", label_id);
   ctx->flow->depth--;
}
3167
/* Open an if on the i1 condition "cond": create IF and ELSE blocks,
 * conditionally branch, and continue emitting in the IF block. Must be
 * matched by ac_build_endif (optionally with ac_build_else in between). */
void ac_build_ifcc(struct ac_llvm_context *ctx, LLVMValueRef cond, int label_id)
{
   struct ac_llvm_flow *flow = push_flow(ctx);
   LLVMBasicBlockRef if_block;

   if_block = append_basic_block(ctx, "IF");
   /* The ELSE block doubles as the merge block when no else is emitted. */
   flow->next_block = append_basic_block(ctx, "ELSE");
   set_basicblock_name(if_block, "if", label_id);
   LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
   LLVMPositionBuilderAtEnd(ctx->builder, if_block);
}
3179
/* Create an uninitialized alloca of "type" in the entry block of the
 * current function (LLVM only promotes entry-block allocas to registers),
 * regardless of where the main builder currently is. */
LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
{
   LLVMBuilderRef builder = ac->builder;
   LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
   LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
   LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
   LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
   /* Use a temporary builder so the caller's insert point is untouched. */
   LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
   LLVMValueRef res;

   if (first_instr) {
      LLVMPositionBuilderBefore(first_builder, first_instr);
   } else {
      LLVMPositionBuilderAtEnd(first_builder, first_block);
   }

   res = LLVMBuildAlloca(first_builder, type, name);
   LLVMDisposeBuilder(first_builder);
   return res;
}
3200
ac_build_alloca(struct ac_llvm_context * ac,LLVMTypeRef type,const char * name)3201 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type, const char *name)
3202 {
3203 LLVMValueRef ptr = ac_build_alloca_undef(ac, type, name);
3204 LLVMBuildStore(ac->builder, LLVMConstNull(type), ptr);
3205 return ptr;
3206 }
3207
/* Bitcast a pointer to point to "type", preserving its address space. */
LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr, LLVMTypeRef type)
{
   unsigned addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
   LLVMTypeRef dst_type = LLVMPointerType(type, addr_space);
   return LLVMBuildBitCast(ctx->builder, ptr, dst_type, "");
}
3213
/* Reduce "value" to its first "count" components. Returns the value
 * unchanged if it already has "count" components, a scalar if count == 1,
 * and a shuffled sub-vector otherwise.
 */
LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned count)
{
   unsigned src_components = ac_get_llvm_num_components(value);
   if (count == src_components)
      return value;

   /* Index constants 0..count-1; allocate at least two slots so the
    * unconditional stores of indices[0] and indices[1] stay in bounds. */
   LLVMValueRef *const indices = alloca(MAX2(count, 2) * sizeof(LLVMValueRef));
   indices[0] = ctx->i32_0;
   indices[1] = ctx->i32_1;
   for (unsigned i = 2; i < count; i++)
      indices[i] = LLVMConstInt(ctx->i32, i, false);

   /* A single component is extracted as a scalar, not a 1-wide vector. */
   if (count == 1)
      return LLVMBuildExtractElement(ctx->builder, value, indices[0], "");

   LLVMValueRef shuffle_mask = LLVMConstVector(indices, count);
   return LLVMBuildShuffleVector(ctx->builder, value, value, shuffle_mask, "");
}
3232
/* Extract the bitfield [rshift, rshift + bitwidth) from a 32-bit value.
 *
 * \param param    the i32 value to unpack
 * \param rshift   bit offset of the field
 * \param bitwidth width of the field in bits; if the field reaches bit 31
 *                 no masking is emitted (the shift already cleared the top)
 */
LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param, unsigned rshift,
                             unsigned bitwidth)
{
   LLVMValueRef value = param;
   if (rshift)
      value = LLVMBuildLShr(ctx->builder, value, LLVMConstInt(ctx->i32, rshift, false), "");

   if (rshift + bitwidth < 32) {
      /* Use an unsigned literal: "1 << 31" would be signed-overflow UB. */
      unsigned mask = (1u << bitwidth) - 1;
      value = LLVMBuildAnd(ctx->builder, value, LLVMConstInt(ctx->i32, mask, false), "");
   }
   return value;
}
3246
3247 /* Adjust the sample index according to FMASK.
3248 *
3249 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3250 * which is the identity mapping. Each nibble says which physical sample
3251 * should be fetched to get that sample.
3252 *
3253 * For example, 0x11111100 means there are only 2 samples stored and
3254 * the second sample covers 3/4 of the pixel. When reading samples 0
3255 * and 1, return physical sample 0 (determined by the first two 0s
3256 * in FMASK), otherwise return physical sample 1.
3257 *
3258 * The sample index should be adjusted as follows:
3259 * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
3260 */
void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LLVMValueRef *addr,
                              bool is_array_tex)
{
   /* Load the FMASK texel for this pixel (2D or 2D-array coordinates). */
   struct ac_image_args fmask_load = {0};
   fmask_load.opcode = ac_image_load;
   fmask_load.resource = fmask;
   fmask_load.dmask = 0xf;
   fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
   fmask_load.attributes = AC_FUNC_ATTR_READNONE;

   fmask_load.coords[0] = addr[0];
   fmask_load.coords[1] = addr[1];
   if (is_array_tex)
      fmask_load.coords[2] = addr[2];

   LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
   /* Only the first channel of the loaded FMASK value is used. */
   fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, "");

   /* Apply the formula (see the comment above this function):
    *   addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF
    * The sample index lives in coords[2] (2D) or coords[3] (2D array). */
   unsigned sample_chan = is_array_tex ? 3 : 2;
   LLVMValueRef final_sample;
   final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], LLVMConstInt(ac->i32, 4, 0), "");
   final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
   /* Mask the sample index by 0x7, because 0x8 means an unknown value
    * with EQAA, so those will map to 0. */
   final_sample = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), "");

   /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
    * resource descriptor is 0 (invalid).
    */
   LLVMValueRef tmp;
   tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
   tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
   tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");

   /* Replace the MSAA sample index. */
   addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample, addr[sample_chan], "");
}
3299
/* Emit a single readlane/readfirstlane for a value of at most 32 bits.
 * lane == NULL selects readfirstlane. */
static LLVMValueRef _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src,
                                       LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   if (with_opt_barrier)
      ac_build_optimization_barrier(ctx, &src);

   /* The intrinsics only exist for i32; widen smaller operands. */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
   if (lane)
      lane = LLVMBuildZExt(ctx->builder, lane, ctx->i32, "");

   const char *intr_name = lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane";
   LLVMValueRef args[2] = {src, lane};
   LLVMValueRef value = ac_build_intrinsic(ctx, intr_name, ctx->i32, args, lane == NULL ? 1 : 2,
                                           AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, value, orig_type, "");
}
3320
/* Read a value of any size from a lane, splitting wide values into 32-bit
 * chunks. Pointers are round-tripped through integers. */
static LLVMValueRef ac_build_readlane_common(struct ac_llvm_context *ctx, LLVMValueRef src,
                                             LLVMValueRef lane, bool with_opt_barrier)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef result;

   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));

   if (bits <= 32) {
      result = _ac_build_readlane(ctx, src, lane, with_opt_barrier);
   } else {
      /* Wide value: read each 32-bit chunk separately. */
      assert(bits % 32 == 0);
      unsigned num_chunks = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_chunks);
      LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");

      result = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_chunks; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef chunk = LLVMBuildExtractElement(ctx->builder, src_vec, idx, "");
         chunk = _ac_build_readlane(ctx, chunk, lane, with_opt_barrier);
         result = LLVMBuildInsertElement(ctx->builder, result, chunk, idx, "");
      }
   }

   if (LLVMGetTypeKind(orig_type) == LLVMPointerTypeKind)
      return LLVMBuildIntToPtr(ctx->builder, result, orig_type, "");
   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3352
/**
 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
 *
 * The optimization barrier is not needed if the value is the same in all lanes
 * or if this is called in the outermost block.
 *
 * @param ctx the LLVM context
 * @param src the per-lane value to read
 * @param lane - id of the lane or NULL for the first active lane
 * @return value of the lane
 */
/* Readlane without the optimization barrier — only valid when the value is
 * wave-uniform or this is emitted in the outermost block (see the comment
 * above). */
LLVMValueRef ac_build_readlane_no_opt_barrier(struct ac_llvm_context *ctx, LLVMValueRef src,
                                              LLVMValueRef lane)
{
   return ac_build_readlane_common(ctx, src, lane, false);
}
3369
/* Readlane with an optimization barrier on the source. */
LLVMValueRef ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
{
   return ac_build_readlane_common(ctx, src, lane, true);
}
3374
/* Write "value" into the given lane of "src"; other lanes keep src. */
LLVMValueRef ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value,
                                LLVMValueRef lane)
{
   LLVMValueRef args[3] = {value, lane, src};
   return ac_build_intrinsic(ctx, "llvm.amdgcn.writelane", ctx->i32, args, 3,
                             AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
}
3382
/* Build mbcnt for the given lane mask. Wave32 needs only mbcnt.lo;
 * wave64 splits the 64-bit mask and chains mbcnt.lo into mbcnt.hi. */
LLVMValueRef ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
{
   LLVMBuilderRef builder = ctx->builder;

   if (ctx->wave_size == 32) {
      return ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                                (LLVMValueRef[]){mask, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
   }

   LLVMValueRef halves = LLVMBuildBitCast(builder, mask, ctx->v2i32, "");
   LLVMValueRef lo_half = LLVMBuildExtractElement(builder, halves, ctx->i32_0, "");
   LLVMValueRef hi_half = LLVMBuildExtractElement(builder, halves, ctx->i32_1, "");

   LLVMValueRef count =
      ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
                         (LLVMValueRef[]){lo_half, ctx->i32_0}, 2, AC_FUNC_ATTR_READNONE);
   count = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
                              (LLVMValueRef[]){hi_half, count}, 2, AC_FUNC_ATTR_READNONE);
   return count;
}
3399
/* DPP (data-parallel primitives) control encodings.
 * Constants with a leading underscore are base values that still need a
 * lane/amount immediate OR'ed in — use the dpp_quad_perm/dpp_row_sl/
 * dpp_row_sr helpers below. */
enum dpp_ctrl
{
   _dpp_quad_perm = 0x000,     /* permute within quads; lane selectors in bits [7:0] */
   _dpp_row_sl = 0x100,        /* row shift left, amount 1..15 in low bits */
   _dpp_row_sr = 0x110,        /* row shift right, amount 1..15 in low bits */
   _dpp_row_rr = 0x120,        /* row rotate right, amount 1..15 in low bits */
   dpp_wf_sl1 = 0x130,         /* wavefront shift left by 1 */
   dpp_wf_rl1 = 0x134,         /* wavefront rotate left by 1 */
   dpp_wf_sr1 = 0x138,         /* wavefront shift right by 1 */
   dpp_wf_rr1 = 0x13C,         /* wavefront rotate right by 1 */
   dpp_row_mirror = 0x140,     /* mirror lanes within a row */
   dpp_row_half_mirror = 0x141,/* mirror lanes within a half-row */
   dpp_row_bcast15 = 0x142,    /* broadcast lane 15 to the next row */
   dpp_row_bcast31 = 0x143     /* broadcast lane 31 to rows 2 and 3 */
};
3415
dpp_quad_perm(unsigned lane0,unsigned lane1,unsigned lane2,unsigned lane3)3416 static inline enum dpp_ctrl dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2,
3417 unsigned lane3)
3418 {
3419 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
3420 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
3421 }
3422
dpp_row_sl(unsigned amount)3423 static inline enum dpp_ctrl dpp_row_sl(unsigned amount)
3424 {
3425 assert(amount > 0 && amount < 16);
3426 return _dpp_row_sl | amount;
3427 }
3428
dpp_row_sr(unsigned amount)3429 static inline enum dpp_ctrl dpp_row_sr(unsigned amount)
3430 {
3431 assert(amount > 0 && amount < 16);
3432 return _dpp_row_sr | amount;
3433 }
3434
/* Emit a single llvm.amdgcn.update.dpp.i32 for a value of at most 32 bits.
 * "old" supplies the result for lanes the DPP op leaves unwritten. */
static LLVMValueRef _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                  enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                  bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   /* The intrinsic is i32-only; widen and narrow around it. */
   old = LLVMBuildZExt(ctx->builder, old, ctx->i32, "");
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[6] = {
      old,
      src,
      LLVMConstInt(ctx->i32, dpp_ctrl, 0),
      LLVMConstInt(ctx->i32, row_mask, 0),
      LLVMConstInt(ctx->i32, bank_mask, 0),
      LLVMConstInt(ctx->i1, bound_ctrl, 0),
   };
   LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32", ctx->i32, args, 6,
                                            AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
3454
/* DPP op for values of any size: converts to integers and runs the i32
 * primitive on each 32-bit chunk of wide values. */
static LLVMValueRef ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
                                 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
                                 bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef result;

   src = ac_to_integer(ctx, src);
   old = ac_to_integer(ctx, old);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));

   if (bits <= 32) {
      result = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   } else {
      assert(bits % 32 == 0);
      unsigned num_chunks = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_chunks);
      LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");
      LLVMValueRef old_vec = LLVMBuildBitCast(ctx->builder, old, vec_type, "");

      result = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_chunks; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef src_chunk = LLVMBuildExtractElement(ctx->builder, src_vec, idx, "");
         LLVMValueRef old_chunk = LLVMBuildExtractElement(ctx->builder, old_vec, idx, "");
         LLVMValueRef chunk =
            _ac_build_dpp(ctx, old_chunk, src_chunk, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
         result = LLVMBuildInsertElement(ctx->builder, result, chunk, idx, "");
      }
   }
   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3483
/* Emit a single permlane16/permlanex16 for a value of at most 32 bits.
 * "sel" packs the two 32-bit lane-select immediates into one uint64_t. */
static LLVMValueRef _ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         uint64_t sel, bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   /* The intrinsic is i32-only. */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[6] = {
      src,
      src,
      LLVMConstInt(ctx->i32, sel, false),       /* low selector word */
      LLVMConstInt(ctx->i32, sel >> 32, false), /* high selector word */
      ctx->i1true,                              /* fi — always set here */
      bound_ctrl ? ctx->i1true : ctx->i1false,
   };
   const char *intr_name =
      exchange_rows ? "llvm.amdgcn.permlanex16" : "llvm.amdgcn.permlane16";

   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, ctx->i32, args, 6,
                                            AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
3507
/* permlane16/permlanex16 for values of any size: wide values are handled
 * one 32-bit chunk at a time. */
static LLVMValueRef ac_build_permlane16(struct ac_llvm_context *ctx, LLVMValueRef src, uint64_t sel,
                                        bool exchange_rows, bool bound_ctrl)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef result;

   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));

   if (bits <= 32) {
      result = _ac_build_permlane16(ctx, src, sel, exchange_rows, bound_ctrl);
   } else {
      assert(bits % 32 == 0);
      unsigned num_chunks = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_chunks);
      LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");

      result = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_chunks; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef chunk = LLVMBuildExtractElement(ctx->builder, src_vec, idx, "");
         chunk = _ac_build_permlane16(ctx, chunk, sel, exchange_rows, bound_ctrl);
         result = LLVMBuildInsertElement(ctx->builder, result, chunk, idx, "");
      }
   }
   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3531
/* Pack the ds_swizzle "bitmode" pattern: the source lane is computed as
 * ((self_lane & and_mask) | or_mask) ^ xor_mask, each mask 5 bits wide. */
static inline unsigned ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
   assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);
   unsigned pattern = and_mask;
   pattern |= or_mask << 5;
   pattern |= xor_mask << 10;
   return pattern;
}
3537
/* Emit a single ds_swizzle for a value of at most 32 bits. */
static LLVMValueRef _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
                                         unsigned mask)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   /* ds_swizzle is i32-only; widen and narrow around it. */
   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   LLVMValueRef args[2] = {src, LLVMConstInt(ctx->i32, mask, 0)};
   LLVMValueRef result = ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle", ctx->i32, args, 2,
                                            AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);

   return LLVMBuildTrunc(ctx->builder, result, orig_type, "");
}
3552
/* ds_swizzle for values of any size: wide values are swizzled one 32-bit
 * chunk at a time. */
LLVMValueRef ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   LLVMValueRef result;

   src = ac_to_integer(ctx, src);
   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));

   if (bits <= 32) {
      result = _ac_build_ds_swizzle(ctx, src, mask);
   } else {
      assert(bits % 32 == 0);
      unsigned num_chunks = bits / 32;
      LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, num_chunks);
      LLVMValueRef src_vec = LLVMBuildBitCast(ctx->builder, src, vec_type, "");

      result = LLVMGetUndef(vec_type);
      for (unsigned i = 0; i < num_chunks; i++) {
         LLVMValueRef idx = LLVMConstInt(ctx->i32, i, 0);
         LLVMValueRef chunk = LLVMBuildExtractElement(ctx->builder, src_vec, idx, "");
         chunk = _ac_build_ds_swizzle(ctx, chunk, mask);
         result = LLVMBuildInsertElement(ctx->builder, result, chunk, idx, "");
      }
   }
   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3575
/* Wrap "src" in the llvm.amdgcn.wwm intrinsic for the matching integer
 * type. Sub-32-bit values are zero-extended to i32 around the intrinsic
 * and narrowed back afterwards; the result keeps src's original type. */
static LLVMValueRef ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
{
   LLVMTypeRef orig_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, orig_type);
   char intr_name[32], type_name[8];

   src = ac_to_integer(ctx, src);
   if (bitsize < 32)
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");

   /* e.g. "llvm.amdgcn.wwm.i32" */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type_name, sizeof(type_name));
   snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.wwm.%s", type_name);

   LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, LLVMTypeOf(src),
                                            (LLVMValueRef[]){src}, 1, AC_FUNC_ATTR_READNONE);

   if (bitsize < 32)
      result = LLVMBuildTrunc(ctx->builder, result, ac_to_integer_type(ctx, orig_type), "");

   return LLVMBuildBitCast(ctx->builder, result, orig_type, "");
}
3598
/* Wrap "src" in llvm.amdgcn.set.inactive so that inactive lanes produce
 * the "inactive" value instead; used with WWM to seed cross-lane scans
 * (see ac_build_inclusive_scan). Sub-32-bit values are zero-extended to
 * i32 around the intrinsic because it only exists for 32/64-bit types.
 */
static LLVMValueRef ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
                                          LLVMValueRef inactive)
{
   char name[33], type[8];
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned bitsize = ac_get_elem_bits(ctx, src_type);
   src = ac_to_integer(ctx, src);
   inactive = ac_to_integer(ctx, inactive);

   if (bitsize < 32) {
      src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      inactive = LLVMBuildZExt(ctx->builder, inactive, ctx->i32, "");
   }

   /* e.g. "llvm.amdgcn.set.inactive.i32" */
   ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
   snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
   LLVMValueRef ret =
      ac_build_intrinsic(ctx, name, LLVMTypeOf(src), (LLVMValueRef[]){src, inactive}, 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
   /* NOTE(review): the narrow path truncates to src_type, which assumes
    * sub-32-bit callers pass integer types — confirm against callers. */
   if (bitsize < 32)
      ret = LLVMBuildTrunc(ctx->builder, ret, src_type, "");

   return ret;
}
3623
/* Return the identity element of reduction op "op" for operands of
 * "type_size" bytes (0 denotes i1). Used to seed scans and to fill
 * inactive lanes.
 */
static LLVMValueRef get_reduction_identity(struct ac_llvm_context *ctx, nir_op op,
                                           unsigned type_size)
{

   if (type_size == 0) {
      /* 1-bit booleans. */
      switch (op) {
      case nir_op_ior:
      case nir_op_ixor:
         return LLVMConstInt(ctx->i1, 0, 0);
      case nir_op_iand:
         return LLVMConstInt(ctx->i1, 1, 0);
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 1) {
      /* 8-bit integers. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i8_0;
      case nir_op_imul:
         return ctx->i8_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i8, INT8_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i8, UINT8_MAX, 0);
      case nir_op_imax:
         return LLVMConstInt(ctx->i8, INT8_MIN, 0);
      case nir_op_umax:
         return ctx->i8_0;
      case nir_op_iand:
         return LLVMConstInt(ctx->i8, -1, 0);
      case nir_op_ior:
         return ctx->i8_0;
      case nir_op_ixor:
         return ctx->i8_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 2) {
      /* 16-bit integers and half floats. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i16_0;
      case nir_op_fadd:
         return ctx->f16_0;
      case nir_op_imul:
         return ctx->i16_1;
      case nir_op_fmul:
         return ctx->f16_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i16, INT16_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i16, UINT16_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f16, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i16, INT16_MIN, 0);
      case nir_op_umax:
         return ctx->i16_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f16, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i16, -1, 0);
      case nir_op_ior:
         return ctx->i16_0;
      case nir_op_ixor:
         return ctx->i16_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else if (type_size == 4) {
      /* 32-bit integers and floats. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i32_0;
      case nir_op_fadd:
         return ctx->f32_0;
      case nir_op_imul:
         return ctx->i32_1;
      case nir_op_fmul:
         return ctx->f32_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i32, INT32_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f32, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i32, INT32_MIN, 0);
      case nir_op_umax:
         return ctx->i32_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f32, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i32, -1, 0);
      case nir_op_ior:
         return ctx->i32_0;
      case nir_op_ixor:
         return ctx->i32_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   } else { /* type_size == 64bit */
      /* 64-bit integers and doubles. */
      switch (op) {
      case nir_op_iadd:
         return ctx->i64_0;
      case nir_op_fadd:
         return ctx->f64_0;
      case nir_op_imul:
         return ctx->i64_1;
      case nir_op_fmul:
         return ctx->f64_1;
      case nir_op_imin:
         return LLVMConstInt(ctx->i64, INT64_MAX, 0);
      case nir_op_umin:
         return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
      case nir_op_fmin:
         return LLVMConstReal(ctx->f64, INFINITY);
      case nir_op_imax:
         return LLVMConstInt(ctx->i64, INT64_MIN, 0);
      case nir_op_umax:
         return ctx->i64_0;
      case nir_op_fmax:
         return LLVMConstReal(ctx->f64, -INFINITY);
      case nir_op_iand:
         return LLVMConstInt(ctx->i64, -1, 0);
      case nir_op_ior:
         return ctx->i64_0;
      case nir_op_ixor:
         return ctx->i64_0;
      default:
         unreachable("bad reduction intrinsic");
      }
   }
}
3756
/* Apply the binary reduction operation "op" to lhs and rhs (same type).
 * Integer min/max are emitted as icmp+select; float min/max use the
 * llvm.minnum/llvm.maxnum intrinsic matching the operand bit width.
 */
static LLVMValueRef ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs,
                                    nir_op op)
{
   bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8; /* 8-byte operands */
   bool _32bit = ac_get_type_size(LLVMTypeOf(lhs)) == 4; /* 4-byte operands */
   switch (op) {
   case nir_op_iadd:
      return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
   case nir_op_fadd:
      return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
   case nir_op_imul:
      return LLVMBuildMul(ctx->builder, lhs, rhs, "");
   case nir_op_fmul:
      return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
   case nir_op_imin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umin:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmin:
      return ac_build_intrinsic(
         ctx, _64bit ? "llvm.minnum.f64" : _32bit ? "llvm.minnum.f32" : "llvm.minnum.f16",
         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
         AC_FUNC_ATTR_READNONE);
   case nir_op_imax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_umax:
      return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
                             lhs, rhs, "");
   case nir_op_fmax:
      return ac_build_intrinsic(
         ctx, _64bit ? "llvm.maxnum.f64" : _32bit ? "llvm.maxnum.f32" : "llvm.maxnum.f16",
         _64bit ? ctx->f64 : _32bit ? ctx->f32 : ctx->f16, (LLVMValueRef[]){lhs, rhs}, 2,
         AC_FUNC_ATTR_READNONE);
   case nir_op_iand:
      return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
   case nir_op_ior:
      return LLVMBuildOr(ctx->builder, lhs, rhs, "");
   case nir_op_ixor:
      return LLVMBuildXor(ctx->builder, lhs, rhs, "");
   default:
      unreachable("bad reduction intrinsic");
   }
}
3803
3804 /**
3805 * \param src The value to shift.
3806 * \param identity The value to use the first lane.
3807 * \param maxprefix specifies that the result only needs to be correct for a
3808 * prefix of this many threads
3809 * \return src, shifted 1 lane up, and identity shifted into lane 0.
3810 */
static LLVMValueRef ac_wavefront_shift_right_1(struct ac_llvm_context *ctx, LLVMValueRef src,
                                               LLVMValueRef identity, unsigned maxprefix)
{
   if (ctx->chip_class >= GFX10) {
      /* wavefront shift_right by 1 on GFX10 (emulate dpp_wf_sr1) */
      LLVMValueRef active, tmp1, tmp2;
      LLVMValueRef tid = ac_get_thread_id(ctx);

      /* tmp1: src shifted right by one within each row of 16 lanes. */
      tmp1 = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);

      /* tmp2: src from the corresponding lane of the paired row. */
      tmp2 = ac_build_permlane16(ctx, src, (uint64_t)~0, true, false);

      if (maxprefix > 32) {
         /* Lane 32 must read lane 31, which permlanex16 cannot deliver
          * (it only exchanges within a pair of rows); use readlane. */
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, false), "");

         tmp2 = LLVMBuildSelect(ctx->builder, active,
                                ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, false)),
                                tmp2, "");

         /* Use tmp2 for lane 32 and for the first lane of every second row
          * (tid % 32 == 16); everyone else keeps the row-shifted value. */
         active = LLVMBuildOr(
            ctx->builder, active,
            LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, false), ""),
                          LLVMConstInt(ctx->i32, 0x10, false), ""),
            "");
         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      } else if (maxprefix > 16) {
         /* Only lane 16 needs a value from the other row. */
         active =
            LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 16, false), "");

         return LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
      }
      /* NOTE(review): for maxprefix <= 16 on GFX10 control falls through to
       * the SI/CI ds_swizzle path below — confirm that is intended (the
       * scan callers pass wave_size, which is at least 32). */
   } else if (ctx->chip_class >= GFX8) {
      /* GFX8/GFX9 have a native wavefront shift-right-by-1 DPP mode. */
      return ac_build_dpp(ctx, identity, src, dpp_wf_sr1, 0xf, 0xf, false);
   }

   /* wavefront shift_right by 1 on SI/CI: build the shift out of
    * successively wider ds_swizzle shifts (4, 8, 16 lanes), then patch
    * lane 32 with readlane(31) and lane 0 with the identity. */
   LLVMValueRef active, tmp1, tmp2;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   tmp1 = ac_build_ds_swizzle(ctx, src, (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x18, 0x03, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x7, 0), ""),
                          LLVMConstInt(ctx->i32, 0x4, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x10, 0x07, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0xf, 0), ""),
                          LLVMConstInt(ctx->i32, 0x8, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   tmp2 = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x00, 0x0f, 0x00));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
                          LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 0x1f, 0), ""),
                          LLVMConstInt(ctx->i32, 0x10, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   tmp2 = ac_build_readlane(ctx, src, LLVMConstInt(ctx->i32, 31, 0));
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 32, 0), "");
   tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
   active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, LLVMConstInt(ctx->i32, 0, 0), "");
   return LLVMBuildSelect(ctx->builder, active, identity, tmp1, "");
}
3873
3874 /**
3875 * \param maxprefix specifies that the result only needs to be correct for a
3876 * prefix of this many threads
3877 */
static LLVMValueRef ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src,
                                  LLVMValueRef identity, unsigned maxprefix, bool inclusive)
{
   LLVMValueRef result, tmp;

   /* An exclusive scan is an inclusive scan of the lane-shifted input. */
   if (!inclusive)
      src = ac_wavefront_shift_right_1(ctx, src, identity, maxprefix);

   result = src;

   if (ctx->chip_class <= GFX7) {
      /* GFX6/GFX7 have no DPP: do the log-step scan with ds_swizzle,
       * masking out lanes that must not accumulate at each step. */
      assert(maxprefix == 64);
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;
      tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 0x00, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, ""), ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 0x01, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 2, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 0x03, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 4, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 0x07, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 8, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 0x0f, 0x00));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      /* Cross the 32-lane boundary with a readlane of lane 31. */
      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 0));
      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 32, 0), ""),
                             ctx->i32_0, "");
      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   /* GFX8+: log-step scan with DPP row shifts (rows are 16 lanes);
    * each step is skipped as soon as maxprefix no longer requires it. */
   if (maxprefix <= 1)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 2)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 3)
      return result;
   tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 4)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 8)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 16)
      return result;

   if (ctx->chip_class >= GFX10) {
      /* GFX10 path: cross row boundaries with permlanex16 and the 32-lane
       * boundary with readlane instead of DPP row_bcast. */
      LLVMValueRef tid = ac_get_thread_id(ctx);
      LLVMValueRef active;

      tmp = ac_build_permlane16(ctx, result, ~(uint64_t)0, true, false);

      active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
                             LLVMBuildAnd(ctx->builder, tid, LLVMConstInt(ctx->i32, 16, false), ""),
                             ctx->i32_0, "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);

      if (maxprefix <= 32)
         return result;

      tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));

      active = LLVMBuildICmp(ctx->builder, LLVMIntUGE, tid, LLVMConstInt(ctx->i32, 32, false), "");

      tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");

      result = ac_build_alu_op(ctx, result, tmp, op);
      return result;
   }

   /* GFX8/GFX9: cross row boundaries with DPP row broadcasts. */
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   if (maxprefix <= 32)
      return result;
   tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
   result = ac_build_alu_op(ctx, result, tmp, op);
   return result;
}
3988
LLVMValueRef ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   /* Fast path: an inclusive add-scan of booleans is the bit count of the
    * ballot over lanes below this one (mbcnt) plus the lane's own bit. */
   if (op == nir_op_iadd && LLVMTypeOf(src) == ctx->i1) {
      LLVMBuilderRef b = ctx->builder;
      LLVMValueRef bit = LLVMBuildZExt(b, src, ctx->i32, "");
      LLVMValueRef below = ac_build_mbcnt(ctx, ac_build_ballot(ctx, bit));
      return LLVMBuildAdd(b, below, bit, "");
   }

   /* General path: run the per-wave scan in WWM, with inactive lanes
    * contributing the operation's identity value. */
   ac_build_optimization_barrier(ctx, &src);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef scan_src = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                                            LLVMTypeOf(identity), "");
   LLVMValueRef scanned = ac_build_scan(ctx, op, scan_src, identity, ctx->wave_size, true);

   return ac_build_wwm(ctx, scanned);
}
4011
LLVMValueRef ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
{
   /* Fast path: an exclusive add-scan of booleans is just the count of set
    * bits in strictly-lower lanes, i.e. mbcnt of the ballot. */
   if (op == nir_op_iadd && LLVMTypeOf(src) == ctx->i1) {
      LLVMValueRef bit = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
      return ac_build_mbcnt(ctx, ac_build_ballot(ctx, bit));
   }

   /* General path: identical to the inclusive scan except that the final
    * per-wave scan is exclusive. */
   ac_build_optimization_barrier(ctx, &src);

   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   LLVMValueRef scan_src = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                                            LLVMTypeOf(identity), "");
   LLVMValueRef scanned = ac_build_scan(ctx, op, scan_src, identity, ctx->wave_size, false);

   return ac_build_wwm(ctx, scanned);
}
4033
LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op,
                             unsigned cluster_size)
{
   /* Reduce "src" with "op" across clusters of cluster_size lanes using a
    * sequence of lane-swap + ALU steps (doubling the cluster at each step),
    * executed in WWM with inactive lanes set to the operation's identity.
    * The lane-swap primitive depends on the chip generation.
    */
   if (cluster_size == 1)
      return src;
   ac_build_optimization_barrier(ctx, &src);
   LLVMValueRef result, swap;
   LLVMValueRef identity = get_reduction_identity(ctx, op, ac_get_type_size(LLVMTypeOf(src)));
   result = LLVMBuildBitCast(ctx->builder, ac_build_set_inactive(ctx, src, identity),
                             LLVMTypeOf(identity), "");
   /* Step 1: combine neighboring lane pairs (swap lanes within each pair). */
   swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 2)
      return ac_build_wwm(ctx, result);

   /* Step 2: combine the two pairs of each quad. */
   swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 4)
      return ac_build_wwm(ctx, result);

   /* Step 3 (cluster of 8): GFX8+ uses DPP row_half_mirror; older chips
    * emulate the lane swap with an LDS swizzle. */
   if (ctx->chip_class >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 8)
      return ac_build_wwm(ctx, result);

   /* Step 4 (cluster of 16): DPP row_mirror, or an LDS swizzle pre-GFX8. */
   if (ctx->chip_class >= GFX8)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 16)
      return ac_build_wwm(ctx, result);

   /* Step 5 (cluster of 32): GFX10 uses permlane16 to exchange row halves;
    * GFX8/9 use DPP row_bcast15 (unless the final cluster is exactly 32);
    * GFX6/7 fall back to an LDS swizzle again. */
   if (ctx->chip_class >= GFX10)
      swap = ac_build_permlane16(ctx, result, 0, true, false);
   else if (ctx->chip_class >= GFX8 && cluster_size != 32)
      swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
   else
      swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
   result = ac_build_alu_op(ctx, result, swap, op);
   if (cluster_size == 32)
      return ac_build_wwm(ctx, result);

   if (ctx->chip_class >= GFX8) {
      /* Full-wave reduction for wave64: combine the two 32-lane halves,
       * then broadcast the final value from lane 63 to all lanes. */
      if (ctx->wave_size == 64) {
         if (ctx->chip_class >= GFX10)
            swap = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, false));
         else
            swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
         result = ac_build_alu_op(ctx, result, swap, op);
         result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
      }

      return ac_build_wwm(ctx, result);
   } else {
      /* GFX6/7: read the two half-wave results directly from lanes 0 and 32
       * and combine them (result is uniform via readlane). */
      swap = ac_build_readlane(ctx, result, ctx->i32_0);
      result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
      result = ac_build_alu_op(ctx, result, swap, op);
      return ac_build_wwm(ctx, result);
   }
}
4098
4099 /**
4100 * "Top half" of a scan that reduces per-wave values across an entire
4101 * workgroup.
4102 *
4103 * The source value must be present in the highest lane of the wave, and the
4104 * highest lane must be live.
4105 */
ac_build_wg_wavescan_top(struct ac_llvm_context * ctx,struct ac_wg_scan * ws)4106 void ac_build_wg_wavescan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4107 {
4108 if (ws->maxwaves <= 1)
4109 return;
4110
4111 const LLVMValueRef last_lane = LLVMConstInt(ctx->i32, ctx->wave_size - 1, false);
4112 LLVMBuilderRef builder = ctx->builder;
4113 LLVMValueRef tid = ac_get_thread_id(ctx);
4114 LLVMValueRef tmp;
4115
4116 tmp = LLVMBuildICmp(builder, LLVMIntEQ, tid, last_lane, "");
4117 ac_build_ifcc(ctx, tmp, 1000);
4118 LLVMBuildStore(builder, ws->src, LLVMBuildGEP(builder, ws->scratch, &ws->waveidx, 1, ""));
4119 ac_build_endif(ctx, 1000);
4120 }
4121
4122 /**
4123 * "Bottom half" of a scan that reduces per-wave values across an entire
4124 * workgroup.
4125 *
4126 * The caller must place a barrier between the top and bottom halves.
4127 */
void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   /* Scan the per-wave values stored in ws->scratch by the top half and
    * produce the enabled results (reduce/inclusive/exclusive) for this wave.
    */
   const LLVMTypeRef type = LLVMTypeOf(ws->src);
   const LLVMValueRef identity = get_reduction_identity(ctx, ws->op, ac_get_type_size(type));

   /* Single wave: the per-wave value already is the workgroup result. */
   if (ws->maxwaves <= 1) {
      ws->result_reduce = ws->src;
      ws->result_inclusive = ws->src;
      ws->result_exclusive = identity;
      return;
   }
   /* One lane per wave: the scan below must fit into a single wave. */
   assert(ws->maxwaves <= 32);

   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tid = ac_get_thread_id(ctx);
   LLVMBasicBlockRef bbs[2];
   LLVMValueRef phivalues_scan[2];
   LLVMValueRef tmp, tmp2;

   /* Lanes skipping the conditional below contribute an undef phi value. */
   bbs[0] = LLVMGetInsertBlock(builder);
   phivalues_scan[0] = LLVMGetUndef(type);

   /* Pick the smallest set of lanes that must load a scratch slot:
    * all numwaves slots for a reduction, slots 0..waveidx for an inclusive
    * scan, slots strictly below waveidx for an exclusive scan. */
   if (ws->enable_reduce)
      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->numwaves, "");
   else if (ws->enable_inclusive)
      tmp = LLVMBuildICmp(builder, LLVMIntULE, tid, ws->waveidx, "");
   else
      tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, ws->waveidx, "");
   ac_build_ifcc(ctx, tmp, 1001);
   {
      tmp = LLVMBuildLoad(builder, LLVMBuildGEP(builder, ws->scratch, &tid, 1, ""), "");

      ac_build_optimization_barrier(ctx, &tmp);

      /* Inclusive scan of the per-wave values, within this wave. */
      bbs[1] = LLVMGetInsertBlock(builder);
      phivalues_scan[1] = ac_build_scan(ctx, ws->op, tmp, identity, ws->maxwaves, true);
   }
   ac_build_endif(ctx, 1001);

   const LLVMValueRef scan = ac_build_phi(ctx, type, 2, phivalues_scan, bbs);

   /* Reduction: the scan value held by the last valid wave's lane. */
   if (ws->enable_reduce) {
      tmp = LLVMBuildSub(builder, ws->numwaves, ctx->i32_1, "");
      ws->result_reduce = ac_build_readlane(ctx, scan, tmp);
   }
   /* Inclusive: the scan value at this wave's own slot. */
   if (ws->enable_inclusive)
      ws->result_inclusive = ac_build_readlane(ctx, scan, ws->waveidx);
   /* Exclusive: the scan value of the previous wave, or the identity for
    * wave 0 (which has no predecessor; the readlane below would be lane -1). */
   if (ws->enable_exclusive) {
      tmp = LLVMBuildSub(builder, ws->waveidx, ctx->i32_1, "");
      tmp = ac_build_readlane(ctx, scan, tmp);
      tmp2 = LLVMBuildICmp(builder, LLVMIntEQ, ws->waveidx, ctx->i32_0, "");
      ws->result_exclusive = LLVMBuildSelect(builder, tmp2, identity, tmp, "");
   }
}
4182
4183 /**
4184 * Inclusive scan of a per-wave value across an entire workgroup.
4185 *
4186 * This implies an s_barrier instruction.
4187 *
4188 * Unlike ac_build_inclusive_scan, the caller \em must ensure that all threads
4189 * of the workgroup are live. (This requirement cannot easily be relaxed in a
4190 * useful manner because of the barrier in the algorithm.)
4191 */
void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   /* The barrier makes the top half's scratch stores visible to all waves
    * before the bottom half loads and scans them. */
   ac_build_wg_wavescan_top(ctx, ws);
   ac_build_s_barrier(ctx);
   ac_build_wg_wavescan_bottom(ctx, ws);
}
4198
4199 /**
4200 * "Top half" of a scan that reduces per-thread values across an entire
4201 * workgroup.
4202 *
4203 * All lanes must be active when this code runs.
4204 */
ac_build_wg_scan_top(struct ac_llvm_context * ctx,struct ac_wg_scan * ws)4205 void ac_build_wg_scan_top(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4206 {
4207 if (ws->enable_exclusive) {
4208 ws->extra = ac_build_exclusive_scan(ctx, ws->src, ws->op);
4209 if (LLVMTypeOf(ws->src) == ctx->i1 && ws->op == nir_op_iadd)
4210 ws->src = LLVMBuildZExt(ctx->builder, ws->src, ctx->i32, "");
4211 ws->src = ac_build_alu_op(ctx, ws->extra, ws->src, ws->op);
4212 } else {
4213 ws->src = ac_build_inclusive_scan(ctx, ws->src, ws->op);
4214 }
4215
4216 bool enable_inclusive = ws->enable_inclusive;
4217 bool enable_exclusive = ws->enable_exclusive;
4218 ws->enable_inclusive = false;
4219 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4220 ac_build_wg_wavescan_top(ctx, ws);
4221 ws->enable_inclusive = enable_inclusive;
4222 ws->enable_exclusive = enable_exclusive;
4223 }
4224
4225 /**
4226 * "Bottom half" of a scan that reduces per-thread values across an entire
4227 * workgroup.
4228 *
4229 * The caller must place a barrier between the top and bottom halves.
4230 */
ac_build_wg_scan_bottom(struct ac_llvm_context * ctx,struct ac_wg_scan * ws)4231 void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
4232 {
4233 bool enable_inclusive = ws->enable_inclusive;
4234 bool enable_exclusive = ws->enable_exclusive;
4235 ws->enable_inclusive = false;
4236 ws->enable_exclusive = ws->enable_exclusive || enable_inclusive;
4237 ac_build_wg_wavescan_bottom(ctx, ws);
4238 ws->enable_inclusive = enable_inclusive;
4239 ws->enable_exclusive = enable_exclusive;
4240
4241 /* ws->result_reduce is already the correct value */
4242 if (ws->enable_inclusive)
4243 ws->result_inclusive = ac_build_alu_op(ctx, ws->result_inclusive, ws->src, ws->op);
4244 if (ws->enable_exclusive)
4245 ws->result_exclusive = ac_build_alu_op(ctx, ws->result_exclusive, ws->extra, ws->op);
4246 }
4247
4248 /**
4249 * A scan that reduces per-thread values across an entire workgroup.
4250 *
4251 * The caller must ensure that all lanes are active when this code runs
4252 * (WWM is insufficient!), because there is an implied barrier.
4253 */
void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
   /* The barrier between the halves makes each wave's published value
    * visible to every other wave before the cross-wave scan reads it. */
   ac_build_wg_scan_top(ctx, ws);
   ac_build_s_barrier(ctx);
   ac_build_wg_scan_bottom(ctx, ws);
}
4260
LLVMValueRef ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned lane0,
                                   unsigned lane1, unsigned lane2, unsigned lane3)
{
   /* Permute the four lanes of each quad according to the given mapping.
    * GFX8+ has DPP quad_perm; older chips emulate it with an LDS swizzle,
    * where bit 15 selects the quad-permute encoding. */
   const unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);

   return ctx->chip_class >= GFX8 ? ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false)
                                  : ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
}
4271
LLVMValueRef ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
{
   /* Read "src" from the lane selected by "index" via ds_bpermute,
    * widening to i32 for the intrinsic and truncating back afterwards. */
   LLVMBuilderRef b = ctx->builder;
   LLVMTypeRef orig_type = LLVMTypeOf(src);

   /* ds_bpermute addresses lanes in bytes, so scale the lane index by 4. */
   LLVMValueRef byte_index = LLVMBuildMul(b, index, LLVMConstInt(ctx->i32, 4, 0), "");
   LLVMValueRef value = LLVMBuildZExt(b, src, ctx->i32, "");

   LLVMValueRef shuffled =
      ac_build_intrinsic(ctx, "llvm.amdgcn.ds.bpermute", ctx->i32,
                         (LLVMValueRef[]){byte_index, value}, 2,
                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
   return LLVMBuildTrunc(b, shuffled, orig_type, "");
}
4285
LLVMValueRef ac_build_frexp_exp(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   /* Extract the exponent of a float via the amdgcn frexp.exp intrinsic.
    * The result is i16 for 16-bit inputs and i32 for 32/64-bit inputs. */
   const char *intr;
   LLVMTypeRef type;

   switch (bitsize) {
   case 16:
      intr = "llvm.amdgcn.frexp.exp.i16.f16";
      type = ctx->i16;
      break;
   case 32:
      intr = "llvm.amdgcn.frexp.exp.i32.f32";
      type = ctx->i32;
      break;
   default: /* 64-bit */
      intr = "llvm.amdgcn.frexp.exp.i32.f64";
      type = ctx->i32;
      break;
   }

   return ac_build_intrinsic(ctx, intr, type, &src0, 1, AC_FUNC_ATTR_READNONE);
}
LLVMValueRef ac_build_frexp_mant(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   /* Extract the mantissa of a float via the amdgcn frexp.mant intrinsic;
    * the result has the same float type as the input. */
   const char *intr;
   LLVMTypeRef type;

   switch (bitsize) {
   case 16:
      intr = "llvm.amdgcn.frexp.mant.f16";
      type = ctx->f16;
      break;
   case 32:
      intr = "llvm.amdgcn.frexp.mant.f32";
      type = ctx->f32;
      break;
   default: /* 64-bit */
      intr = "llvm.amdgcn.frexp.mant.f64";
      type = ctx->f64;
      break;
   }

   return ac_build_intrinsic(ctx, intr, type, &src0, 1, AC_FUNC_ATTR_READNONE);
}
4328
LLVMValueRef ac_build_canonicalize(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
{
   /* Canonicalize a float (e.g. flush denormals per the current mode)
    * using the width-matched llvm.canonicalize intrinsic. */
   const char *intr;
   LLVMTypeRef type;

   switch (bitsize) {
   case 16:
      intr = "llvm.canonicalize.f16";
      type = ctx->f16;
      break;
   case 32:
      intr = "llvm.canonicalize.f32";
      type = ctx->f32;
      break;
   default: /* 64-bit */
      intr = "llvm.canonicalize.f64";
      type = ctx->f64;
      break;
   }

   return ac_build_intrinsic(ctx, intr, type, &src0, 1, AC_FUNC_ATTR_READNONE);
}
4350
4351 /*
4352 * this takes an I,J coordinate pair,
4353 * and works out the X and Y derivatives.
4354 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
4355 */
/*
 * this takes an I,J coordinate pair,
 * and works out the X and Y derivatives.
 * it returns DDX(I), DDX(J), DDY(I), DDY(J).
 */
LLVMValueRef ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij)
{
   LLVMValueRef result[4];

   for (unsigned chan = 0; chan < 2; chan++) {
      LLVMValueRef coord = LLVMBuildExtractElement(ctx->builder, interp_ij,
                                                   LLVMConstInt(ctx->i32, chan, false), "");
      /* Slot 0/1: X derivative of I/J; slot 2/3: Y derivative of I/J. */
      result[chan] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 1, coord);
      result[chan + 2] = ac_build_ddxy(ctx, AC_TID_MASK_TOP_LEFT, 2, coord);
   }
   return ac_build_gather_values(ctx, result, 4);
}
4368
ac_build_load_helper_invocation(struct ac_llvm_context * ctx)4369 LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx)
4370 {
4371 LLVMValueRef result =
4372 ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);
4373 return LLVMBuildNot(ctx->builder, result, "");
4374 }
4375
ac_build_is_helper_invocation(struct ac_llvm_context * ctx)4376 LLVMValueRef ac_build_is_helper_invocation(struct ac_llvm_context *ctx)
4377 {
4378 if (!ctx->postponed_kill)
4379 return ac_build_load_helper_invocation(ctx);
4380
4381 /* !(exact && postponed) */
4382 LLVMValueRef exact =
4383 ac_build_intrinsic(ctx, "llvm.amdgcn.ps.live", ctx->i1, NULL, 0, AC_FUNC_ATTR_READNONE);
4384
4385 LLVMValueRef postponed = LLVMBuildLoad(ctx->builder, ctx->postponed_kill, "");
4386 return LLVMBuildNot(ctx->builder, LLVMBuildAnd(ctx->builder, exact, postponed, ""), "");
4387 }
4388
LLVMValueRef ac_build_call(struct ac_llvm_context *ctx, LLVMValueRef func, LLVMValueRef *args,
                           unsigned num_args)
{
   /* Emit a call and copy the callee's calling convention onto the call
    * instruction (LLVMBuildCall does not do this by itself). */
   LLVMValueRef call = LLVMBuildCall(ctx->builder, func, args, num_args, "");
   LLVMSetInstructionCallConv(call, LLVMGetFunctionCallConv(func));
   return call;
}
4396
void ac_export_mrt_z(struct ac_llvm_context *ctx, LLVMValueRef depth, LLVMValueRef stencil,
                     LLVMValueRef samplemask, struct ac_export_args *args)
{
   /* Fill *args with an MRTZ export of the given depth/stencil/samplemask
    * values (each may be NULL, but at least one must be set). The channel
    * layout depends on the SPI_SHADER_Z_FORMAT picked for these outputs.
    */
   unsigned mask = 0;
   unsigned format = ac_get_spi_shader_z_format(depth != NULL, stencil != NULL, samplemask != NULL);

   assert(depth || stencil || samplemask);

   memset(args, 0, sizeof(*args));

   args->valid_mask = 1; /* whether the EXEC mask is valid */
   args->done = 1;       /* DONE bit */

   /* Specify the target we are exporting */
   args->target = V_008DFC_SQ_EXP_MRTZ;

   args->compr = 0;                       /* COMP flag */
   args->out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
   args->out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
   args->out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
   args->out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */

   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      /* Compressed 16-bit export: depth is not representable here. */
      assert(!depth);
      args->compr = 1; /* COMPR flag */

      if (stencil) {
         /* Stencil should be in X[23:16]. */
         stencil = ac_to_integer(ctx, stencil);
         stencil = LLVMBuildShl(ctx->builder, stencil, LLVMConstInt(ctx->i32, 16, 0), "");
         args->out[0] = ac_to_float(ctx, stencil);
         /* Compressed exports enable channels in pairs. */
         mask |= 0x3;
      }
      if (samplemask) {
         /* SampleMask should be in Y[15:0]. */
         args->out[1] = samplemask;
         mask |= 0xc;
      }
   } else {
      /* Uncompressed: one output per channel. */
      if (depth) {
         args->out[0] = depth;
         mask |= 0x1;
      }
      if (stencil) {
         args->out[1] = stencil;
         mask |= 0x2;
      }
      if (samplemask) {
         args->out[2] = samplemask;
         mask |= 0x4;
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks
    * at the X writemask component. */
   if (ctx->chip_class == GFX6 && ctx->family != CHIP_OLAND && ctx->family != CHIP_HAINAN)
      mask |= 0x1;

   /* Specify which components to enable */
   args->enabled_channels = mask;
}
4458
4459 /* Send GS Alloc Req message from the first wave of the group to SPI.
4460 * Message payload is:
4461 * - bits 0..10: vertices in group
4462 * - bits 12..22: primitives in group
4463 */
void ac_build_sendmsg_gs_alloc_req(struct ac_llvm_context *ctx, LLVMValueRef wave_id,
                                   LLVMValueRef vtx_cnt, LLVMValueRef prim_cnt)
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef tmp;
   bool export_dummy_prim = false;

   /* HW workaround for a GPU hang with 100% culling.
    * We always have to export at least 1 primitive.
    * Export a degenerate triangle using vertex 0 for all 3 vertices.
    *
    * Note: this triggers only when prim_cnt is literally the constant
    * ctx->i32_0 (pointer comparison), i.e. a compile-time all-culled case,
    * and only on GFX10.
    */
   if (prim_cnt == ctx->i32_0 && ctx->chip_class == GFX10) {
      assert(vtx_cnt == ctx->i32_0);
      prim_cnt = ctx->i32_1;
      vtx_cnt = ctx->i32_1;
      export_dummy_prim = true;
   }

   /* Only wave 0 sends the message on behalf of the whole group. */
   ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, wave_id, ctx->i32_0, ""), 5020);

   /* Pack the payload: vertices in bits 0..10, primitives in bits 12..22. */
   tmp = LLVMBuildShl(builder, prim_cnt, LLVMConstInt(ctx->i32, 12, false), "");
   tmp = LLVMBuildOr(builder, tmp, vtx_cnt, "");
   ac_build_sendmsg(ctx, AC_SENDMSG_GS_ALLOC_REQ, tmp);

   if (export_dummy_prim) {
      struct ac_ngg_prim prim = {0};
      /* The vertex indices are 0,0,0. */
      prim.passthrough = ctx->i32_0;

      /* Export a zeroed position so the dummy primitive is well-defined. */
      struct ac_export_args pos = {0};
      pos.out[0] = pos.out[1] = pos.out[2] = pos.out[3] = ctx->f32_0;
      pos.target = V_008DFC_SQ_EXP_POS;
      pos.enabled_channels = 0xf;
      pos.done = true;

      /* Only lane 0 of wave 0 performs the dummy exports. */
      ac_build_ifcc(ctx, LLVMBuildICmp(builder, LLVMIntEQ, ac_get_thread_id(ctx), ctx->i32_0, ""),
                    5021);
      ac_build_export_prim(ctx, &prim);
      ac_build_export(ctx, &pos);
      ac_build_endif(ctx, 5021);
   }

   ac_build_endif(ctx, 5020);
}
4508
ac_pack_prim_export(struct ac_llvm_context * ctx,const struct ac_ngg_prim * prim)4509 LLVMValueRef ac_pack_prim_export(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4510 {
4511 /* The prim export format is:
4512 * - bits 0..8: index 0
4513 * - bit 9: edge flag 0
4514 * - bits 10..18: index 1
4515 * - bit 19: edge flag 1
4516 * - bits 20..28: index 2
4517 * - bit 29: edge flag 2
4518 * - bit 31: null primitive (skip)
4519 */
4520 LLVMBuilderRef builder = ctx->builder;
4521 LLVMValueRef tmp = LLVMBuildZExt(builder, prim->isnull, ctx->i32, "");
4522 LLVMValueRef result = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 31, false), "");
4523
4524 for (unsigned i = 0; i < prim->num_vertices; ++i) {
4525 tmp = LLVMBuildShl(builder, prim->index[i], LLVMConstInt(ctx->i32, 10 * i, false), "");
4526 result = LLVMBuildOr(builder, result, tmp, "");
4527 tmp = LLVMBuildZExt(builder, prim->edgeflag[i], ctx->i32, "");
4528 tmp = LLVMBuildShl(builder, tmp, LLVMConstInt(ctx->i32, 10 * i + 9, false), "");
4529 result = LLVMBuildOr(builder, result, tmp, "");
4530 }
4531 return result;
4532 }
4533
ac_build_export_prim(struct ac_llvm_context * ctx,const struct ac_ngg_prim * prim)4534 void ac_build_export_prim(struct ac_llvm_context *ctx, const struct ac_ngg_prim *prim)
4535 {
4536 struct ac_export_args args;
4537
4538 if (prim->passthrough) {
4539 args.out[0] = prim->passthrough;
4540 } else {
4541 args.out[0] = ac_pack_prim_export(ctx, prim);
4542 }
4543
4544 args.out[0] = LLVMBuildBitCast(ctx->builder, args.out[0], ctx->f32, "");
4545 args.out[1] = LLVMGetUndef(ctx->f32);
4546 args.out[2] = LLVMGetUndef(ctx->f32);
4547 args.out[3] = LLVMGetUndef(ctx->f32);
4548
4549 args.target = V_008DFC_SQ_EXP_PRIM;
4550 args.enabled_channels = 1;
4551 args.done = true;
4552 args.valid_mask = false;
4553 args.compr = false;
4554
4555 ac_build_export(ctx, &args);
4556 }
4557
arg_llvm_type(enum ac_arg_type type,unsigned size,struct ac_llvm_context * ctx)4558 static LLVMTypeRef arg_llvm_type(enum ac_arg_type type, unsigned size, struct ac_llvm_context *ctx)
4559 {
4560 if (type == AC_ARG_FLOAT) {
4561 return size == 1 ? ctx->f32 : LLVMVectorType(ctx->f32, size);
4562 } else if (type == AC_ARG_INT) {
4563 return size == 1 ? ctx->i32 : LLVMVectorType(ctx->i32, size);
4564 } else {
4565 LLVMTypeRef ptr_type;
4566 switch (type) {
4567 case AC_ARG_CONST_PTR:
4568 ptr_type = ctx->i8;
4569 break;
4570 case AC_ARG_CONST_FLOAT_PTR:
4571 ptr_type = ctx->f32;
4572 break;
4573 case AC_ARG_CONST_PTR_PTR:
4574 ptr_type = ac_array_in_const32_addr_space(ctx->i8);
4575 break;
4576 case AC_ARG_CONST_DESC_PTR:
4577 ptr_type = ctx->v4i32;
4578 break;
4579 case AC_ARG_CONST_IMAGE_PTR:
4580 ptr_type = ctx->v8i32;
4581 break;
4582 default:
4583 unreachable("unknown arg type");
4584 }
4585 if (size == 1) {
4586 return ac_array_in_const32_addr_space(ptr_type);
4587 } else {
4588 assert(size == 2);
4589 return ac_array_in_const_addr_space(ptr_type);
4590 }
4591 }
4592 }
4593
LLVMValueRef ac_build_main(const struct ac_shader_args *args, struct ac_llvm_context *ctx,
                           enum ac_llvm_calling_convention convention, const char *name,
                           LLVMTypeRef ret_type, LLVMModuleRef module)
{
   /* Create the shader's main LLVM function from the ABI description in
    * "args", append a "main_body" block, position ctx->builder in it, and
    * return the function. SGPR arguments are tagged inreg; pointer SGPRs
    * additionally get noalias/dereferenceable/alignment attributes.
    */
   LLVMTypeRef arg_types[AC_MAX_ARGS];

   for (unsigned i = 0; i < args->arg_count; i++) {
      arg_types[i] = arg_llvm_type(args->args[i].type, args->args[i].size, ctx);
   }

   LLVMTypeRef main_function_type = LLVMFunctionType(ret_type, arg_types, args->arg_count, 0);

   LLVMValueRef main_function = LLVMAddFunction(module, name, main_function_type);
   LLVMBasicBlockRef main_function_body =
      LLVMAppendBasicBlockInContext(ctx->context, main_function, "main_body");
   LLVMPositionBuilderAtEnd(ctx->builder, main_function_body);

   LLVMSetFunctionCallConv(main_function, convention);
   for (unsigned i = 0; i < args->arg_count; ++i) {
      LLVMValueRef P = LLVMGetParam(main_function, i);

      /* Only uniform (SGPR) arguments get attributes. */
      if (args->args[i].file != AC_ARG_SGPR)
         continue;

      /* Attribute index is 1-based (0 would be the return value). */
      ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_INREG);

      if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
         ac_add_function_attr(ctx->context, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
         ac_add_attr_dereferenceable(P, UINT64_MAX);
         ac_add_attr_alignment(P, 32);
      }
   }

   ctx->main_function = main_function;

   if (LLVM_VERSION_MAJOR >= 11) {
      /* Enable denormals for FP16 and FP64: */
      LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math", "ieee,ieee");
      /* Disable denormals for FP32: */
      LLVMAddTargetDependentFunctionAttr(main_function, "denormal-fp-math-f32",
                                         "preserve-sign,preserve-sign");
   }
   return main_function;
}
4638
ac_build_s_endpgm(struct ac_llvm_context * ctx)4639 void ac_build_s_endpgm(struct ac_llvm_context *ctx)
4640 {
4641 LLVMTypeRef calltype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
4642 LLVMValueRef code = LLVMConstInlineAsm(calltype, "s_endpgm", "", true, false);
4643 LLVMBuildCall(ctx->builder, code, NULL, 0, "");
4644 }
4645
LLVMValueRef ac_prefix_bitcount(struct ac_llvm_context *ctx, LLVMValueRef mask, LLVMValueRef index)
{
   /* Count the bits of "mask" set at positions strictly below "index". */
   LLVMBuilderRef b = ctx->builder;
   LLVMTypeRef mask_type = LLVMTypeOf(mask);

   /* (1 << index) - 1 selects exactly the bits below "index". */
   LLVMValueRef bit =
      LLVMBuildShl(b, LLVMConstInt(mask_type, 1, 0), LLVMBuildZExt(b, index, mask_type, ""), "");
   LLVMValueRef below = LLVMBuildSub(b, bit, LLVMConstInt(mask_type, 1, 0), "");
   return ac_build_bit_count(ctx, LLVMBuildAnd(b, mask, below, ""));
}
4657
4658 /* Compute the prefix sum of the "mask" bit array with 128 elements (bits). */
LLVMValueRef ac_prefix_bitcount_2x64(struct ac_llvm_context *ctx, LLVMValueRef mask[2],
                                     LLVMValueRef index)
{
   LLVMBuilderRef builder = ctx->builder;
#if 0
   /* Reference version using i128. */
   LLVMValueRef input_mask =
      LLVMBuildBitCast(builder, ac_build_gather_values(ctx, mask, 2), ctx->i128, "");

   return ac_prefix_bitcount(ctx, input_mask, index);
#else
   /* Optimized version using 2 64-bit masks. */
   LLVMValueRef is_hi, is_0, c64, c128, all_bits;
   LLVMValueRef prefix_mask[2], shift[2], mask_bcnt0, prefix_bcnt[2];

   /* Compute the 128-bit prefix mask. */
   c64 = LLVMConstInt(ctx->i32, 64, 0);
   c128 = LLVMConstInt(ctx->i32, 128, 0);
   all_bits = LLVMConstInt(ctx->i64, UINT64_MAX, 0);
   /* The first index that can have non-zero high bits in the prefix mask is 65. */
   is_hi = LLVMBuildICmp(builder, LLVMIntUGT, index, c64, "");
   is_0 = LLVMBuildICmp(builder, LLVMIntEQ, index, ctx->i32_0, "");
   mask_bcnt0 = ac_build_bit_count(ctx, mask[0]);

   /* Each half's prefix mask is all_bits >> (64|128 - index), masked with
    * the corresponding input half. */
   for (unsigned i = 0; i < 2; i++) {
      shift[i] = LLVMBuildSub(builder, i ? c128 : c64, index, "");
      /* For i==0, index==0, the right shift by 64 doesn't give the desired result,
       * so we handle it by the is_0 select.
       * For i==1, index==64, same story, so we handle it by the last is_hi select.
       * For i==0, index==64, we shift by 0, which is what we want.
       */
      prefix_mask[i] =
         LLVMBuildLShr(builder, all_bits, LLVMBuildZExt(builder, shift[i], ctx->i64, ""), "");
      prefix_mask[i] = LLVMBuildAnd(builder, mask[i], prefix_mask[i], "");
      prefix_bcnt[i] = ac_build_bit_count(ctx, prefix_mask[i]);
   }

   /* Patch up the edge cases described above, then sum the two halves:
    * index == 0 -> low count is 0; index > 64 -> low count is popcount of
    * the whole low mask; index <= 64 -> high count is 0. */
   prefix_bcnt[0] = LLVMBuildSelect(builder, is_0, ctx->i32_0, prefix_bcnt[0], "");
   prefix_bcnt[0] = LLVMBuildSelect(builder, is_hi, mask_bcnt0, prefix_bcnt[0], "");
   prefix_bcnt[1] = LLVMBuildSelect(builder, is_hi, prefix_bcnt[1], ctx->i32_0, "");

   return LLVMBuildAdd(builder, prefix_bcnt[0], prefix_bcnt[1], "");
#endif
}
4703
4704 /**
4705 * Convert triangle strip indices to triangle indices. This is used to decompose
4706 * triangle strips into triangles.
4707 */
void ac_build_triangle_strip_indices_to_triangle(struct ac_llvm_context *ctx, LLVMValueRef is_odd,
                                                 LLVMValueRef flatshade_first,
                                                 LLVMValueRef index[3])
{
   LLVMBuilderRef builder = ctx->builder;
   LLVMValueRef out[3];

   /* We need to change the vertex order for odd triangles to get correct
    * front/back facing by swapping 2 vertex indices, but we also have to
    * keep the provoking vertex in the same place.
    *
    * If the first vertex is provoking, swap index 1 and 2.
    * If the last vertex is provoking, swap index 0 and 1.
    */
   /* Each output slot uses nested selects: the outer select picks the
    * provoking-vertex convention (flatshade_first), the inner one applies
    * the odd-triangle swap. Even triangles pass through unchanged. */
   out[0] = LLVMBuildSelect(builder, flatshade_first, index[0],
                            LLVMBuildSelect(builder, is_odd, index[1], index[0], ""), "");
   out[1] = LLVMBuildSelect(builder, flatshade_first,
                            LLVMBuildSelect(builder, is_odd, index[2], index[1], ""),
                            LLVMBuildSelect(builder, is_odd, index[0], index[1], ""), "");
   out[2] = LLVMBuildSelect(builder, flatshade_first,
                            LLVMBuildSelect(builder, is_odd, index[1], index[2], ""), index[2], "");
   /* Write back in place only after all three outputs are computed, since
    * each output reads several of the original indices. */
   memcpy(index, out, sizeof(out));
}
4731