/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Helper functions for logical operations.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include <llvm/Config/llvm-config.h>

#include "util/u_cpu_detect.h"
#include "util/u_memory.h"
#include "util/u_debug.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_debug.h"
#include "lp_bld_logic.h"

/*
 * XXX
 *
 * Selection with a vector condition, such as
 *
 *    select <4 x i1> %C, %A, %B
 *
 * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only
 * supported on some backends (x86) starting with LLVM 3.1.
 *
 * Expanding the boolean vector to full SIMD register width, as in
 *
 *    sext <4 x i1> %C to <4 x i32>
 *
 * is valid and supported (e.g. llvm/test/CodeGen/X86/vec_compare.ll), but
 * it causes assertion failures in LLVM 2.6. It appears to work correctly
 * on LLVM 2.7.
 */
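
/*
 * For reference, a sketch of the IR shape that lp_build_compare_ext()
 * below emits for a <4 x float> ordered less-than (value names are
 * illustrative only):
 *
 *    %c = fcmp olt <4 x float> %a, %b
 *    %m = sext <4 x i1> %c to <4 x i32>
 *
 * i.e. the i1 comparison result is sign-extended so every lane of the
 * result is either 0 (false) or ~0 (true), matching the bitwise-mask
 * convention used throughout this file.
 */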


/**
 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
 * \param func one of PIPE_FUNC_x
 * If the ordered argument is true the function will use LLVM's ordered
 * comparisons, otherwise unordered comparisons will be used.
 * The result values will be 0 for false or ~0 for true.
 */
static LLVMValueRef
lp_build_compare_ext(struct gallivm_state *gallivm,
                     const struct lp_type type,
                     unsigned func,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     boolean ordered)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (func == PIPE_FUNC_NEVER)
      return zeros;
   if (func == PIPE_FUNC_ALWAYS)
      return ones;

   assert(func > PIPE_FUNC_NEVER);
   assert(func < PIPE_FUNC_ALWAYS);

   if (type.floating) {
      LLVMRealPredicate op;
      switch (func) {
      case PIPE_FUNC_EQUAL:
         op = ordered ? LLVMRealOEQ : LLVMRealUEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = ordered ? LLVMRealONE : LLVMRealUNE;
         break;
      case PIPE_FUNC_LESS:
         op = ordered ? LLVMRealOLT : LLVMRealULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = ordered ? LLVMRealOLE : LLVMRealULE;
         break;
      case PIPE_FUNC_GREATER:
         op = ordered ? LLVMRealOGT : LLVMRealUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = ordered ? LLVMRealOGE : LLVMRealUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

      cond = LLVMBuildFCmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
   }
   else {
      LLVMIntPredicate op;
      switch (func) {
      case PIPE_FUNC_EQUAL:
         op = LLVMIntEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = LLVMIntNE;
         break;
      case PIPE_FUNC_LESS:
         op = type.sign ? LLVMIntSLT : LLVMIntULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = type.sign ? LLVMIntSLE : LLVMIntULE;
         break;
      case PIPE_FUNC_GREATER:
         op = type.sign ? LLVMIntSGT : LLVMIntUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = type.sign ? LLVMIntSGE : LLVMIntUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

      cond = LLVMBuildICmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
   }

   return res;
}

/**
 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
 * \param func one of PIPE_FUNC_x
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_compare(struct gallivm_state *gallivm,
                 const struct lp_type type,
                 unsigned func,
                 LLVMValueRef a,
                 LLVMValueRef b)
{
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (func == PIPE_FUNC_NEVER)
      return zeros;
   if (func == PIPE_FUNC_ALWAYS)
      return ones;

   assert(func > PIPE_FUNC_NEVER);
   assert(func < PIPE_FUNC_ALWAYS);

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * There are no unsigned integer comparison instructions in SSE.
    */

   if (!type.floating && !type.sign &&
       type.width * type.length == 128 &&
       util_get_cpu_caps()->has_sse2 &&
       (func == PIPE_FUNC_LESS ||
        func == PIPE_FUNC_LEQUAL ||
        func == PIPE_FUNC_GREATER ||
        func == PIPE_FUNC_GEQUAL) &&
       (gallivm_debug & GALLIVM_DEBUG_PERF)) {
      debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n",
                   __FUNCTION__, type.length, type.width);
   }
#endif

   return lp_build_compare_ext(gallivm, type, func, a, b, FALSE);
}
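
/*
 * Usage sketch (hypothetical, not part of this file): building a
 * less-than mask for a vector type; "gallivm", "fp_type", "x" and "y"
 * are assumed to be set up by the caller.
 *
 *    LLVMValueRef lt_mask =
 *       lp_build_compare(gallivm, fp_type, PIPE_FUNC_LESS, x, y);
 *
 * Each lane of lt_mask is ~0 where x < y and 0 elsewhere, so it can be
 * passed directly to lp_build_select() or lp_build_select_bitwise().
 */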

/**
 * Build code to compare two values 'a' and 'b' using the given func.
 * \param func one of PIPE_FUNC_x
 * If the operands are floating point numbers, the function will use
 * ordered comparison, which means it returns true only if both
 * operands are not NaN and the specified condition evaluates to true.
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_cmp_ordered(struct lp_build_context *bld,
                     unsigned func,
                     LLVMValueRef a,
                     LLVMValueRef b)
{
   return lp_build_compare_ext(bld->gallivm, bld->type, func, a, b, TRUE);
}

/**
 * Build code to compare two values 'a' and 'b' using the given func.
 * \param func one of PIPE_FUNC_x
 * If the operands are floating point numbers, the function will use
 * unordered comparison, which means it returns true if either
 * operand is NaN or the specified condition evaluates to true.
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_cmp(struct lp_build_context *bld,
             unsigned func,
             LLVMValueRef a,
             LLVMValueRef b)
{
   return lp_build_compare(bld->gallivm, bld->type, func, a, b);
}
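
/*
 * Illustration of the ordered/unordered distinction (hypothetical
 * sketch; "bld", "x" and "y" are assumed to be set up by the caller):
 *
 *    // ~0 only in lanes where both inputs are non-NaN and x < y:
 *    LLVMValueRef o = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, x, y);
 *    // ~0 in lanes where either input is NaN, or x < y:
 *    LLVMValueRef u = lp_build_cmp(bld, PIPE_FUNC_LESS, x, y);
 *
 * In lanes where neither input is NaN the two results agree.
 */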


/**
 * Return (mask & a) | (~mask & b);
 */
LLVMValueRef
lp_build_select_bitwise(struct lp_build_context *bld,
                        LLVMValueRef mask,
                        LLVMValueRef a,
                        LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_type type = bld->type;
   LLVMValueRef res;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == b) {
      return a;
   }

   if (type.floating) {
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      b = LLVMBuildBitCast(builder, b, int_vec_type, "");
   }

   if (type.width > 32)
      mask = LLVMBuildSExt(builder, mask, int_vec_type, "");
   a = LLVMBuildAnd(builder, a, mask, "");

   /* This often gets translated to PANDN, but sometimes the NOT is
    * pre-computed and stored in another constant. The best strategy depends
    * on available registers, so it is not a big deal -- hopefully LLVM makes
    * the right decision given the rest of the program.
    */
   b = LLVMBuildAnd(builder, b, LLVMBuildNot(builder, mask, ""), "");

   res = LLVMBuildOr(builder, a, b, "");

   if (type.floating) {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }

   return res;
}
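
/*
 * Worked example of the bitwise select (values illustrative only): with
 * 32-bit lanes, mask = {~0, 0}, a = {1.0f, 2.0f} and b = {3.0f, 4.0f},
 * the result bit pattern is
 *
 *    (mask & a) | (~mask & b) = {bits(1.0f), 0} | {0, bits(4.0f)}
 *                             = {1.0f, 4.0f}
 *
 * This is why mask lanes must be exactly 0 or ~0: any other bit pattern
 * would splice together bits from both operands.
 */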


/**
 * Return mask ? a : b;
 *
 * mask is a bitwise mask, composed of 0 or ~0 for each element. Any other
 * value will yield unpredictable results.
 */
LLVMValueRef
lp_build_select(struct lp_build_context *bld,
                LLVMValueRef mask,
                LLVMValueRef a,
                LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMContextRef lc = bld->gallivm->context;
   struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == b)
      return a;

   if (type.length == 1) {
      mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), "");
      res = LLVMBuildSelect(builder, mask, a, b, "");
   }
   else if (LLVMIsConstant(mask) ||
            LLVMGetInstructionOpcode(mask) == LLVMSExt) {
      /* Generate a vector select.
       *
       * Using vector selects should avoid emitting intrinsics and hence
       * avoid hindering optimization passes, but vector selects weren't
       * properly supported for a long time, and LLVM will generate poor
       * code when the mask is not the result of a comparison.
       * XXX: Even if the instruction was an SExt, this may still produce
       * terrible code. Try piglit stencil-twoside.
       */

      /* Convert the mask to a vector of booleans.
       *
       * XXX: On x86 the mask is controlled by the MSB, so if we shifted
       * the mask right by `type.width - 1`, LLVM should realize the mask
       * is ready. Alas, what really happens is that LLVM emits two shifts
       * back to back.
       */
      if (0) {
         LLVMValueRef shift = LLVMConstInt(bld->int_elem_type, bld->type.width - 1, 0);
         shift = lp_build_broadcast(bld->gallivm, bld->int_vec_type, shift);
         mask = LLVMBuildLShr(builder, mask, shift, "");
      }
      LLVMTypeRef bool_vec_type = LLVMVectorType(LLVMInt1TypeInContext(lc), type.length);
      mask = LLVMBuildTrunc(builder, mask, bool_vec_type, "");

      res = LLVMBuildSelect(builder, mask, a, b, "");
   }
   else if (((util_get_cpu_caps()->has_sse4_1 &&
              type.width * type.length == 128) ||
             (util_get_cpu_caps()->has_avx &&
              type.width * type.length == 256 && type.width >= 32) ||
             (util_get_cpu_caps()->has_avx2 &&
              type.width * type.length == 256)) &&
            !LLVMIsConstant(a) &&
            !LLVMIsConstant(b) &&
            !LLVMIsConstant(mask)) {
      const char *intrinsic;
      LLVMTypeRef arg_type;
      LLVMValueRef args[3];

      LLVMTypeRef mask_type = LLVMGetElementType(LLVMTypeOf(mask));
      if (LLVMGetIntTypeWidth(mask_type) != type.width) {
         LLVMTypeRef int_vec_type = LLVMVectorType(LLVMIntTypeInContext(lc, type.width), type.length);
         mask = LLVMBuildSExt(builder, mask, int_vec_type, "");
      }
      /*
       * There is only a float blend in AVX, but we can simply cast
       * i32/i64 to float.
       */
      if (type.width * type.length == 256) {
         if (type.width == 64) {
            intrinsic = "llvm.x86.avx.blendv.pd.256";
            arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4);
         }
         else if (type.width == 32) {
            intrinsic = "llvm.x86.avx.blendv.ps.256";
            arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
         } else {
            assert(util_get_cpu_caps()->has_avx2);
            intrinsic = "llvm.x86.avx2.pblendvb";
            arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 32);
         }
      }
      else if (type.floating &&
               type.width == 64) {
         intrinsic = "llvm.x86.sse41.blendvpd";
         arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2);
      } else if (type.floating &&
                 type.width == 32) {
         intrinsic = "llvm.x86.sse41.blendvps";
         arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 4);
      } else {
         intrinsic = "llvm.x86.sse41.pblendvb";
         arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 16);
      }

      if (arg_type != bld->int_vec_type) {
         mask = LLVMBuildBitCast(builder, mask, arg_type, "");
      }

      if (arg_type != bld->vec_type) {
         a = LLVMBuildBitCast(builder, a, arg_type, "");
         b = LLVMBuildBitCast(builder, b, arg_type, "");
      }

      args[0] = b;
      args[1] = a;
      args[2] = mask;

      res = lp_build_intrinsic(builder, intrinsic,
                               arg_type, args, ARRAY_SIZE(args), 0);

      if (arg_type != bld->vec_type) {
         res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
      }
   }
   else {
      res = lp_build_select_bitwise(bld, mask, a, b);
   }

   return res;
}
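
/*
 * Usage sketch (hypothetical; "bld", "x" and "y" are assumed to be set
 * up by the caller): a branchless per-lane minimum built from the
 * helpers above.
 *
 *    LLVMValueRef m = lp_build_cmp(bld, PIPE_FUNC_LESS, x, y);
 *    LLVMValueRef minval = lp_build_select(bld, m, x, y);
 *
 * Since m comes straight from a comparison (a SExt of an i1 vector),
 * lp_build_select() should take the vector-select path above rather
 * than falling back to a blend intrinsic or bitwise arithmetic.
 */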


/**
 * Return mask ? a : b;
 *
 * mask is a TGSI_WRITEMASK_xxx.
 */
LLVMValueRef
lp_build_select_aos(struct lp_build_context *bld,
                    unsigned mask,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    unsigned num_channels)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const unsigned n = type.length;
   unsigned i, j;

   assert((mask & ~0xf) == 0);
   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == b)
      return a;
   if ((mask & 0xf) == 0xf)
      return a;
   if ((mask & 0xf) == 0x0)
      return b;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   /*
    * There are two major ways of accomplishing this:
    * - with a shuffle
    * - with a select
    *
    * The cross-over point between them is empirical and might need to be
    * adjusted.
    */
   if (n <= 4) {
      /*
       * Shuffle.
       */
      LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

      for (j = 0; j < n; j += num_channels)
         for (i = 0; i < num_channels; ++i)
            shuffles[j + i] = LLVMConstInt(elem_type,
                                           (mask & (1 << i) ? 0 : n) + j + i,
                                           0);

      return LLVMBuildShuffleVector(builder, a, b, LLVMConstVector(shuffles, n), "");
   }
   else {
      LLVMValueRef mask_vec = lp_build_const_mask_aos(bld->gallivm, type, mask, num_channels);
      return lp_build_select(bld, mask_vec, a, b);
   }
}
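
/*
 * Worked example of the shuffle path above (illustrative): with n = 4,
 * num_channels = 4 and mask = 0x5 (TGSI_WRITEMASK_XZ), the shuffle
 * indices come out as {0, 5, 2, 7} -- lanes 0 and 2 are taken from 'a'
 * (indices 0..n-1) and lanes 1 and 3 from 'b' (indices n..2n-1).
 */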


/**
 * Return (scalar-cast)val ? true : false;
 */
LLVMValueRef
lp_build_any_true_range(struct lp_build_context *bld,
                        unsigned real_length,
                        LLVMValueRef val)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef scalar_type;
   LLVMTypeRef true_type;

   assert(real_length <= bld->type.length);

   true_type = LLVMIntTypeInContext(bld->gallivm->context,
                                    bld->type.width * real_length);
   scalar_type = LLVMIntTypeInContext(bld->gallivm->context,
                                      bld->type.width * bld->type.length);
   val = LLVMBuildBitCast(builder, val, scalar_type, "");
   /*
    * We are always using native types here, so we can use intrinsics.
    * However, since we don't do per-element calculations, we must ensure
    * the excess elements aren't used, as they may contain garbage.
    */
   if (real_length < bld->type.length) {
      val = LLVMBuildTrunc(builder, val, true_type, "");
   }
   return LLVMBuildICmp(builder, LLVMIntNE,
                        val, LLVMConstNull(true_type), "");
}
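
/*
 * Usage sketch (hypothetical; "bld" and "mask" are assumed to be set up
 * by the caller, with only the first 4 lanes meaningful):
 *
 *    LLVMValueRef any = lp_build_any_true_range(bld, 4, mask);
 *
 * "any" is a scalar i1 that is true if any bit in the first 4 lanes of
 * "mask" is set (assuming the usual little-endian lane layout of the
 * vector-to-integer bitcast), suitable for a conditional branch.
 */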
517