1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include <llvm/Config/llvm-config.h>
51
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
56
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
67
68 #if defined(PIPE_ARCH_SSE)
69 #include <xmmintrin.h>
70 #endif
71
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #endif
75
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #endif
79
80 #define EXP_POLY_DEGREE 5
81
82 #define LOG_POLY_DEGREE 4
83
84
85 /**
86 * Generate min(a, b)
87 * No checks for special case values of a or b = 1 or 0 are done.
88 * NaN's are handled according to the behavior specified by the
89 * nan_behavior argument.
90 */
91 static LLVMValueRef
92 lp_build_min_simple(struct lp_build_context *bld,
93 LLVMValueRef a,
94 LLVMValueRef b,
95 enum gallivm_nan_behavior nan_behavior)
96 {
97 const struct lp_type type = bld->type;
98 const char *intrinsic = NULL;
99 unsigned intr_size = 0;
100 LLVMValueRef cond;
101
102 assert(lp_check_value(type, a));
103 assert(lp_check_value(type, b));
104
105 /* TODO: optimize the constant case */
106
107 if (type.floating && util_get_cpu_caps()->has_sse) {
108 if (type.width == 32) {
109 if (type.length == 1) {
110 intrinsic = "llvm.x86.sse.min.ss";
111 intr_size = 128;
112 }
113 else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
114 intrinsic = "llvm.x86.sse.min.ps";
115 intr_size = 128;
116 }
117 else {
118 intrinsic = "llvm.x86.avx.min.ps.256";
119 intr_size = 256;
120 }
121 }
122 if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
123 if (type.length == 1) {
124 intrinsic = "llvm.x86.sse2.min.sd";
125 intr_size = 128;
126 }
127 else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
128 intrinsic = "llvm.x86.sse2.min.pd";
129 intr_size = 128;
130 }
131 else {
132 intrinsic = "llvm.x86.avx.min.pd.256";
133 intr_size = 256;
134 }
135 }
136 }
137 else if (type.floating && util_get_cpu_caps()->has_altivec) {
138 if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140 __FUNCTION__);
141 }
142 if (type.width == 32 && type.length == 4) {
143 intrinsic = "llvm.ppc.altivec.vminfp";
144 intr_size = 128;
145 }
146 } else if (util_get_cpu_caps()->has_altivec) {
147 intr_size = 128;
148 if (type.width == 8) {
149 if (!type.sign) {
150 intrinsic = "llvm.ppc.altivec.vminub";
151 } else {
152 intrinsic = "llvm.ppc.altivec.vminsb";
153 }
154 } else if (type.width == 16) {
155 if (!type.sign) {
156 intrinsic = "llvm.ppc.altivec.vminuh";
157 } else {
158 intrinsic = "llvm.ppc.altivec.vminsh";
159 }
160 } else if (type.width == 32) {
161 if (!type.sign) {
162 intrinsic = "llvm.ppc.altivec.vminuw";
163 } else {
164 intrinsic = "llvm.ppc.altivec.vminsw";
165 }
166 }
167 }
168
169 if (intrinsic) {
170 /* We need to handle nan's for floating point numbers. If one of the
171 * inputs is nan the other should be returned (required by both D3D10+
172 * and OpenCL).
173 * The SSE intrinsics return the second operand in case of NaN by
174 * default, so we need special code to handle those.
175 */
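/* Illustration of the above (sample values chosen arbitrarily):
 * minps(3.0, NaN) returns its second operand, i.e. NaN, whereas
 * minps(NaN, 3.0) already returns 3.0 as desired.  So only the
 * "b is NaN" case needs the explicit select back to a below. */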
176 if (util_get_cpu_caps()->has_sse && type.floating &&
177 nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
178 LLVMValueRef isnan, min;
179 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
180 type,
181 intr_size, a, b);
182 isnan = lp_build_isnan(bld, b);
183 return lp_build_select(bld, isnan, a, min);
184 } else {
185 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
186 type,
187 intr_size, a, b);
188 }
189 }
190
191 if (type.floating) {
192 switch (nan_behavior) {
193 case GALLIVM_NAN_RETURN_OTHER: {
194 LLVMValueRef isnan = lp_build_isnan(bld, a);
195 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
196 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
197 return lp_build_select(bld, cond, a, b);
198 }
199 break;
200 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
201 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
202 return lp_build_select(bld, cond, a, b);
203 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
204 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
205 return lp_build_select(bld, cond, b, a);
206 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
207 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
208 return lp_build_select(bld, cond, a, b);
209 break;
210 default:
211 assert(0);
212 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
213 return lp_build_select(bld, cond, a, b);
214 }
215 } else {
216 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
217 return lp_build_select(bld, cond, a, b);
218 }
219 }
220
221
222 LLVMValueRef
223 lp_build_fmuladd(LLVMBuilderRef builder,
224 LLVMValueRef a,
225 LLVMValueRef b,
226 LLVMValueRef c)
227 {
228 LLVMTypeRef type = LLVMTypeOf(a);
229 assert(type == LLVMTypeOf(b));
230 assert(type == LLVMTypeOf(c));
231
232 char intrinsic[32];
233 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
234 LLVMValueRef args[] = { a, b, c };
235 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
236 }
237
238
239 /**
240 * Generate max(a, b)
241 * No checks for special case values of a or b = 1 or 0 are done.
242 * NaN's are handled according to the behavior specified by the
243 * nan_behavior argument.
244 */
245 static LLVMValueRef
246 lp_build_max_simple(struct lp_build_context *bld,
247 LLVMValueRef a,
248 LLVMValueRef b,
249 enum gallivm_nan_behavior nan_behavior)
250 {
251 const struct lp_type type = bld->type;
252 const char *intrinsic = NULL;
253 unsigned intr_size = 0;
254 LLVMValueRef cond;
255
256 assert(lp_check_value(type, a));
257 assert(lp_check_value(type, b));
258
259 /* TODO: optimize the constant case */
260
261 if (type.floating && util_get_cpu_caps()->has_sse) {
262 if (type.width == 32) {
263 if (type.length == 1) {
264 intrinsic = "llvm.x86.sse.max.ss";
265 intr_size = 128;
266 }
267 else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
268 intrinsic = "llvm.x86.sse.max.ps";
269 intr_size = 128;
270 }
271 else {
272 intrinsic = "llvm.x86.avx.max.ps.256";
273 intr_size = 256;
274 }
275 }
276 if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
277 if (type.length == 1) {
278 intrinsic = "llvm.x86.sse2.max.sd";
279 intr_size = 128;
280 }
281 else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
282 intrinsic = "llvm.x86.sse2.max.pd";
283 intr_size = 128;
284 }
285 else {
286 intrinsic = "llvm.x86.avx.max.pd.256";
287 intr_size = 256;
288 }
289 }
290 }
291 else if (type.floating && util_get_cpu_caps()->has_altivec) {
292 if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
293 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
294 __FUNCTION__);
295 }
296 if (type.width == 32 || type.length == 4) {
297 intrinsic = "llvm.ppc.altivec.vmaxfp";
298 intr_size = 128;
299 }
300 } else if (util_get_cpu_caps()->has_altivec) {
301 intr_size = 128;
302 if (type.width == 8) {
303 if (!type.sign) {
304 intrinsic = "llvm.ppc.altivec.vmaxub";
305 } else {
306 intrinsic = "llvm.ppc.altivec.vmaxsb";
307 }
308 } else if (type.width == 16) {
309 if (!type.sign) {
310 intrinsic = "llvm.ppc.altivec.vmaxuh";
311 } else {
312 intrinsic = "llvm.ppc.altivec.vmaxsh";
313 }
314 } else if (type.width == 32) {
315 if (!type.sign) {
316 intrinsic = "llvm.ppc.altivec.vmaxuw";
317 } else {
318 intrinsic = "llvm.ppc.altivec.vmaxsw";
319 }
320 }
321 }
322
323 if (intrinsic) {
324 if (util_get_cpu_caps()->has_sse && type.floating &&
325 nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
326 LLVMValueRef isnan, max;
327 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
328 type,
329 intr_size, a, b);
330 isnan = lp_build_isnan(bld, b);
331 return lp_build_select(bld, isnan, a, max);
332 } else {
333 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
334 type,
335 intr_size, a, b);
336 }
337 }
338
339 if (type.floating) {
340 switch (nan_behavior) {
341 case GALLIVM_NAN_RETURN_OTHER: {
342 LLVMValueRef isnan = lp_build_isnan(bld, a);
343 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
344 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
345 return lp_build_select(bld, cond, a, b);
346 }
347 break;
348 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
349 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
350 return lp_build_select(bld, cond, a, b);
351 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
352 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
353 return lp_build_select(bld, cond, b, a);
354 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
355 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
356 return lp_build_select(bld, cond, a, b);
357 break;
358 default:
359 assert(0);
360 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
361 return lp_build_select(bld, cond, a, b);
362 }
363 } else {
364 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
365 return lp_build_select(bld, cond, a, b);
366 }
367 }
368
369
370 /**
371 * Generate 1 - a, or ~a depending on bld->type.
372 */
373 LLVMValueRef
374 lp_build_comp(struct lp_build_context *bld,
375 LLVMValueRef a)
376 {
377 LLVMBuilderRef builder = bld->gallivm->builder;
378 const struct lp_type type = bld->type;
379
380 assert(lp_check_value(type, a));
381
382 if(a == bld->one)
383 return bld->zero;
384 if(a == bld->zero)
385 return bld->one;
386
387 if(type.norm && !type.floating && !type.fixed && !type.sign) {
388 if(LLVMIsConstant(a))
389 return LLVMConstNot(a);
390 else
391 return LLVMBuildNot(builder, a, "");
392 }
393
394 if(LLVMIsConstant(a))
395 if (type.floating)
396 return LLVMConstFSub(bld->one, a);
397 else
398 return LLVMConstSub(bld->one, a);
399 else
400 if (type.floating)
401 return LLVMBuildFSub(builder, bld->one, a, "");
402 else
403 return LLVMBuildSub(builder, bld->one, a, "");
404 }
405
406
407 /**
408 * Generate a + b
409 */
410 LLVMValueRef
411 lp_build_add(struct lp_build_context *bld,
412 LLVMValueRef a,
413 LLVMValueRef b)
414 {
415 LLVMBuilderRef builder = bld->gallivm->builder;
416 const struct lp_type type = bld->type;
417 LLVMValueRef res;
418
419 assert(lp_check_value(type, a));
420 assert(lp_check_value(type, b));
421
422 if (a == bld->zero)
423 return b;
424 if (b == bld->zero)
425 return a;
426 if (a == bld->undef || b == bld->undef)
427 return bld->undef;
428
429 if (type.norm) {
430 const char *intrinsic = NULL;
431
432 if (!type.sign && (a == bld->one || b == bld->one))
433 return bld->one;
434
435 if (!type.floating && !type.fixed) {
436 if (LLVM_VERSION_MAJOR >= 8) {
437 char intrin[32];
438 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
439 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
440 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
441 }
442 if (type.width * type.length == 128) {
443 if (util_get_cpu_caps()->has_sse2) {
444 if (type.width == 8)
445 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
446 if (type.width == 16)
447 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
448 } else if (util_get_cpu_caps()->has_altivec) {
449 if (type.width == 8)
450 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
451 if (type.width == 16)
452 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
453 }
454 }
455 if (type.width * type.length == 256) {
456 if (util_get_cpu_caps()->has_avx2) {
457 if (type.width == 8)
458 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
459 if (type.width == 16)
460 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
461 }
462 }
463 }
464
465 if (intrinsic)
466 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
467 }
468
469 if(type.norm && !type.floating && !type.fixed) {
470 if (type.sign) {
471 uint64_t sign = (uint64_t)1 << (type.width - 1);
472 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
473 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
474 /* a_clamp_max is the maximum a for positive b,
475 a_clamp_min is the minimum a for negative b. */
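/* Worked example (8-bit signed lanes): a = 100, b = 60 gives
 * a_clamp_max = min(100, 127 - 60) = 67, so a + b = 67 + 60 = 127,
 * i.e. the sum saturates at the maximum instead of wrapping. */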
476 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
477 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
478 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
479 }
480 }
481
482 if(LLVMIsConstant(a) && LLVMIsConstant(b))
483 if (type.floating)
484 res = LLVMConstFAdd(a, b);
485 else
486 res = LLVMConstAdd(a, b);
487 else
488 if (type.floating)
489 res = LLVMBuildFAdd(builder, a, b, "");
490 else
491 res = LLVMBuildAdd(builder, a, b, "");
492
493 /* clamp to ceiling of 1.0 */
494 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
495 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
496
497 if (type.norm && !type.floating && !type.fixed) {
498 if (!type.sign) {
499 /*
500 * newer llvm versions no longer support the intrinsics, but recognize
501 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
502 * code, it is important we match the pattern llvm uses (and pray llvm
503 * doesn't change it - and hope they decide on the same pattern for
504 * all backends supporting it...).
505 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
506 * interfere with llvm's ability to recognize the pattern but seems
507 * a bit brittle.
508 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
509 */
510 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
511 res = lp_build_select(bld, overflowed,
512 LLVMConstAllOnes(bld->int_vec_type), res);
513 }
514 }
515
516 /* XXX clamp to floor of -1 or 0??? */
517
518 return res;
519 }
520
521
522 /** Return the scalar sum of the elements of a.
523 * Should avoid this operation whenever possible.
524 */
525 LLVMValueRef
526 lp_build_horizontal_add(struct lp_build_context *bld,
527 LLVMValueRef a)
528 {
529 LLVMBuilderRef builder = bld->gallivm->builder;
530 const struct lp_type type = bld->type;
531 LLVMValueRef index, res;
532 unsigned i, length;
533 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
534 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
535 LLVMValueRef vecres, elem2;
536
537 assert(lp_check_value(type, a));
538
539 if (type.length == 1) {
540 return a;
541 }
542
543 assert(!bld->type.norm);
544
545 /*
546 * For byte vectors we could do much better with psadbw.
547 * Using repeated shuffle/adds here. Note with multiple vectors
548 * this can be done more efficiently as outlined in the intel
549 * optimization manual.
550 * Note: could cause data rearrangement if used with smaller element
551 * sizes.
552 */
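/* Reduction sketch for a length-4 vector {a0, a1, a2, a3}:
 * {a0, a1} + {a2, a3} -> {a0+a2, a1+a3}, then the two remaining
 * elements are extracted and added as scalars, giving a0+a1+a2+a3. */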
553
554 vecres = a;
555 length = type.length / 2;
556 while (length > 1) {
557 LLVMValueRef vec1, vec2;
558 for (i = 0; i < length; i++) {
559 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
560 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
561 }
562 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
563 LLVMConstVector(shuffles1, length), "");
564 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
565 LLVMConstVector(shuffles2, length), "");
566 if (type.floating) {
567 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
568 }
569 else {
570 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
571 }
572 length = length >> 1;
573 }
574
575 /* always have vector of size 2 here */
576 assert(length == 1);
577
578 index = lp_build_const_int32(bld->gallivm, 0);
579 res = LLVMBuildExtractElement(builder, vecres, index, "");
580 index = lp_build_const_int32(bld->gallivm, 1);
581 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
582
583 if (type.floating)
584 res = LLVMBuildFAdd(builder, res, elem2, "");
585 else
586 res = LLVMBuildAdd(builder, res, elem2, "");
587
588 return res;
589 }
590
591 /**
592 * Return the horizontal sums of 4 float vectors as a float4 vector.
593 * This uses the technique outlined in the Intel Optimization Manual.
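* Element i of the result is the sum of the four elements of src[i];
* e.g. for src[0] = {x0, x1, x2, x3} the first output lane is x0+x1+x2+x3.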
594 */
595 static LLVMValueRef
596 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
597 LLVMValueRef src[4])
598 {
599 struct gallivm_state *gallivm = bld->gallivm;
600 LLVMBuilderRef builder = gallivm->builder;
601 LLVMValueRef shuffles[4];
602 LLVMValueRef tmp[4];
603 LLVMValueRef sumtmp[2], shuftmp[2];
604
605 /* lower half of regs */
606 shuffles[0] = lp_build_const_int32(gallivm, 0);
607 shuffles[1] = lp_build_const_int32(gallivm, 1);
608 shuffles[2] = lp_build_const_int32(gallivm, 4);
609 shuffles[3] = lp_build_const_int32(gallivm, 5);
610 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
611 LLVMConstVector(shuffles, 4), "");
612 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
613 LLVMConstVector(shuffles, 4), "");
614
615 /* upper half of regs */
616 shuffles[0] = lp_build_const_int32(gallivm, 2);
617 shuffles[1] = lp_build_const_int32(gallivm, 3);
618 shuffles[2] = lp_build_const_int32(gallivm, 6);
619 shuffles[3] = lp_build_const_int32(gallivm, 7);
620 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
621 LLVMConstVector(shuffles, 4), "");
622 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
623 LLVMConstVector(shuffles, 4), "");
624
625 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
626 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
627
628 shuffles[0] = lp_build_const_int32(gallivm, 0);
629 shuffles[1] = lp_build_const_int32(gallivm, 2);
630 shuffles[2] = lp_build_const_int32(gallivm, 4);
631 shuffles[3] = lp_build_const_int32(gallivm, 6);
632 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
633 LLVMConstVector(shuffles, 4), "");
634
635 shuffles[0] = lp_build_const_int32(gallivm, 1);
636 shuffles[1] = lp_build_const_int32(gallivm, 3);
637 shuffles[2] = lp_build_const_int32(gallivm, 5);
638 shuffles[3] = lp_build_const_int32(gallivm, 7);
639 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
640 LLVMConstVector(shuffles, 4), "");
641
642 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
643 }
644
645
646 /*
647 * partially horizontally add 2-4 float vectors with length nx4,
648 * i.e. only four adjacent values in each vector will be added,
649 * assuming values are really grouped in 4 which also determines
650 * output order.
651 *
652 * Return a vector of the same length as the initial vectors,
653 * with the excess elements (if any) being undefined.
654 * The element order is independent of number of input vectors.
655 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
656 * the output order thus will be
657 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
658 */
659 LLVMValueRef
660 lp_build_hadd_partial4(struct lp_build_context *bld,
661 LLVMValueRef vectors[],
662 unsigned num_vecs)
663 {
664 struct gallivm_state *gallivm = bld->gallivm;
665 LLVMBuilderRef builder = gallivm->builder;
666 LLVMValueRef ret_vec;
667 LLVMValueRef tmp[4];
668 const char *intrinsic = NULL;
669
670 assert(num_vecs >= 2 && num_vecs <= 4);
671 assert(bld->type.floating);
672
673 /* only use this with at least 2 vectors, as it is sort of expensive
674 * (depending on cpu) and we always need two horizontal adds anyway,
675 * so a shuffle/add approach might be better.
676 */
677
678 tmp[0] = vectors[0];
679 tmp[1] = vectors[1];
680
681 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
682 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
683
684 if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
685 bld->type.length == 4) {
686 intrinsic = "llvm.x86.sse3.hadd.ps";
687 }
688 else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
689 bld->type.length == 8) {
690 intrinsic = "llvm.x86.avx.hadd.ps.256";
691 }
692 if (intrinsic) {
693 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
694 lp_build_vec_type(gallivm, bld->type),
695 tmp[0], tmp[1]);
696 if (num_vecs > 2) {
697 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
698 lp_build_vec_type(gallivm, bld->type),
699 tmp[2], tmp[3]);
700 }
701 else {
702 tmp[1] = tmp[0];
703 }
704 return lp_build_intrinsic_binary(builder, intrinsic,
705 lp_build_vec_type(gallivm, bld->type),
706 tmp[0], tmp[1]);
707 }
708
709 if (bld->type.length == 4) {
710 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
711 }
712 else {
713 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
714 unsigned j;
715 unsigned num_iter = bld->type.length / 4;
716 struct lp_type parttype = bld->type;
717 parttype.length = 4;
718 for (j = 0; j < num_iter; j++) {
719 LLVMValueRef partsrc[4];
720 unsigned i;
721 for (i = 0; i < 4; i++) {
722 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
723 }
724 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
725 }
726 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
727 }
728 return ret_vec;
729 }
730
731 /**
732 * Generate a - b
733 */
734 LLVMValueRef
735 lp_build_sub(struct lp_build_context *bld,
736 LLVMValueRef a,
737 LLVMValueRef b)
738 {
739 LLVMBuilderRef builder = bld->gallivm->builder;
740 const struct lp_type type = bld->type;
741 LLVMValueRef res;
742
743 assert(lp_check_value(type, a));
744 assert(lp_check_value(type, b));
745
746 if (b == bld->zero)
747 return a;
748 if (a == bld->undef || b == bld->undef)
749 return bld->undef;
750 if (a == b)
751 return bld->zero;
752
753 if (type.norm) {
754 const char *intrinsic = NULL;
755
756 if (!type.sign && b == bld->one)
757 return bld->zero;
758
759 if (!type.floating && !type.fixed) {
760 if (LLVM_VERSION_MAJOR >= 8) {
761 char intrin[32];
762 intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
763 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
764 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
765 }
766 if (type.width * type.length == 128) {
767 if (util_get_cpu_caps()->has_sse2) {
768 if (type.width == 8)
769 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
770 if (type.width == 16)
771 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
772 } else if (util_get_cpu_caps()->has_altivec) {
773 if (type.width == 8)
774 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
775 if (type.width == 16)
776 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
777 }
778 }
779 if (type.width * type.length == 256) {
780 if (util_get_cpu_caps()->has_avx2) {
781 if (type.width == 8)
782 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
783 if (type.width == 16)
784 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
785 }
786 }
787 }
788
789 if (intrinsic)
790 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
791 }
792
793 if(type.norm && !type.floating && !type.fixed) {
794 if (type.sign) {
795 uint64_t sign = (uint64_t)1 << (type.width - 1);
796 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
797 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
798 /* a_clamp_max is the maximum a for negative b,
799 a_clamp_min is the minimum a for positive b. */
800 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
801 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
802 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
803 } else {
804 /*
805 * This must match llvm pattern for saturated unsigned sub.
806 * (lp_build_max_simple actually does the job with its current
807 * definition but do it explicitly here.)
808 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
809 * interfere with llvm's ability to recognize the pattern but seems
810 * a bit brittle.
811 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
812 */
813 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
814 a = lp_build_select(bld, no_ov, a, b);
815 }
816 }
817
818 if(LLVMIsConstant(a) && LLVMIsConstant(b))
819 if (type.floating)
820 res = LLVMConstFSub(a, b);
821 else
822 res = LLVMConstSub(a, b);
823 else
824 if (type.floating)
825 res = LLVMBuildFSub(builder, a, b, "");
826 else
827 res = LLVMBuildSub(builder, a, b, "");
828
829 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
830 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
831
832 return res;
833 }
834
835
836
837 /**
838 * Normalized multiplication.
839 *
840 * There are several approaches for (using 8-bit normalized multiplication as
841 * an example):
842 *
843 * - alpha plus one
844 *
845 * makes the following approximation to the division (Sree)
846 *
847 * a*b/255 ~= (a*(b + 1)) >> 8
848 *
849 * which is the fastest method that satisfies the following OpenGL criteria of
850 *
851 * 0*0 = 0 and 255*255 = 255
852 *
853 * - geometric series
854 *
855 * takes the geometric series approximation to the division
856 *
857 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
858 *
859 * in this case just the first two terms to fit in 16bit arithmetic
860 *
861 * t/255 ~= (t + (t >> 8)) >> 8
862 *
863 * note that just by itself it doesn't satisfy the OpenGL criteria, as
864 * 255*255 = 254, so the special case b = 255 must be accounted for or
865 * roundoff must be used.
866 *
867 * - geometric series plus rounding
868 *
869 * when using a geometric series division instead of truncating the result
870 * use roundoff in the approximation (Jim Blinn)
871 *
872 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
873 *
874 * achieving exact results.
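*
*     e.g. for a = b = 255: t = 65025, t >> 8 = 254, and
*     (65025 + 254 + 0x80) >> 8 = 65407 >> 8 = 255, as required.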
875 *
876 *
877 *
878 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
879 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
880 * @sa Michael Herf, The "double blend trick", May 2000,
881 * http://www.stereopsis.com/doubleblend.html
882 */
883 LLVMValueRef
884 lp_build_mul_norm(struct gallivm_state *gallivm,
885 struct lp_type wide_type,
886 LLVMValueRef a, LLVMValueRef b)
887 {
888 LLVMBuilderRef builder = gallivm->builder;
889 struct lp_build_context bld;
890 unsigned n;
891 LLVMValueRef half;
892 LLVMValueRef ab;
893
894 assert(!wide_type.floating);
895 assert(lp_check_value(wide_type, a));
896 assert(lp_check_value(wide_type, b));
897
898 lp_build_context_init(&bld, gallivm, wide_type);
899
900 n = wide_type.width / 2;
901 if (wide_type.sign) {
902 --n;
903 }
904
905 /*
906 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
907 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
908 */
909
910 /*
911 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
912 */
913
914 ab = LLVMBuildMul(builder, a, b, "");
915 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
916
917 /*
918 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
919 */
920
921 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
922 if (wide_type.sign) {
923 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
924 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
925 half = lp_build_select(&bld, sign, minus_half, half);
926 }
927 ab = LLVMBuildAdd(builder, ab, half, "");
928
929 /* Final division */
930 ab = lp_build_shr_imm(&bld, ab, n);
931
932 return ab;
933 }
934
935 /**
936 * Generate a * b
937 */
938 LLVMValueRef
939 lp_build_mul(struct lp_build_context *bld,
940 LLVMValueRef a,
941 LLVMValueRef b)
942 {
943 LLVMBuilderRef builder = bld->gallivm->builder;
944 const struct lp_type type = bld->type;
945 LLVMValueRef shift;
946 LLVMValueRef res;
947
948 assert(lp_check_value(type, a));
949 assert(lp_check_value(type, b));
950
951 if(a == bld->zero)
952 return bld->zero;
953 if(a == bld->one)
954 return b;
955 if(b == bld->zero)
956 return bld->zero;
957 if(b == bld->one)
958 return a;
959 if(a == bld->undef || b == bld->undef)
960 return bld->undef;
961
962 if (!type.floating && !type.fixed && type.norm) {
963 struct lp_type wide_type = lp_wider_type(type);
964 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
965
966 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
967 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
968
969 /* PMULLW, PSRLW, PADDW */
970 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
971 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
972
973 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
974
975 return ab;
976 }
977
978 if(type.fixed)
979 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
980 else
981 shift = NULL;
982
983 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
984 if (type.floating)
985 res = LLVMConstFMul(a, b);
986 else
987 res = LLVMConstMul(a, b);
988 if(shift) {
989 if(type.sign)
990 res = LLVMConstAShr(res, shift);
991 else
992 res = LLVMConstLShr(res, shift);
993 }
994 }
995 else {
996 if (type.floating)
997 res = LLVMBuildFMul(builder, a, b, "");
998 else
999 res = LLVMBuildMul(builder, a, b, "");
1000 if(shift) {
1001 if(type.sign)
1002 res = LLVMBuildAShr(builder, res, shift, "");
1003 else
1004 res = LLVMBuildLShr(builder, res, shift, "");
1005 }
1006 }
1007
1008 return res;
1009 }
1010
1011 /*
1012 * Widening mul, valid for 32x32 bit -> 64bit only.
1013 * Result is low 32bits, high bits returned in res_hi.
1014 *
1015 * Emits code that is meant to be compiled for the host CPU.
1016 */
1017 LLVMValueRef
1018 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1019 LLVMValueRef a,
1020 LLVMValueRef b,
1021 LLVMValueRef *res_hi)
1022 {
1023 struct gallivm_state *gallivm = bld->gallivm;
1024 LLVMBuilderRef builder = gallivm->builder;
1025
1026 assert(bld->type.width == 32);
1027 assert(bld->type.floating == 0);
1028 assert(bld->type.fixed == 0);
1029 assert(bld->type.norm == 0);
1030
1031 /*
1032 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1033 * for x86 simd is atrocious (even if the high bits weren't required),
1034 * trying to handle real 64bit inputs (which of course can't happen due
1035 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1036 * apparently llvm does not recognize this widening mul). This includes 6
1037 * (instead of 2) pmuludq plus extra adds and shifts
1038 * The same story applies to signed mul, albeit fixing this requires sse41.
1039 * https://llvm.org/bugs/show_bug.cgi?id=30845
1040 * So, whip up our own code, albeit only for length 4 and 8 (which
1041 * should be good enough)...
1042 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1043 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1044 * for signed), which the fallback code does not, without this llvm
1045 * will likely still produce atrocious code.
1046 */
1047 if (LLVM_VERSION_MAJOR < 7 &&
1048 (bld->type.length == 4 || bld->type.length == 8) &&
1049 ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
1050 util_get_cpu_caps()->has_sse4_1)) {
1051 const char *intrinsic = NULL;
1052 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1053 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1054 struct lp_type type_wide = lp_wider_type(bld->type);
1055 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1056 unsigned i;
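/*
 * Note: the shuffle built below is {1, undef, 3, undef, ...}, moving each
 * odd lane into an even position.  pmuludq/pmuldq only read the even
 * (low) 32-bit element of each 64-bit lane, so this lets the same
 * intrinsic also produce the products of the odd lanes.
 */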
1057 for (i = 0; i < bld->type.length; i += 2) {
1058 shuf[i] = lp_build_const_int32(gallivm, i+1);
1059 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1060 }
1061 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1062 aeven = a;
1063 beven = b;
1064 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1065 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1066
1067 if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
1068 if (bld->type.sign) {
1069 intrinsic = "llvm.x86.avx2.pmul.dq";
1070 } else {
1071 intrinsic = "llvm.x86.avx2.pmulu.dq";
1072 }
1073 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1074 wider_type, aeven, beven);
1075 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1076 wider_type, aodd, bodd);
1077 }
1078 else {
1079 /* for consistent naming look elsewhere... */
1080 if (bld->type.sign) {
1081 intrinsic = "llvm.x86.sse41.pmuldq";
1082 } else {
1083 intrinsic = "llvm.x86.sse2.pmulu.dq";
1084 }
1085 /*
1086 * XXX If we only have AVX but not AVX2 this is a pain.
1087 * lp_build_intrinsic_binary_anylength() can't handle it
1088 * (due to src and dst type not being identical).
1089 */
1090 if (bld->type.length == 8) {
1091 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1092 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1093 LLVMValueRef muleven2[2], mulodd2[2];
1094 struct lp_type type_wide_half = type_wide;
1095 LLVMTypeRef wtype_half;
1096 type_wide_half.length = 2;
1097 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1098 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1099 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1100 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1101 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1102 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1103 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1104 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1105 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1106 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1107 wtype_half, aevenlo, bevenlo);
1108 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1109 wtype_half, aoddlo, boddlo);
1110 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1111 wtype_half, aevenhi, bevenhi);
1112 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1113 wtype_half, aoddhi, boddhi);
1114 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1115 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1116
1117 }
1118 else {
1119 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1120 wider_type, aeven, beven);
1121 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1122 wider_type, aodd, bodd);
1123 }
1124 }
1125 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1126 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1127
1128 for (i = 0; i < bld->type.length; i += 2) {
1129 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1130 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1131 }
1132 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1133 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1134
1135 for (i = 0; i < bld->type.length; i += 2) {
1136 shuf[i] = lp_build_const_int32(gallivm, i);
1137 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1138 }
1139 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1140 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1141 }
1142 else {
1143 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1144 }
1145 }
1146
1147
1148 /*
1149 * Widening mul, valid for <= 32 (8, 16, 32) -> 64
1150 * Result is low N bits, high bits returned in res_hi.
1151 *
1152 * Emits generic code.
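*
* Scalar sketch of what this computes for unsigned 32-bit lanes:
*   uint64_t wide = (uint64_t)a * b;
*   res_lo = (uint32_t)wide;  *res_hi = (uint32_t)(wide >> 32);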
1153 */
1154 LLVMValueRef
1155 lp_build_mul_32_lohi(struct lp_build_context *bld,
1156 LLVMValueRef a,
1157 LLVMValueRef b,
1158 LLVMValueRef *res_hi)
1159 {
1160 struct gallivm_state *gallivm = bld->gallivm;
1161 LLVMBuilderRef builder = gallivm->builder;
1162 LLVMValueRef tmp, shift, res_lo;
1163 struct lp_type type_tmp;
1164 LLVMTypeRef wide_type, narrow_type;
1165
1166 type_tmp = bld->type;
1167 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1168 if (bld->type.width < 32)
1169 type_tmp.width = 32;
1170 else
1171 type_tmp.width *= 2;
1172 wide_type = lp_build_vec_type(gallivm, type_tmp);
1173 shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);
1174
1175 if (bld->type.sign) {
1176 a = LLVMBuildSExt(builder, a, wide_type, "");
1177 b = LLVMBuildSExt(builder, b, wide_type, "");
1178 } else {
1179 a = LLVMBuildZExt(builder, a, wide_type, "");
1180 b = LLVMBuildZExt(builder, b, wide_type, "");
1181 }
1182 tmp = LLVMBuildMul(builder, a, b, "");
1183
1184 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1185
1186 /* Since we truncate anyway, LShr and AShr are equivalent. */
1187 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1188 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1189
1190 return res_lo;
1191 }
1192
1193
1194 /* a * b + c */
1195 LLVMValueRef
1196 lp_build_mad(struct lp_build_context *bld,
1197 LLVMValueRef a,
1198 LLVMValueRef b,
1199 LLVMValueRef c)
1200 {
1201 const struct lp_type type = bld->type;
1202 if (type.floating) {
1203 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1204 } else {
1205 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1206 }
1207 }
1208
1209
1210 /**
1211 * Small vector x scale multiplication optimization.
1212 */
1213 LLVMValueRef
1214 lp_build_mul_imm(struct lp_build_context *bld,
1215 LLVMValueRef a,
1216 int b)
1217 {
1218 LLVMBuilderRef builder = bld->gallivm->builder;
1219 LLVMValueRef factor;
1220
1221 assert(lp_check_value(bld->type, a));
1222
1223 if(b == 0)
1224 return bld->zero;
1225
1226 if(b == 1)
1227 return a;
1228
1229 if(b == -1)
1230 return lp_build_negate(bld, a);
1231
1232 if(b == 2 && bld->type.floating)
1233 return lp_build_add(bld, a, a);
1234
1235 if(util_is_power_of_two_or_zero(b)) {
1236 unsigned shift = ffs(b) - 1;
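/* e.g. b = 8 -> ffs(8) - 1 = 3, so the integer path below emits a
 * left shift by 3 instead of a multiply. */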
1237
1238 if(bld->type.floating) {
1239 #if 0
1240 /*
1241 * Power of two multiplication by directly manipulating the exponent.
1242 *
1243 * XXX: This might not be always faster, it will introduce a small error
1244 * for multiplication by zero, and it will produce wrong results
1245 * for Inf and NaN.
1246 */
1247 unsigned mantissa = lp_mantissa(bld->type);
1248 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1249 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1250 a = LLVMBuildAdd(builder, a, factor, "");
1251 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1252 return a;
1253 #endif
1254 }
1255 else {
1256 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1257 return LLVMBuildShl(builder, a, factor, "");
1258 }
1259 }
1260
1261 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1262 return lp_build_mul(bld, a, factor);
1263 }
1264
1265
1266 /**
1267 * Generate a / b
1268 */
1269 LLVMValueRef
1270 lp_build_div(struct lp_build_context *bld,
1271 LLVMValueRef a,
1272 LLVMValueRef b)
1273 {
1274 LLVMBuilderRef builder = bld->gallivm->builder;
1275 const struct lp_type type = bld->type;
1276
1277 assert(lp_check_value(type, a));
1278 assert(lp_check_value(type, b));
1279
1280 if(a == bld->zero)
1281 return bld->zero;
1282 if(a == bld->one && type.floating)
1283 return lp_build_rcp(bld, b);
1284 if(b == bld->zero)
1285 return bld->undef;
1286 if(b == bld->one)
1287 return a;
1288 if(a == bld->undef || b == bld->undef)
1289 return bld->undef;
1290
1291 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1292 if (type.floating)
1293 return LLVMConstFDiv(a, b);
1294 else if (type.sign)
1295 return LLVMConstSDiv(a, b);
1296 else
1297 return LLVMConstUDiv(a, b);
1298 }
1299
1300 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1301 if(FALSE &&
1302 ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
1303 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
1304 type.floating)
1305 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1306
1307 if (type.floating)
1308 return LLVMBuildFDiv(builder, a, b, "");
1309 else if (type.sign)
1310 return LLVMBuildSDiv(builder, a, b, "");
1311 else
1312 return LLVMBuildUDiv(builder, a, b, "");
1313 }
1314
1315
1316 /**
1317 * Linear interpolation helper.
1318 *
1319 * @param normalized whether we are interpolating normalized values,
1320 * encoded in normalized integers, twice as wide.
1321 *
1322 * @sa http://www.stereopsis.com/doubleblend.html
1323 */
1324 static inline LLVMValueRef
1325 lp_build_lerp_simple(struct lp_build_context *bld,
1326 LLVMValueRef x,
1327 LLVMValueRef v0,
1328 LLVMValueRef v1,
1329 unsigned flags)
1330 {
1331 unsigned half_width = bld->type.width/2;
1332 LLVMBuilderRef builder = bld->gallivm->builder;
1333 LLVMValueRef delta;
1334 LLVMValueRef res;
1335
1336 assert(lp_check_value(bld->type, x));
1337 assert(lp_check_value(bld->type, v0));
1338 assert(lp_check_value(bld->type, v1));
1339
1340 delta = lp_build_sub(bld, v1, v0);
1341
1342 if (bld->type.floating) {
1343 assert(flags == 0);
1344 return lp_build_mad(bld, x, delta, v0);
1345 }
1346
1347 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1348 if (!bld->type.sign) {
1349 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1350 /*
1351 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1352 * most-significant-bit to the lowest-significant-bit, so that
1353 * later we can just divide by 2**n instead of 2**n - 1.
1354 */
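/* e.g. with 8-bit weights widened to 16 bits (half_width = 8):
 * x = 255 becomes 255 + (255 >> 7) = 256, so a full weight maps
 * exactly to 2**n and the shift below divides evenly. */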
1355
1356 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1357 }
1358
1359 /* (x * delta) >> n */
1360 /*
1361 * For this multiply, higher internal precision is required to pass CTS,
1362 * the most efficient path to that is pmulhrsw on ssse3 and above.
1363 * This could be opencoded on other arches if conformance was required.
1364 */
1365 if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) {
1366 res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1367 res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1368 } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) {
1369 res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1370 res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1371 } else {
1372 res = lp_build_mul(bld, x, delta);
1373 res = lp_build_shr_imm(bld, res, half_width);
1374 }
1375 } else {
1376 /*
1377 * The rescaling trick above doesn't work for signed numbers, so
1378 * use the 2**n - 1 division approximation in lp_build_mul_norm
1379 * instead.
1380 */
1381 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1382 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1383 }
1384 } else {
1385 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1386 res = lp_build_mul(bld, x, delta);
1387 }
1388
1389 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1390 /*
1391 * At this point both res and v0 only use the lower half of the bits,
1392 * the rest is zero. Instead of add / mask, do add with half wide type.
1393 */
1394 struct lp_type narrow_type;
1395 struct lp_build_context narrow_bld;
1396
1397 memset(&narrow_type, 0, sizeof narrow_type);
1398 narrow_type.sign = bld->type.sign;
1399 narrow_type.width = bld->type.width/2;
1400 narrow_type.length = bld->type.length*2;
1401
1402 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1403 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1404 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1405 res = lp_build_add(&narrow_bld, v0, res);
1406 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1407 } else {
1408 res = lp_build_add(bld, v0, res);
1409
1410 if (bld->type.fixed) {
1411 /*
1412 * We need to mask out the high order bits when lerping 8-bit
1413 * normalized colors stored in 16 bits.
1414 */
1415 /* XXX: This step is necessary for lerping 8-bit colors stored in
1416 * 16 bits, but it will be wrong for true fixed point use cases.
1417 * Basically we need a more powerful lp_type, capable of further
1418 * distinguishing the values interpretation from the value storage.
1419 */
1420 LLVMValueRef low_bits;
1421 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1422 res = LLVMBuildAnd(builder, res, low_bits, "");
1423 }
1424 }
1425
1426 return res;
1427 }
1428
1429
1430 /**
1431 * Linear interpolation.
1432 */
1433 LLVMValueRef
1434 lp_build_lerp(struct lp_build_context *bld,
1435 LLVMValueRef x,
1436 LLVMValueRef v0,
1437 LLVMValueRef v1,
1438 unsigned flags)
1439 {
1440 const struct lp_type type = bld->type;
1441 LLVMValueRef res;
1442
1443 assert(lp_check_value(type, x));
1444 assert(lp_check_value(type, v0));
1445 assert(lp_check_value(type, v1));
1446
1447 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1448
1449 if (type.norm) {
1450 struct lp_type wide_type;
1451 struct lp_build_context wide_bld;
1452 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1453
1454 assert(type.length >= 2);
1455
1456 /*
1457 * Create a wider integer type, enough to hold the
1458 * intermediate result of the multiplication.
1459 */
1460 memset(&wide_type, 0, sizeof wide_type);
1461 wide_type.sign = type.sign;
1462 wide_type.width = type.width*2;
1463 wide_type.length = type.length/2;
1464
1465 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1466
1467 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1468 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1469 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1470
1471 /*
1472 * Lerp both halves.
1473 */
1474
1475 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1476
1477 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1478 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1479
1480 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1481 } else {
1482 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1483 }
1484
1485 return res;
1486 }
1487
1488
1489 /**
1490 * Bilinear interpolation.
1491 *
1492 * Value indices are in v_{yx}.
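* E.g. v01 is the value at (y=0, x=1); the result is equivalent to
* (1-x)*(1-y)*v00 + x*(1-y)*v01 + (1-x)*y*v10 + x*y*v11.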
1493 */
1494 LLVMValueRef
1495 lp_build_lerp_2d(struct lp_build_context *bld,
1496 LLVMValueRef x,
1497 LLVMValueRef y,
1498 LLVMValueRef v00,
1499 LLVMValueRef v01,
1500 LLVMValueRef v10,
1501 LLVMValueRef v11,
1502 unsigned flags)
1503 {
1504 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1505 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1506 return lp_build_lerp(bld, y, v0, v1, flags);
1507 }
1508
1509
1510 LLVMValueRef
1511 lp_build_lerp_3d(struct lp_build_context *bld,
1512 LLVMValueRef x,
1513 LLVMValueRef y,
1514 LLVMValueRef z,
1515 LLVMValueRef v000,
1516 LLVMValueRef v001,
1517 LLVMValueRef v010,
1518 LLVMValueRef v011,
1519 LLVMValueRef v100,
1520 LLVMValueRef v101,
1521 LLVMValueRef v110,
1522 LLVMValueRef v111,
1523 unsigned flags)
1524 {
1525 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1526 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1527 return lp_build_lerp(bld, z, v0, v1, flags);
1528 }
1529
1530
1531 /**
1532 * Generate min(a, b)
1533 * Do checks for special cases but not for nans.
1534 */
1535 LLVMValueRef
1536 lp_build_min(struct lp_build_context *bld,
1537 LLVMValueRef a,
1538 LLVMValueRef b)
1539 {
1540 assert(lp_check_value(bld->type, a));
1541 assert(lp_check_value(bld->type, b));
1542
1543 if(a == bld->undef || b == bld->undef)
1544 return bld->undef;
1545
1546 if(a == b)
1547 return a;
1548
1549 if (bld->type.norm) {
1550 if (!bld->type.sign) {
1551 if (a == bld->zero || b == bld->zero) {
1552 return bld->zero;
1553 }
1554 }
1555 if(a == bld->one)
1556 return b;
1557 if(b == bld->one)
1558 return a;
1559 }
1560
1561 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1562 }
1563
1564 /**
1565 * Generate min(a, b)
1566 * NaN's are handled according to the behavior specified by the
1567 * nan_behavior argument.
1568 */
1569 LLVMValueRef
1570 lp_build_min_ext(struct lp_build_context *bld,
1571 LLVMValueRef a,
1572 LLVMValueRef b,
1573 enum gallivm_nan_behavior nan_behavior)
1574 {
1575 assert(lp_check_value(bld->type, a));
1576 assert(lp_check_value(bld->type, b));
1577
1578 if(a == bld->undef || b == bld->undef)
1579 return bld->undef;
1580
1581 if(a == b)
1582 return a;
1583
1584 if (bld->type.norm) {
1585 if (!bld->type.sign) {
1586 if (a == bld->zero || b == bld->zero) {
1587 return bld->zero;
1588 }
1589 }
1590 if(a == bld->one)
1591 return b;
1592 if(b == bld->one)
1593 return a;
1594 }
1595
1596 return lp_build_min_simple(bld, a, b, nan_behavior);
1597 }
1598
1599 /**
1600 * Generate max(a, b)
1601 * Do checks for special cases, but NaN behavior is undefined.
1602 */
1603 LLVMValueRef
1604 lp_build_max(struct lp_build_context *bld,
1605 LLVMValueRef a,
1606 LLVMValueRef b)
1607 {
1608 assert(lp_check_value(bld->type, a));
1609 assert(lp_check_value(bld->type, b));
1610
1611 if(a == bld->undef || b == bld->undef)
1612 return bld->undef;
1613
1614 if(a == b)
1615 return a;
1616
1617 if(bld->type.norm) {
1618 if(a == bld->one || b == bld->one)
1619 return bld->one;
1620 if (!bld->type.sign) {
1621 if (a == bld->zero) {
1622 return b;
1623 }
1624 if (b == bld->zero) {
1625 return a;
1626 }
1627 }
1628 }
1629
1630 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1631 }
1632
1633
1634 /**
1635 * Generate max(a, b)
1636 * Checks for special cases.
1637 * NaN's are handled according to the behavior specified by the
1638 * nan_behavior argument.
1639 */
1640 LLVMValueRef
1641 lp_build_max_ext(struct lp_build_context *bld,
1642 LLVMValueRef a,
1643 LLVMValueRef b,
1644 enum gallivm_nan_behavior nan_behavior)
1645 {
1646 assert(lp_check_value(bld->type, a));
1647 assert(lp_check_value(bld->type, b));
1648
1649 if(a == bld->undef || b == bld->undef)
1650 return bld->undef;
1651
1652 if(a == b)
1653 return a;
1654
1655 if(bld->type.norm) {
1656 if(a == bld->one || b == bld->one)
1657 return bld->one;
1658 if (!bld->type.sign) {
1659 if (a == bld->zero) {
1660 return b;
1661 }
1662 if (b == bld->zero) {
1663 return a;
1664 }
1665 }
1666 }
1667
1668 return lp_build_max_simple(bld, a, b, nan_behavior);
1669 }
1670
1671 /**
1672 * Generate clamp(a, min, max)
1673 * NaN behavior (for any of a, min, max) is undefined.
1674 * Do checks for special cases.
1675 */
1676 LLVMValueRef
1677 lp_build_clamp(struct lp_build_context *bld,
1678 LLVMValueRef a,
1679 LLVMValueRef min,
1680 LLVMValueRef max)
1681 {
1682 assert(lp_check_value(bld->type, a));
1683 assert(lp_check_value(bld->type, min));
1684 assert(lp_check_value(bld->type, max));
1685
1686 a = lp_build_min(bld, a, max);
1687 a = lp_build_max(bld, a, min);
1688 return a;
1689 }
1690
1691
1692 /**
1693 * Generate clamp(a, 0, 1)
1694 * A NaN will get converted to zero.
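* (lp_build_max_ext() is called with GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN,
* whose ordered compare evaluates false for NaN, so the zero operand is
* selected and the subsequent min() then clamps against one.)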
1695 */
1696 LLVMValueRef
1697 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1698 LLVMValueRef a)
1699 {
1700 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1701 a = lp_build_min(bld, a, bld->one);
1702 return a;
1703 }
1704
1705
1706 /**
1707 * Generate abs(a)
1708 */
1709 LLVMValueRef
1710 lp_build_abs(struct lp_build_context *bld,
1711 LLVMValueRef a)
1712 {
1713 LLVMBuilderRef builder = bld->gallivm->builder;
1714 const struct lp_type type = bld->type;
1715 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1716
1717 assert(lp_check_value(type, a));
1718
1719 if(!type.sign)
1720 return a;
1721
1722 if(type.floating) {
1723 char intrinsic[32];
1724 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1725 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1726 }
1727
1728 if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1729 switch(type.width) {
1730 case 8:
1731 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1732 case 16:
1733 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1734 case 32:
1735 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1736 }
1737 }
1738 else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
1739 switch(type.width) {
1740 case 8:
1741 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1742 case 16:
1743 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1744 case 32:
1745 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1746 }
1747 }
1748
1749 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1750 a, LLVMBuildNeg(builder, a, ""));
1751 }
1752
1753
1754 LLVMValueRef
1755 lp_build_negate(struct lp_build_context *bld,
1756 LLVMValueRef a)
1757 {
1758 LLVMBuilderRef builder = bld->gallivm->builder;
1759
1760 assert(lp_check_value(bld->type, a));
1761
1762 if (bld->type.floating)
1763 a = LLVMBuildFNeg(builder, a, "");
1764 else
1765 a = LLVMBuildNeg(builder, a, "");
1766
1767 return a;
1768 }
1769
1770
1771 /** Return -1, 0 or +1 depending on the sign of a */
1772 LLVMValueRef
1773 lp_build_sgn(struct lp_build_context *bld,
1774 LLVMValueRef a)
1775 {
1776 LLVMBuilderRef builder = bld->gallivm->builder;
1777 const struct lp_type type = bld->type;
1778 LLVMValueRef cond;
1779 LLVMValueRef res;
1780
1781 assert(lp_check_value(type, a));
1782
1783 /* Handle non-zero case */
1784 if(!type.sign) {
1785 /* if not zero then sign must be positive */
1786 res = bld->one;
1787 }
1788 else if(type.floating) {
1789 LLVMTypeRef vec_type;
1790 LLVMTypeRef int_type;
1791 LLVMValueRef mask;
1792 LLVMValueRef sign;
1793 LLVMValueRef one;
1794 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1795
1796 int_type = lp_build_int_vec_type(bld->gallivm, type);
1797 vec_type = lp_build_vec_type(bld->gallivm, type);
1798 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1799
1800 /* Take the sign bit and OR it into the constant 1.0 */
1801 sign = LLVMBuildBitCast(builder, a, int_type, "");
1802 sign = LLVMBuildAnd(builder, sign, mask, "");
1803 one = LLVMConstBitCast(bld->one, int_type);
1804 res = LLVMBuildOr(builder, sign, one, "");
1805 res = LLVMBuildBitCast(builder, res, vec_type, "");
1806 }
1807 else
1808 {
1809 /* signed int/norm/fixed point */
1810 /* could use psign with sse3 and appropriate vectors here */
1811 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1812 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1813 res = lp_build_select(bld, cond, bld->one, minus_one);
1814 }
1815
1816 /* Handle zero */
1817 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1818 res = lp_build_select(bld, cond, bld->zero, res);
1819
1820 return res;
1821 }
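
/*
 * Worked example for the float path above (illustrative): -2.5f has bit
 * pattern 0xC0200000; AND with the sign mask 0x80000000 keeps just the
 * sign bit, and OR-ing that into the bits of 1.0f (0x3F800000) gives
 * 0xBF800000, i.e. -1.0f == sgn(-2.5). The final select maps exact zeros
 * back to 0.0.
 */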
1822
1823
1824 /**
1825 * Set the sign of float vector 'a' according to 'sign'.
1826 * If sign==0, return abs(a).
1827 * If sign==1, return -abs(a);
1828 * Other values for sign produce undefined results.
1829 */
1830 LLVMValueRef
1831 lp_build_set_sign(struct lp_build_context *bld,
1832 LLVMValueRef a, LLVMValueRef sign)
1833 {
1834 LLVMBuilderRef builder = bld->gallivm->builder;
1835 const struct lp_type type = bld->type;
1836 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1837 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1838 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1839 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1840 ~((unsigned long long) 1 << (type.width - 1)));
1841 LLVMValueRef val, res;
1842
1843 assert(type.floating);
1844 assert(lp_check_value(type, a));
1845
1846 /* val = reinterpret_cast<int>(a) */
1847 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1848 /* val = val & mask */
1849 val = LLVMBuildAnd(builder, val, mask, "");
1850 /* sign = sign << shift */
1851 sign = LLVMBuildShl(builder, sign, shift, "");
1852 /* res = val | sign */
1853 res = LLVMBuildOr(builder, val, sign, "");
1854 /* res = reinterpret_cast<float>(res) */
1855 res = LLVMBuildBitCast(builder, res, vec_type, "");
1856
1857 return res;
1858 }
1859
1860
1861 /**
1862 * Convert vector of (or scalar) int to vector of (or scalar) float.
1863 */
1864 LLVMValueRef
1865 lp_build_int_to_float(struct lp_build_context *bld,
1866 LLVMValueRef a)
1867 {
1868 LLVMBuilderRef builder = bld->gallivm->builder;
1869 const struct lp_type type = bld->type;
1870 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1871
1872 assert(type.floating);
1873
1874 return LLVMBuildSIToFP(builder, a, vec_type, "");
1875 }
1876
1877 static boolean
1878 arch_rounding_available(const struct lp_type type)
1879 {
1880 if ((util_get_cpu_caps()->has_sse4_1 &&
1881 (type.length == 1 || type.width*type.length == 128)) ||
1882 (util_get_cpu_caps()->has_avx && type.width*type.length == 256) ||
1883 (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512))
1884 return TRUE;
1885 else if ((util_get_cpu_caps()->has_altivec &&
1886 (type.width == 32 && type.length == 4)))
1887 return TRUE;
1888 else if (util_get_cpu_caps()->has_neon)
1889 return TRUE;
1890
1891 return FALSE;
1892 }
1893
1894 enum lp_build_round_mode
1895 {
1896 LP_BUILD_ROUND_NEAREST = 0,
1897 LP_BUILD_ROUND_FLOOR = 1,
1898 LP_BUILD_ROUND_CEIL = 2,
1899 LP_BUILD_ROUND_TRUNCATE = 3
1900 };
1901
1902 static inline LLVMValueRef
1903 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1904 LLVMValueRef a)
1905 {
1906 LLVMBuilderRef builder = bld->gallivm->builder;
1907 const struct lp_type type = bld->type;
1908 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1909 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1910 const char *intrinsic;
1911 LLVMValueRef res;
1912
1913 assert(type.floating);
1914 /* using the double precision conversions is a bit more complicated */
1915 assert(type.width == 32);
1916
1917 assert(lp_check_value(type, a));
1918 assert(util_get_cpu_caps()->has_sse2);
1919
1920 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1921 if (type.length == 1) {
1922 LLVMTypeRef vec_type;
1923 LLVMValueRef undef;
1924 LLVMValueRef arg;
1925 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1926
1927 vec_type = LLVMVectorType(bld->elem_type, 4);
1928
1929 intrinsic = "llvm.x86.sse.cvtss2si";
1930
1931 undef = LLVMGetUndef(vec_type);
1932
1933 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1934
1935 res = lp_build_intrinsic_unary(builder, intrinsic,
1936 ret_type, arg);
1937 }
1938 else {
1939 if (type.width* type.length == 128) {
1940 intrinsic = "llvm.x86.sse2.cvtps2dq";
1941 }
1942 else {
1943 assert(type.width*type.length == 256);
1944 assert(util_get_cpu_caps()->has_avx);
1945
1946 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1947 }
1948 res = lp_build_intrinsic_unary(builder, intrinsic,
1949 ret_type, a);
1950 }
1951
1952 return res;
1953 }
1954
1955
1956 /* Round using the AltiVec vrfi* intrinsics, selected by 'mode'. */
1957 
1958 static inline LLVMValueRef
1959 lp_build_round_altivec(struct lp_build_context *bld,
1960 LLVMValueRef a,
1961 enum lp_build_round_mode mode)
1962 {
1963 LLVMBuilderRef builder = bld->gallivm->builder;
1964 const struct lp_type type = bld->type;
1965 const char *intrinsic = NULL;
1966
1967 assert(type.floating);
1968
1969 assert(lp_check_value(type, a));
1970 assert(util_get_cpu_caps()->has_altivec);
1971
1972 (void)type;
1973
1974 switch (mode) {
1975 case LP_BUILD_ROUND_NEAREST:
1976 intrinsic = "llvm.ppc.altivec.vrfin";
1977 break;
1978 case LP_BUILD_ROUND_FLOOR:
1979 intrinsic = "llvm.ppc.altivec.vrfim";
1980 break;
1981 case LP_BUILD_ROUND_CEIL:
1982 intrinsic = "llvm.ppc.altivec.vrfip";
1983 break;
1984 case LP_BUILD_ROUND_TRUNCATE:
1985 intrinsic = "llvm.ppc.altivec.vrfiz";
1986 break;
1987 }
1988
1989 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1990 }
1991
1992 static inline LLVMValueRef
1993 lp_build_round_arch(struct lp_build_context *bld,
1994 LLVMValueRef a,
1995 enum lp_build_round_mode mode)
1996 {
1997 if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon) {
1998 LLVMBuilderRef builder = bld->gallivm->builder;
1999 const struct lp_type type = bld->type;
2000 const char *intrinsic_root;
2001 char intrinsic[32];
2002
2003 assert(type.floating);
2004 assert(lp_check_value(type, a));
2005 (void)type;
2006
2007 switch (mode) {
2008 case LP_BUILD_ROUND_NEAREST:
2009 intrinsic_root = "llvm.nearbyint";
2010 break;
2011 case LP_BUILD_ROUND_FLOOR:
2012 intrinsic_root = "llvm.floor";
2013 break;
2014 case LP_BUILD_ROUND_CEIL:
2015 intrinsic_root = "llvm.ceil";
2016 break;
2017 case LP_BUILD_ROUND_TRUNCATE:
2018 intrinsic_root = "llvm.trunc";
2019 break;
2020 default:
2021 unreachable("unhandled lp_build_round_mode");
2022 }
2023
2024 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2025 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2026 }
2027 else /* (util_get_cpu_caps()->has_altivec) */
2028 return lp_build_round_altivec(bld, a, mode);
2029 }
2030
2031 /**
2032 * Return the integer part of a float (vector) value (== round toward zero).
2033 * The returned value is a float (vector).
2034 * Ex: trunc(-1.5) = -1.0
2035 */
2036 LLVMValueRef
2037 lp_build_trunc(struct lp_build_context *bld,
2038 LLVMValueRef a)
2039 {
2040 LLVMBuilderRef builder = bld->gallivm->builder;
2041 const struct lp_type type = bld->type;
2042
2043 assert(type.floating);
2044 assert(lp_check_value(type, a));
2045
2046 if (type.width == 16) {
2047 char intrinsic[64];
2048 lp_format_intrinsic(intrinsic, 64, "llvm.trunc", bld->vec_type);
2049 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2050 }
2051
2052 if (arch_rounding_available(type)) {
2053 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2054 }
2055 else {
2056 const struct lp_type type = bld->type;
2057 struct lp_type inttype;
2058 struct lp_build_context intbld;
2059 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2060 LLVMValueRef trunc, res, anosign, mask;
2061 LLVMTypeRef int_vec_type = bld->int_vec_type;
2062 LLVMTypeRef vec_type = bld->vec_type;
2063
2064 inttype = type;
2065 inttype.floating = 0;
2066 lp_build_context_init(&intbld, bld->gallivm, inttype);
2067
2068 /* round by truncation */
2069 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2070 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2071
2072 /* mask out sign bit */
2073 anosign = lp_build_abs(bld, a);
2074 /*
2075 * mask out all values if anosign > 2^24
2076 * This should work both for large ints (all rounding is no-op for them
2077 * because such floats are always exact) as well as special cases like
2078 * NaNs, Infs (taking advantage of the fact they use max exponent).
2079 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2080 */
2081 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2082 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2083 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2084 return lp_build_select(bld, mask, a, res);
2085 }
2086 }
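
/*
 * Note on the 2^24 guard in the fallback above (illustrative): a float32
 * mantissa has 24 bits, so any finite value with |x| >= 2^24 is already an
 * integer and truncation is a no-op, while the FPToSI/SIToFP round trip
 * would misbehave for huge values, NaNs and Infs. Selecting the original
 * 'a' whenever |a| > 2^24 therefore handles all of those cases at once;
 * any threshold in [2^24, 2^31) would do.
 */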
2087
2088
2089 /**
2090 * Return float (vector) rounded to nearest integer (vector). The returned
2091 * value is a float (vector).
2092 * Ex: round(0.9) = 1.0
2093 * Ex: round(-1.5) = -2.0
2094 */
2095 LLVMValueRef
2096 lp_build_round(struct lp_build_context *bld,
2097 LLVMValueRef a)
2098 {
2099 LLVMBuilderRef builder = bld->gallivm->builder;
2100 const struct lp_type type = bld->type;
2101
2102 assert(type.floating);
2103 assert(lp_check_value(type, a));
2104
2105 if (type.width == 16) {
2106 char intrinsic[64];
2107 lp_format_intrinsic(intrinsic, 64, "llvm.round", bld->vec_type);
2108 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2109 }
2110
2111 if (arch_rounding_available(type)) {
2112 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2113 }
2114 else {
2115 const struct lp_type type = bld->type;
2116 struct lp_type inttype;
2117 struct lp_build_context intbld;
2118 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2119 LLVMValueRef res, anosign, mask;
2120 LLVMTypeRef int_vec_type = bld->int_vec_type;
2121 LLVMTypeRef vec_type = bld->vec_type;
2122
2123 inttype = type;
2124 inttype.floating = 0;
2125 lp_build_context_init(&intbld, bld->gallivm, inttype);
2126
2127 res = lp_build_iround(bld, a);
2128 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2129
2130 /* mask out sign bit */
2131 anosign = lp_build_abs(bld, a);
2132 /*
2133 * mask out all values if anosign > 2^24
2134 * This should work both for large ints (all rounding is no-op for them
2135 * because such floats are always exact) as well as special cases like
2136 * NaNs, Infs (taking advantage of the fact they use max exponent).
2137 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2138 */
2139 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2140 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2141 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2142 return lp_build_select(bld, mask, a, res);
2143 }
2144 }
2145
2146
2147 /**
2148 * Return floor of float (vector), result is a float (vector)
2149 * Ex: floor(1.1) = 1.0
2150 * Ex: floor(-1.1) = -2.0
2151 */
2152 LLVMValueRef
2153 lp_build_floor(struct lp_build_context *bld,
2154 LLVMValueRef a)
2155 {
2156 LLVMBuilderRef builder = bld->gallivm->builder;
2157 const struct lp_type type = bld->type;
2158
2159 assert(type.floating);
2160 assert(lp_check_value(type, a));
2161
2162 if (arch_rounding_available(type)) {
2163 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2164 }
2165 else {
2166 const struct lp_type type = bld->type;
2167 struct lp_type inttype;
2168 struct lp_build_context intbld;
2169 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2170 LLVMValueRef trunc, res, anosign, mask;
2171 LLVMTypeRef int_vec_type = bld->int_vec_type;
2172 LLVMTypeRef vec_type = bld->vec_type;
2173
2174 if (type.width != 32) {
2175 char intrinsic[32];
2176 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2177 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2178 }
2179
2180 assert(type.width == 32); /* might want to handle doubles at some point */
2181
2182 inttype = type;
2183 inttype.floating = 0;
2184 lp_build_context_init(&intbld, bld->gallivm, inttype);
2185
2186 /* round by truncation */
2187 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2188 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2189
2190 if (type.sign) {
2191 LLVMValueRef tmp;
2192
2193 /*
2194 * fix values if rounding is wrong (for non-special cases)
2195 * - this is the case if trunc > a
2196 */
2197 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2198 /* tmp = trunc > a ? 1.0 : 0.0 */
2199 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2200 tmp = lp_build_and(&intbld, mask, tmp);
2201 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2202 res = lp_build_sub(bld, res, tmp);
2203 }
2204
2205 /* mask out sign bit */
2206 anosign = lp_build_abs(bld, a);
2207 /*
2208 * mask out all values if anosign > 2^24
2209 * This should work both for large ints (all rounding is no-op for them
2210 * because such floats are always exact) as well as special cases like
2211 * NaNs, Infs (taking advantage of the fact they use max exponent).
2212 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2213 */
2214 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2215 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2216 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2217 return lp_build_select(bld, mask, a, res);
2218 }
2219 }
2220
2221
2222 /**
2223 * Return ceiling of float (vector), returning float (vector).
2224 * Ex: ceil( 1.1) = 2.0
2225 * Ex: ceil(-1.1) = -1.0
2226 */
2227 LLVMValueRef
2228 lp_build_ceil(struct lp_build_context *bld,
2229 LLVMValueRef a)
2230 {
2231 LLVMBuilderRef builder = bld->gallivm->builder;
2232 const struct lp_type type = bld->type;
2233
2234 assert(type.floating);
2235 assert(lp_check_value(type, a));
2236
2237 if (arch_rounding_available(type)) {
2238 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2239 }
2240 else {
2241 const struct lp_type type = bld->type;
2242 struct lp_type inttype;
2243 struct lp_build_context intbld;
2244 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2245 LLVMValueRef trunc, res, anosign, mask, tmp;
2246 LLVMTypeRef int_vec_type = bld->int_vec_type;
2247 LLVMTypeRef vec_type = bld->vec_type;
2248
2249 if (type.width != 32) {
2250 char intrinsic[32];
2251 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2252 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2253 }
2254
2255 assert(type.width == 32); /* might want to handle doubles at some point */
2256
2257 inttype = type;
2258 inttype.floating = 0;
2259 lp_build_context_init(&intbld, bld->gallivm, inttype);
2260
2261 /* round by truncation */
2262 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2263 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2264
2265 /*
2266 * fix values if rounding is wrong (for non-special cases)
2267 * - this is the case if trunc < a
2268 */
2269 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2270 /* tmp = trunc < a ? 1.0 : 0.0 */
2271 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2272 tmp = lp_build_and(&intbld, mask, tmp);
2273 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2274 res = lp_build_add(bld, trunc, tmp);
2275
2276 /* mask out sign bit */
2277 anosign = lp_build_abs(bld, a);
2278 /*
2279 * mask out all values if anosign > 2^24
2280 * This should work both for large ints (all rounding is no-op for them
2281 * because such floats are always exact) as well as special cases like
2282 * NaNs, Infs (taking advantage of the fact they use max exponent).
2283 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2284 */
2285 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2286 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2287 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2288 return lp_build_select(bld, mask, a, res);
2289 }
2290 }
2291
2292
2293 /**
2294 * Return fractional part of 'a' computed as a - floor(a)
2295 * Typically used in texture coord arithmetic.
2296 */
2297 LLVMValueRef
2298 lp_build_fract(struct lp_build_context *bld,
2299 LLVMValueRef a)
2300 {
2301 assert(bld->type.floating);
2302 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2303 }
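
/*
 * Example (illustrative): fract(1.25) = 1.25 - floor(1.25) = 0.25 and
 * fract(-0.25) = -0.25 - (-1.0) = 0.75. For very small negative inputs
 * the subtraction can round up to exactly 1.0, which is why
 * lp_build_fract_safe() below clamps the result below 1.0.
 */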
2304
2305
2306 /**
2307 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2308 * against 0.99999(9). (Will also return that value for NaNs.)
2309 */
2310 static inline LLVMValueRef
2311 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2312 {
2313 LLVMValueRef max;
2314
2315 /* this is the largest number smaller than 1.0 representable as float */
2316 max = lp_build_const_vec(bld->gallivm, bld->type,
2317 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2318 return lp_build_min_ext(bld, fract, max,
2319 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2320 }
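
/*
 * The clamp constant above (illustrative): for float32 lp_mantissa() is 23,
 * so the value is 1.0 - 2^-24 = 0.99999994..., the largest float strictly
 * below 1.0. Using min_ext with GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN also
 * maps NaN inputs to that constant, as noted in the comment above.
 */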
2321
2322
2323 /**
2324 * Same as lp_build_fract, but guarantees that the result is always smaller
2325 * than one. Will also return the smaller-than-one value for infs, NaNs.
2326 */
2327 LLVMValueRef
2328 lp_build_fract_safe(struct lp_build_context *bld,
2329 LLVMValueRef a)
2330 {
2331 return clamp_fract(bld, lp_build_fract(bld, a));
2332 }
2333
2334
2335 /**
2336 * Return the integer part of a float (vector) value (== round toward zero).
2337 * The returned value is an integer (vector).
2338 * Ex: itrunc(-1.5) = -1
2339 */
2340 LLVMValueRef
2341 lp_build_itrunc(struct lp_build_context *bld,
2342 LLVMValueRef a)
2343 {
2344 LLVMBuilderRef builder = bld->gallivm->builder;
2345 const struct lp_type type = bld->type;
2346 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2347
2348 assert(type.floating);
2349 assert(lp_check_value(type, a));
2350
2351 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2352 }
2353
2354
2355 /**
2356 * Return float (vector) rounded to nearest integer (vector). The returned
2357 * value is an integer (vector).
2358 * Ex: iround(0.9) = 1
2359 * Ex: iround(-1.5) = -2
2360 */
2361 LLVMValueRef
2362 lp_build_iround(struct lp_build_context *bld,
2363 LLVMValueRef a)
2364 {
2365 LLVMBuilderRef builder = bld->gallivm->builder;
2366 const struct lp_type type = bld->type;
2367 LLVMTypeRef int_vec_type = bld->int_vec_type;
2368 LLVMValueRef res;
2369
2370 assert(type.floating);
2371
2372 assert(lp_check_value(type, a));
2373
2374 if ((util_get_cpu_caps()->has_sse2 &&
2375 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2376 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2377 return lp_build_iround_nearest_sse2(bld, a);
2378 }
2379 if (arch_rounding_available(type)) {
2380 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2381 }
2382 else {
2383 LLVMValueRef half;
2384
2385 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2386
2387 if (type.sign) {
2388 LLVMTypeRef vec_type = bld->vec_type;
2389 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2390 (unsigned long long)1 << (type.width - 1));
2391 LLVMValueRef sign;
2392
2393 /* get sign bit */
2394 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2395 sign = LLVMBuildAnd(builder, sign, mask, "");
2396
2397 /* sign * 0.5 */
2398 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2399 half = LLVMBuildOr(builder, sign, half, "");
2400 half = LLVMBuildBitCast(builder, half, vec_type, "");
2401 }
2402
2403 res = LLVMBuildFAdd(builder, a, half, "");
2404 }
2405
2406 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2407
2408 return res;
2409 }
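
/*
 * Fallback path sketch (illustrative): without native rounding support the
 * code adds a half carrying the sign of 'a' and then truncates, roughly
 *
 *    iround(a) = (int)(a + copysign(0.49999997, a))
 *
 * e.g. iround(0.9) = (int)1.39999997 = 1 and iround(-0.9) = (int)-1.39999997
 * = -1. The nextafterf(0.5, 0.0) constant avoids values just below 0.5
 * being rounded up to 1.0 by the addition before the truncation.
 */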
2410
2411
2412 /**
2413 * Return floor of float (vector), result is an int (vector)
2414 * Ex: ifloor(1.1) = 1
2415 * Ex: ifloor(-1.1) = -2
2416 */
2417 LLVMValueRef
2418 lp_build_ifloor(struct lp_build_context *bld,
2419 LLVMValueRef a)
2420 {
2421 LLVMBuilderRef builder = bld->gallivm->builder;
2422 const struct lp_type type = bld->type;
2423 LLVMTypeRef int_vec_type = bld->int_vec_type;
2424 LLVMValueRef res;
2425
2426 assert(type.floating);
2427 assert(lp_check_value(type, a));
2428
2429 res = a;
2430 if (type.sign) {
2431 if (arch_rounding_available(type)) {
2432 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2433 }
2434 else {
2435 struct lp_type inttype;
2436 struct lp_build_context intbld;
2437 LLVMValueRef trunc, itrunc, mask;
2438
2439 assert(type.floating);
2440 assert(lp_check_value(type, a));
2441
2442 inttype = type;
2443 inttype.floating = 0;
2444 lp_build_context_init(&intbld, bld->gallivm, inttype);
2445
2446 /* round by truncation */
2447 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2448 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2449
2450 /*
2451 * fix values if rounding is wrong (for non-special cases)
2452 * - this is the case if trunc > a
2453 * The results of doing this with NaNs, very large values etc.
2454 * are undefined but this seems to be the case anyway.
2455 */
2456 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2457 /* cheapie minus one with mask since the mask is minus one / zero */
2458 return lp_build_add(&intbld, itrunc, mask);
2459 }
2460 }
2461
2462 /* round toward zero (truncate) */
2463 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2464
2465 return res;
2466 }
2467
2468
2469 /**
2470 * Return ceiling of float (vector), returning int (vector).
2471 * Ex: iceil( 1.1) = 2
2472 * Ex: iceil(-1.1) = -1
2473 */
2474 LLVMValueRef
2475 lp_build_iceil(struct lp_build_context *bld,
2476 LLVMValueRef a)
2477 {
2478 LLVMBuilderRef builder = bld->gallivm->builder;
2479 const struct lp_type type = bld->type;
2480 LLVMTypeRef int_vec_type = bld->int_vec_type;
2481 LLVMValueRef res;
2482
2483 assert(type.floating);
2484 assert(lp_check_value(type, a));
2485
2486 if (arch_rounding_available(type)) {
2487 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2488 }
2489 else {
2490 struct lp_type inttype;
2491 struct lp_build_context intbld;
2492 LLVMValueRef trunc, itrunc, mask;
2493
2494 assert(type.floating);
2495 assert(lp_check_value(type, a));
2496
2497 inttype = type;
2498 inttype.floating = 0;
2499 lp_build_context_init(&intbld, bld->gallivm, inttype);
2500
2501 /* round by truncation */
2502 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2503 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2504
2505 /*
2506 * fix values if rounding is wrong (for non-special cases)
2507 * - this is the case if trunc < a
2508 * The results of doing this with NaNs, very large values etc.
2509 * are undefined but this seems to be the case anyway.
2510 */
2511 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2512 /* cheapie plus one with mask since the mask is minus one / zero */
2513 return lp_build_sub(&intbld, itrunc, mask);
2514 }
2515
2516 /* round toward zero (truncate) */
2517 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2518
2519 return res;
2520 }
2521
2522
2523 /**
2524 * Combined ifloor() & fract().
2525 *
2526 * Preferred to calling the functions separately, as it will ensure that the
2527 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2528 */
2529 void
2530 lp_build_ifloor_fract(struct lp_build_context *bld,
2531 LLVMValueRef a,
2532 LLVMValueRef *out_ipart,
2533 LLVMValueRef *out_fpart)
2534 {
2535 LLVMBuilderRef builder = bld->gallivm->builder;
2536 const struct lp_type type = bld->type;
2537 LLVMValueRef ipart;
2538
2539 assert(type.floating);
2540 assert(lp_check_value(type, a));
2541
2542 if (arch_rounding_available(type)) {
2543 /*
2544 * floor() is easier.
2545 */
2546
2547 ipart = lp_build_floor(bld, a);
2548 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2549 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2550 }
2551 else {
2552 /*
2553 * ifloor() is easier.
2554 */
2555
2556 *out_ipart = lp_build_ifloor(bld, a);
2557 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2558 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2559 }
2560 }
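
/*
 * Usage sketch (illustrative, hypothetical caller): texture coordinate
 * wrapping typically wants both parts at once, e.g.
 *
 *    LLVMValueRef icoord, fcoord;
 *    lp_build_ifloor_fract(bld, coord, &icoord, &fcoord);
 *    // icoord: int vector, fcoord: float vector in [0, 1]
 *
 * which avoids doing the floor()/ifloor() conversions twice compared to
 * calling lp_build_ifloor() and lp_build_fract() separately.
 */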
2561
2562
2563 /**
2564 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2565 * always smaller than one.
2566 */
2567 void
2568 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2569 LLVMValueRef a,
2570 LLVMValueRef *out_ipart,
2571 LLVMValueRef *out_fpart)
2572 {
2573 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2574 *out_fpart = clamp_fract(bld, *out_fpart);
2575 }
2576
2577
2578 LLVMValueRef
2579 lp_build_sqrt(struct lp_build_context *bld,
2580 LLVMValueRef a)
2581 {
2582 LLVMBuilderRef builder = bld->gallivm->builder;
2583 const struct lp_type type = bld->type;
2584 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2585 char intrinsic[32];
2586
2587 assert(lp_check_value(type, a));
2588
2589 assert(type.floating);
2590 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2591
2592 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2593 }
2594
2595
2596 /**
2597 * Do one Newton-Raphson step to improve reciprocal precision:
2598 *
2599 * x_{i+1} = x_i + x_i * (1 - a * x_i)
2600 *
2601 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2602 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2603 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2604 * halo. It would be necessary to clamp the argument to prevent this.
2605 *
2606 * See also:
2607 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2608 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2609 */
2610 static inline LLVMValueRef
2611 lp_build_rcp_refine(struct lp_build_context *bld,
2612 LLVMValueRef a,
2613 LLVMValueRef rcp_a)
2614 {
2615 LLVMBuilderRef builder = bld->gallivm->builder;
2616 LLVMValueRef neg_a;
2617 LLVMValueRef res;
2618
2619 neg_a = LLVMBuildFNeg(builder, a, "");
2620 res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2621 res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2622
2623 return res;
2624 }
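
/*
 * Worked Newton-Raphson example (illustrative): for a = 3.0 with an initial
 * estimate x0 = 0.3 (true value 0.3333...):
 *
 *    x1 = 0.3  + 0.3  * (1 - 3.0 * 0.3 ) = 0.33
 *    x2 = 0.33 + 0.33 * (1 - 3.0 * 0.33) = 0.3333
 *
 * i.e. the error roughly squares each step, so one or two refinements after
 * a low-precision RCPPS estimate would suffice for float32.
 */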
2625
2626
2627 LLVMValueRef
2628 lp_build_rcp(struct lp_build_context *bld,
2629 LLVMValueRef a)
2630 {
2631 LLVMBuilderRef builder = bld->gallivm->builder;
2632 const struct lp_type type = bld->type;
2633
2634 assert(lp_check_value(type, a));
2635
2636 if(a == bld->zero)
2637 return bld->undef;
2638 if(a == bld->one)
2639 return bld->one;
2640 if(a == bld->undef)
2641 return bld->undef;
2642
2643 assert(type.floating);
2644
2645 if(LLVMIsConstant(a))
2646 return LLVMConstFDiv(bld->one, a);
2647
2648 /*
2649 * We don't use RCPPS because:
2650 * - it only has 10 bits of precision
2651 * - it doesn't even get the reciprocal of 1.0 exactly
2652 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2653 * - for recent processors the benefit over DIVPS is marginal, and case
2654 * dependent
2655 *
2656 * We could still use it on certain processors if benchmarks show that the
2657 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2658 * particular uses that require fewer workarounds.
2659 */
2660
2661 if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2662 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
2663 const unsigned num_iterations = 0;
2664 LLVMValueRef res;
2665 unsigned i;
2666 const char *intrinsic = NULL;
2667
2668 if (type.length == 4) {
2669 intrinsic = "llvm.x86.sse.rcp.ps";
2670 }
2671 else {
2672 intrinsic = "llvm.x86.avx.rcp.ps.256";
2673 }
2674
2675 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2676
2677 for (i = 0; i < num_iterations; ++i) {
2678 res = lp_build_rcp_refine(bld, a, res);
2679 }
2680
2681 return res;
2682 }
2683
2684 return LLVMBuildFDiv(builder, bld->one, a, "");
2685 }
2686
2687
2688 /**
2689 * Do one Newton-Raphson step to improve rsqrt precision:
2690 *
2691 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2692 *
2693 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2694 */
2695 static inline LLVMValueRef
2696 lp_build_rsqrt_refine(struct lp_build_context *bld,
2697 LLVMValueRef a,
2698 LLVMValueRef rsqrt_a)
2699 {
2700 LLVMBuilderRef builder = bld->gallivm->builder;
2701 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2702 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2703 LLVMValueRef res;
2704
2705 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2706 res = LLVMBuildFMul(builder, a, res, "");
2707 res = LLVMBuildFSub(builder, three, res, "");
2708 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2709 res = LLVMBuildFMul(builder, half, res, "");
2710
2711 return res;
2712 }
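
/*
 * Worked example (illustrative): for a = 4.0 with an initial estimate
 * x0 = 0.51 (true value 0.5):
 *
 *    x1 = 0.5 * 0.51 * (3.0 - 4.0 * 0.51 * 0.51) = 0.4997
 *
 * again converging quadratically toward 1/sqrt(a).
 */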
2713
2714
2715 /**
2716 * Generate 1/sqrt(a).
2717 * Result is undefined for values < 0, infinity for +0.
2718 */
2719 LLVMValueRef
2720 lp_build_rsqrt(struct lp_build_context *bld,
2721 LLVMValueRef a)
2722 {
2723 const struct lp_type type = bld->type;
2724
2725 assert(lp_check_value(type, a));
2726
2727 assert(type.floating);
2728
2729 /*
2730 * This should be faster but all denormals will end up as infinity.
2731 */
2732 if (0 && lp_build_fast_rsqrt_available(type)) {
2733 const unsigned num_iterations = 1;
2734 LLVMValueRef res;
2735 unsigned i;
2736
2737 /* rsqrt(1.0) != 1.0 here */
2738 res = lp_build_fast_rsqrt(bld, a);
2739
2740 if (num_iterations) {
2741 /*
2742 * Newton-Raphson will result in NaN instead of infinity for zero,
2743 * and NaN instead of zero for infinity.
2744 * Also, need to ensure rsqrt(1.0) == 1.0.
2745 * All numbers smaller than FLT_MIN will result in +infinity
2746 * (rsqrtps treats all denormals as zero).
2747 */
2748 LLVMValueRef cmp;
2749 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2750 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2751
2752 for (i = 0; i < num_iterations; ++i) {
2753 res = lp_build_rsqrt_refine(bld, a, res);
2754 }
2755 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2756 res = lp_build_select(bld, cmp, inf, res);
2757 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2758 res = lp_build_select(bld, cmp, bld->zero, res);
2759 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2760 res = lp_build_select(bld, cmp, bld->one, res);
2761 }
2762
2763 return res;
2764 }
2765
2766 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2767 }
2768
2769 /**
2770 * If there's a fast (inaccurate) rsqrt instruction available
2771 * (the caller may want to avoid calling rsqrt_fast if it's not available,
2772 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if
2773 * unavailable that would result in sqrt/div/mul, so it is obviously
2774 * much better to just call sqrt, skipping both div and mul).
2775 */
2776 boolean
2777 lp_build_fast_rsqrt_available(struct lp_type type)
2778 {
2779 assert(type.floating);
2780
2781 if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2782 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2783 return true;
2784 }
2785 return false;
2786 }
2787
2788
2789 /**
2790 * Generate 1/sqrt(a).
2791 * Result is undefined for values < 0, infinity for +0.
2792 * Precision is limited, only ~10 bits guaranteed
2793 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2794 */
2795 LLVMValueRef
2796 lp_build_fast_rsqrt(struct lp_build_context *bld,
2797 LLVMValueRef a)
2798 {
2799 LLVMBuilderRef builder = bld->gallivm->builder;
2800 const struct lp_type type = bld->type;
2801
2802 assert(lp_check_value(type, a));
2803
2804 if (lp_build_fast_rsqrt_available(type)) {
2805 const char *intrinsic = NULL;
2806
2807 if (type.length == 4) {
2808 intrinsic = "llvm.x86.sse.rsqrt.ps";
2809 }
2810 else {
2811 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2812 }
2813 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2814 }
2815 else {
2816 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2817 }
2818 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2819 }
2820
2821
2822 /**
2823 * Generate sin(a) or cos(a) using polynomial approximation.
2824 * TODO: it might be worth recognizing sin and cos using the same source
2825 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2826 * would be way cheaper than calculating (nearly) everything twice...
2827 * Not sure it's common enough to be worth bothering with, however; the scs
2828 * opcode could also benefit from calculating both.
2829 */
2830 static LLVMValueRef
2831 lp_build_sin_or_cos(struct lp_build_context *bld,
2832 LLVMValueRef a,
2833 boolean cos)
2834 {
2835 struct gallivm_state *gallivm = bld->gallivm;
2836 LLVMBuilderRef b = gallivm->builder;
2837 struct lp_type int_type = lp_int_type(bld->type);
2838
2839 /*
2840 * take the absolute value,
2841 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2842 */
2843
2844 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2845 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2846
2847 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2848 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2849
2850 /*
2851 * scale by 4/Pi
2852 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2853 */
2854
2855 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2856 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2857
2858 /*
2859 * store the integer part of y in mm0
2860 * emm2 = _mm_cvttps_epi32(y);
2861 */
2862
2863 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2864
2865 /*
2866 * j=(j+1) & (~1) (see the cephes sources)
2867 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2868 */
2869
2870 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2871 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2872 /*
2873 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2874 */
2875 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2876 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2877
2878 /*
2879 * y = _mm_cvtepi32_ps(emm2);
2880 */
2881 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2882
2883 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2884 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2885 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2886 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2887
2888 /*
2889 * Argument used for poly selection and sign bit determination
2890 * is different for sin vs. cos.
2891 */
2892 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2893 emm2_and;
2894
2895 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2896 LLVMBuildNot(b, emm2_2, ""), ""),
2897 const_29, "sign_bit") :
2898 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2899 LLVMBuildShl(b, emm2_add,
2900 const_29, ""), ""),
2901 sign_mask, "sign_bit");
2902
2903 /*
2904 * get the polynomial selection mask
2905 * there is one polynomial for 0 <= x <= Pi/4
2906 * and another one for Pi/4 < x <= Pi/2
2907 * Both branches will be computed.
2908 *
2909 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2910 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2911 */
2912
2913 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2914 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2915 int_type, PIPE_FUNC_EQUAL,
2916 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2917
2918 /*
2919 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2920 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2921 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2922 */
2923 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2924 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2925 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2926
2927 /*
2928 * The magic pass: "Extended precision modular arithmetic"
2929 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2930 */
2931 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2932 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2933 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2934
2935 /*
2936 * Evaluate the first polynomial (0 <= x <= Pi/4)
2937 *
2938 * z = _mm_mul_ps(x,x);
2939 */
2940 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2941
2942 /*
2943 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2944 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2945 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2946 */
2947 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2948 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2949 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2950
2951 /*
2952 * y = *(v4sf*)_ps_coscof_p0;
2953 * y = _mm_mul_ps(y, z);
2954 */
2955 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2956 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2957 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2958 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2959
2960
2961 /*
2962 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2963 * y = _mm_sub_ps(y, tmp);
2964 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2965 */
2966 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2967 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2968 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2969 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2970 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2971
2972 /*
2973 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2974 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2975 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2976 */
2977 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2978 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2979 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2980
2981 /*
2982 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2983 *
2984 * y2 = *(v4sf*)_ps_sincof_p0;
2985 * y2 = _mm_mul_ps(y2, z);
2986 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2987 * y2 = _mm_mul_ps(y2, z);
2988 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2989 * y2 = _mm_mul_ps(y2, z);
2990 * y2 = _mm_mul_ps(y2, x);
2991 * y2 = _mm_add_ps(y2, x);
2992 */
2993
2994 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2995 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2996 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2997 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
2998
2999 /*
3000 * select the correct result from the two polynomials
3001 * xmm3 = poly_mask;
3002 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3003 * y = _mm_andnot_ps(xmm3, y);
3004 * y = _mm_or_ps(y,y2);
3005 */
3006 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3007 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3008 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3009 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3010 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3011 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3012
3013 /*
3014 * update the sign
3015 * y = _mm_xor_ps(y, sign_bit);
3016 */
3017 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3018 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3019
3020 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3021
3022 /* clamp output to be within [-1, 1] */
3023 y_result = lp_build_clamp(bld, y_result,
3024 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3025 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3026 /* If a is -inf, inf or NaN then return NaN */
3027 y_result = lp_build_select(bld, isfinite, y_result,
3028 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3029 return y_result;
3030 }
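
/*
 * Range reduction recap (illustrative): the code above follows the
 * cephes/SSE approach: derive an even octant index from |a| * 4/Pi, fold
 * |a| into a reduced argument x = |a| - y * Pi/4 (split into the DP1..DP3
 * steps for extra precision), evaluate both the sine and cosine minimax
 * polynomials on x, and use poly_mask/sign_bit to pick the right one and
 * restore the sign. E.g. for sin(3.0) the reduced argument is about
 * 3.0 - Pi = -0.1416, the sine polynomial is selected and the sign is
 * flipped, giving roughly 0.141.
 */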
3031
3032
3033 /**
3034 * Generate sin(a)
3035 */
3036 LLVMValueRef
3037 lp_build_sin(struct lp_build_context *bld,
3038 LLVMValueRef a)
3039 {
3040 const struct lp_type type = bld->type;
3041
3042 if (type.width == 16) {
3043 LLVMBuilderRef builder = bld->gallivm->builder;
3044 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3045 char intrinsic[32];
3046 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sin", vec_type);
3047 LLVMValueRef args[] = { a };
3048 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3049 }
3050
3051 return lp_build_sin_or_cos(bld, a, FALSE);
3052 }
3053
3054
3055 /**
3056 * Generate cos(a)
3057 */
3058 LLVMValueRef
3059 lp_build_cos(struct lp_build_context *bld,
3060 LLVMValueRef a)
3061 {
3062 const struct lp_type type = bld->type;
3063
3064 if (type.width == 16) {
3065 LLVMBuilderRef builder = bld->gallivm->builder;
3066 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3067 char intrinsic[32];
3068 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.cos", vec_type);
3069 LLVMValueRef args[] = { a };
3070 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3071 }
3072
3073 return lp_build_sin_or_cos(bld, a, TRUE);
3074 }
3075
3076
3077 /**
3078 * Generate pow(x, y)
3079 */
3080 LLVMValueRef
3081 lp_build_pow(struct lp_build_context *bld,
3082 LLVMValueRef x,
3083 LLVMValueRef y)
3084 {
3085 /* TODO: optimize the constant case */
3086 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3087 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3088 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3089 __FUNCTION__);
3090 }
3091
3092 LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
3093 LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2_safe(bld, x), y));
3094
3095 res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
3096 return res;
3097 }
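
/*
 * Identity used above (illustrative): pow(x, y) = exp2(y * log2(x)) for
 * x > 0, e.g. pow(2.0, 10.0) = exp2(10 * 1) = 1024. The final select
 * forces the result to 0.0 whenever x == 0, instead of whatever
 * exp2(y * log2_safe(0)) would otherwise produce.
 */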
3098
3099
3100 /**
3101 * Generate exp(x)
3102 */
3103 LLVMValueRef
3104 lp_build_exp(struct lp_build_context *bld,
3105 LLVMValueRef x)
3106 {
3107 /* log2(e) = 1/log(2) */
3108 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3109 1.4426950408889634);
3110
3111 assert(lp_check_value(bld->type, x));
3112
3113 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3114 }
3115
3116
3117 /**
3118 * Generate log(x)
3119 * Behavior is undefined with infs, 0s and nans
3120 */
3121 LLVMValueRef
3122 lp_build_log(struct lp_build_context *bld,
3123 LLVMValueRef x)
3124 {
3125 /* log(2) */
3126 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3127 0.69314718055994529);
3128
3129 assert(lp_check_value(bld->type, x));
3130
3131 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3132 }
3133
3134 /**
3135 * Generate log(x) that handles edge cases (infs, 0s and nans)
3136 */
3137 LLVMValueRef
3138 lp_build_log_safe(struct lp_build_context *bld,
3139 LLVMValueRef x)
3140 {
3141 /* log(2) */
3142 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3143 0.69314718055994529);
3144
3145 assert(lp_check_value(bld->type, x));
3146
3147 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3148 }
3149
3150
3151 /**
3152 * Generate polynomial.
3153 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3154 */
3155 LLVMValueRef
3156 lp_build_polynomial(struct lp_build_context *bld,
3157 LLVMValueRef x,
3158 const double *coeffs,
3159 unsigned num_coeffs)
3160 {
3161 const struct lp_type type = bld->type;
3162 LLVMValueRef even = NULL, odd = NULL;
3163 LLVMValueRef x2;
3164 unsigned i;
3165
3166 assert(lp_check_value(bld->type, x));
3167
3168 /* TODO: optimize the constant case */
3169 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3170 LLVMIsConstant(x)) {
3171 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3172 __FUNCTION__);
3173 }
3174
3175 /*
3176 * Calculate odd and even terms separately to decrease data dependency
3177 * Ex:
3178 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3179 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3180 */
3181 x2 = lp_build_mul(bld, x, x);
3182
3183 for (i = num_coeffs; i--; ) {
3184 LLVMValueRef coeff;
3185
3186 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3187
3188 if (i % 2 == 0) {
3189 if (even)
3190 even = lp_build_mad(bld, x2, even, coeff);
3191 else
3192 even = coeff;
3193 } else {
3194 if (odd)
3195 odd = lp_build_mad(bld, x2, odd, coeff);
3196 else
3197 odd = coeff;
3198 }
3199 }
3200
3201 if (odd)
3202 return lp_build_mad(bld, odd, x, even);
3203 else if (even)
3204 return even;
3205 else
3206 return bld->undef;
3207 }
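
/*
 * Evaluation order sketch (illustrative): for coeffs {c0, c1, c2, c3} the
 * loop above builds
 *
 *    even = c0 + x^2 * c2
 *    odd  = c1 + x^2 * c3
 *    res  = even + x * odd
 *
 * which equals the usual Horner form c0 + x*(c1 + x*(c2 + x*c3)) but lets
 * the even and odd chains be evaluated independently, roughly halving the
 * dependency depth.
 */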
3208
3209
3210 /**
3211 * Minimax polynomial fit of 2**x, in range [0, 1[
3212 */
3213 const double lp_build_exp2_polynomial[] = {
3214 #if EXP_POLY_DEGREE == 5
3215 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3216 0.693153073200168932794,
3217 0.240153617044375388211,
3218 0.0558263180532956664775,
3219 0.00898934009049466391101,
3220 0.00187757667519147912699
3221 #elif EXP_POLY_DEGREE == 4
3222 1.00000259337069434683,
3223 0.693003834469974940458,
3224 0.24144275689150793076,
3225 0.0520114606103070150235,
3226 0.0135341679161270268764
3227 #elif EXP_POLY_DEGREE == 3
3228 0.999925218562710312959,
3229 0.695833540494823811697,
3230 0.226067155427249155588,
3231 0.0780245226406372992967
3232 #elif EXP_POLY_DEGREE == 2
3233 1.00172476321474503578,
3234 0.657636275736077639316,
3235 0.33718943461968720704
3236 #else
3237 #error
3238 #endif
3239 };
3240
3241
3242 LLVMValueRef
3243 lp_build_exp2(struct lp_build_context *bld,
3244 LLVMValueRef x)
3245 {
3246 LLVMBuilderRef builder = bld->gallivm->builder;
3247 const struct lp_type type = bld->type;
3248 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3249 LLVMValueRef ipart = NULL;
3250 LLVMValueRef fpart = NULL;
3251 LLVMValueRef expipart = NULL;
3252 LLVMValueRef expfpart = NULL;
3253 LLVMValueRef res = NULL;
3254
3255 if (type.floating && type.width == 16) {
3256 char intrinsic[32];
3257 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.exp2", vec_type);
3258 LLVMValueRef args[] = { x };
3259 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3260 }
3261
3262 assert(lp_check_value(bld->type, x));
3263
3264 /* TODO: optimize the constant case */
3265 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3266 LLVMIsConstant(x)) {
3267 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3268 __FUNCTION__);
3269 }
3270
3271 assert(type.floating && type.width == 32);
3272
3273 /* We want to preserve NaN and make sure that for exp2, if x > 128
3274 * the result is INF and if it's smaller than -126.9 the result is 0. */
3275 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3276 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3277 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3278 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3279
3280 /* ipart = floor(x) */
3281 /* fpart = x - ipart */
3282 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3283
3284 /* expipart = (float) (1 << ipart) */
3285 expipart = LLVMBuildAdd(builder, ipart,
3286 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3287 expipart = LLVMBuildShl(builder, expipart,
3288 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3289 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3290
3291 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3292 ARRAY_SIZE(lp_build_exp2_polynomial));
3293
3294 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3295
3296 return res;
3297 }
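
/*
 * Decomposition used above (illustrative): 2^x = 2^ipart * 2^fpart with
 * ipart = floor(x) and fpart in [0, 1). The 2^ipart factor is built
 * directly as an IEEE-754 bit pattern, (ipart + 127) << 23, i.e. a biased
 * exponent with a zero mantissa. E.g. for x = 3.5: ipart = 3 yields the
 * bits of 8.0, the polynomial gives 2^0.5 ~= 1.4142, and
 * 8.0 * 1.4142 ~= 11.31 ~= 2^3.5.
 */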
3298
3299
3300
3301 /**
3302 * Extract the exponent of an IEEE-754 floating point value.
3303 *
3304 * Optionally apply an integer bias.
3305 *
3306 * Result is an integer value with
3307 *
3308 * ifloor(log2(x)) + bias
3309 */
3310 LLVMValueRef
3311 lp_build_extract_exponent(struct lp_build_context *bld,
3312 LLVMValueRef x,
3313 int bias)
3314 {
3315 LLVMBuilderRef builder = bld->gallivm->builder;
3316 const struct lp_type type = bld->type;
3317 unsigned mantissa = lp_mantissa(type);
3318 LLVMValueRef res;
3319
3320 assert(type.floating);
3321
3322 assert(lp_check_value(bld->type, x));
3323
3324 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3325
3326 res = LLVMBuildLShr(builder, x,
3327 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3328 res = LLVMBuildAnd(builder, res,
3329 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3330 res = LLVMBuildSub(builder, res,
3331 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3332
3333 return res;
3334 }
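
/*
 * Worked example (illustrative): x = 12.0f has bit pattern 0x41400000;
 * shifting right by the 23 mantissa bits gives 130, masking with 255 keeps
 * 130, and subtracting 127 - bias yields 3 + bias, i.e.
 * ifloor(log2(12.0)) for bias == 0.
 */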
3335
3336
3337 /**
3338 * Extract the mantissa of a floating point value.
3339 *
3340 * Result is a floating point value with
3341 *
3342 * x / 2^floor(log2(x))
3343 */
3344 LLVMValueRef
3345 lp_build_extract_mantissa(struct lp_build_context *bld,
3346 LLVMValueRef x)
3347 {
3348 LLVMBuilderRef builder = bld->gallivm->builder;
3349 const struct lp_type type = bld->type;
3350 unsigned mantissa = lp_mantissa(type);
3351 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3352 (1ULL << mantissa) - 1);
3353 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3354 LLVMValueRef res;
3355
3356 assert(lp_check_value(bld->type, x));
3357
3358 assert(type.floating);
3359
3360 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3361
3362 /* res = x / 2**ipart */
3363 res = LLVMBuildAnd(builder, x, mantmask, "");
3364 res = LLVMBuildOr(builder, res, one, "");
3365 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3366
3367 return res;
3368 }
3369
3370
3371
3372 /**
3373 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3374 * These coefficients can be generated with
3375 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3376 */
3377 const double lp_build_log2_polynomial[] = {
3378 #if LOG_POLY_DEGREE == 5
3379 2.88539008148777786488L,
3380 0.961796878841293367824L,
3381 0.577058946784739859012L,
3382 0.412914355135828735411L,
3383 0.308591899232910175289L,
3384 0.352376952300281371868L,
3385 #elif LOG_POLY_DEGREE == 4
3386 2.88539009343309178325L,
3387 0.961791550404184197881L,
3388 0.577440339438736392009L,
3389 0.403343858251329912514L,
3390 0.406718052498846252698L,
3391 #elif LOG_POLY_DEGREE == 3
3392 2.88538959748872753838L,
3393 0.961932915889597772928L,
3394 0.571118517972136195241L,
3395 0.493997535084709500285L,
3396 #else
3397 #error
3398 #endif
3399 };
3400
3401 /**
3402 * See http://www.devmaster.net/forums/showthread.php?p=43580
3403 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3404 * http://www.nezumi.demon.co.uk/consult/logx.htm
3405 *
3406 * If handle_edge_cases is true the function will perform computations
3407 * to match the required D3D10+ behavior for each of the edge cases.
3408 * That means that if input is:
3409 * - less than zero (to and including -inf) then NaN will be returned
3410 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3411 * - +infinity, then +infinity will be returned
3412 * - NaN, then NaN will be returned
3413 *
3414 * Those checks are fairly expensive so if you don't need them make sure
3415 * handle_edge_cases is false.
3416 */
3417 void
3418 lp_build_log2_approx(struct lp_build_context *bld,
3419 LLVMValueRef x,
3420 LLVMValueRef *p_exp,
3421 LLVMValueRef *p_floor_log2,
3422 LLVMValueRef *p_log2,
3423 boolean handle_edge_cases)
3424 {
3425 LLVMBuilderRef builder = bld->gallivm->builder;
3426 const struct lp_type type = bld->type;
3427 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3428 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3429
3430 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3431 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3432 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3433
3434 LLVMValueRef i = NULL;
3435 LLVMValueRef y = NULL;
3436 LLVMValueRef z = NULL;
3437 LLVMValueRef exp = NULL;
3438 LLVMValueRef mant = NULL;
3439 LLVMValueRef logexp = NULL;
3440 LLVMValueRef p_z = NULL;
3441 LLVMValueRef res = NULL;
3442
3443 if (bld->type.width == 16) {
3444 char intrinsic[32];
3445 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.log2", bld->vec_type);
3446 LLVMValueRef args[] = { x };
3447 if (p_log2)
3448 *p_log2 = lp_build_intrinsic(builder, intrinsic, bld->vec_type, args, 1, 0);
3449 return;
3450 }
3451
3452 assert(lp_check_value(bld->type, x));
3453
3454 if(p_exp || p_floor_log2 || p_log2) {
3455 /* TODO: optimize the constant case */
3456 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3457 LLVMIsConstant(x)) {
3458 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3459 __FUNCTION__);
3460 }
3461
3462 assert(type.floating && type.width == 32);
3463
3464 /*
3465 * We don't explicitly handle denormalized numbers. They will yield a
3466 * result in the neighbourhood of -127, which appears to be
3467 * adequate.
3468 */
3469
3470 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3471
3472 /* exp = (float) exponent(x) */
3473 exp = LLVMBuildAnd(builder, i, expmask, "");
3474 }
3475
3476 if(p_floor_log2 || p_log2) {
3477 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3478 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3479 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3480 }
3481
3482 if (p_log2) {
3483 /* mant = 1 + (float) mantissa(x) */
3484 mant = LLVMBuildAnd(builder, i, mantmask, "");
3485 mant = LLVMBuildOr(builder, mant, one, "");
3486 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3487
3488 /* y = (mant - 1) / (mant + 1) */
3489 y = lp_build_div(bld,
3490 lp_build_sub(bld, mant, bld->one),
3491 lp_build_add(bld, mant, bld->one)
3492 );
3493
3494 /* z = y^2 */
3495 z = lp_build_mul(bld, y, y);
3496
3497 /* compute P(z) */
3498 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3499 ARRAY_SIZE(lp_build_log2_polynomial));
3500
3501 /* y * P(z) + logexp */
3502 res = lp_build_mad(bld, y, p_z, logexp);
3503
3504 if (type.floating && handle_edge_cases) {
3505 LLVMValueRef negmask, infmask, zmask;
3506 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3507 lp_build_const_vec(bld->gallivm, type, 0.0f));
3508 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3509 lp_build_const_vec(bld->gallivm, type, 0.0f));
3510 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3511 lp_build_const_vec(bld->gallivm, type, INFINITY));
3512
3513 /* If x is equal to inf make sure we return inf */
3514 res = lp_build_select(bld, infmask,
3515 lp_build_const_vec(bld->gallivm, type, INFINITY),
3516 res);
3517 /* If x is equal to 0, return -inf */
3518 res = lp_build_select(bld, zmask,
3519 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3520 res);
3521 /* If x is nan or less than 0, return nan */
3522 res = lp_build_select(bld, negmask,
3523 lp_build_const_vec(bld->gallivm, type, NAN),
3524 res);
3525 }
3526 }
3527
3528 if (p_exp) {
3529 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3530 *p_exp = exp;
3531 }
3532
3533 if (p_floor_log2)
3534 *p_floor_log2 = logexp;
3535
3536 if (p_log2)
3537 *p_log2 = res;
3538 }
3539
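/*
 * Scalar reference of the p_log2 path above (an illustrative sketch only;
 * it assumes lp_build_polynomial() treats element 0 of the coefficient
 * table as the constant term, as the leading 2/ln(2) value suggests):
 *
 *    union { float f; uint32_t i; } u = { x };
 *    float logexp = (float)((int)((u.i >> 23) & 0xff) - 127);
 *    u.i = (u.i & 0x007fffff) | 0x3f800000;          // mant in [1, 2)
 *    float y = (u.f - 1.0f) / (u.f + 1.0f);
 *    float z = y * y;
 *    int n = ARRAY_SIZE(lp_build_log2_polynomial);
 *    float p = lp_build_log2_polynomial[n - 1];
 *    for (int i = n - 2; i >= 0; i--)
 *       p = p * z + lp_build_log2_polynomial[i];     // Horner evaluation
 *    float log2_x = y * p + logexp;                  // y * P(z) + logexp
 */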
3540
3541 /*
3542 * log2 implementation which doesn't have special code to
3543 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3544 * the results for those cases are undefined.
3545 */
3546 LLVMValueRef
3547 lp_build_log2(struct lp_build_context *bld,
3548 LLVMValueRef x)
3549 {
3550 LLVMValueRef res;
3551 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3552 return res;
3553 }
3554
3555 /*
3556 * Version of log2 which handles all edge cases.
3557 * Look at documentation of lp_build_log2_approx for
3558 * description of the behavior for each of the edge cases.
3559 */
3560 LLVMValueRef
3561 lp_build_log2_safe(struct lp_build_context *bld,
3562 LLVMValueRef x)
3563 {
3564 LLVMValueRef res;
3565 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3566 return res;
3567 }
3568
3569
3570 /**
3571 * Faster (and less accurate) log2.
3572 *
3573 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3574 *
3575 * Piece-wise linear approximation, with exact results when x is a
3576 * power of two.
3577 *
3578 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3579 */
3580 LLVMValueRef
3581 lp_build_fast_log2(struct lp_build_context *bld,
3582 LLVMValueRef x)
3583 {
3584 LLVMBuilderRef builder = bld->gallivm->builder;
3585 LLVMValueRef ipart;
3586 LLVMValueRef fpart;
3587
3588 assert(lp_check_value(bld->type, x));
3589
3590 assert(bld->type.floating);
3591
3592 /* ipart = floor(log2(x)) - 1 */
3593 ipart = lp_build_extract_exponent(bld, x, -1);
3594 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3595
3596 /* fpart = x / 2**ipart */
3597 fpart = lp_build_extract_mantissa(bld, x);
3598
3599 /* ipart + fpart */
3600 return LLVMBuildFAdd(builder, ipart, fpart, "");
3601 }
3602
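/*
 * Worked example (illustrative only): for x = 6.0, ipart = floor(log2(6)) - 1
 * = 1 and fpart = 6/4 = 1.5, giving 2.5 versus the exact log2(6) ~= 2.585;
 * for x = 8.0, ipart = 2 and fpart = 1.0, giving the exact result 3.0.
 */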
3603
3604 /**
3605 * Fast implementation of iround(log2(x)).
3606 *
3607 * Not an approximation -- it should give accurate results all the time.
3608 */
3609 LLVMValueRef
3610 lp_build_ilog2(struct lp_build_context *bld,
3611 LLVMValueRef x)
3612 {
3613 LLVMBuilderRef builder = bld->gallivm->builder;
3614 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3615 LLVMValueRef ipart;
3616
3617 assert(bld->type.floating);
3618
3619 assert(lp_check_value(bld->type, x));
3620
3621 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3622 x = LLVMBuildFMul(builder, x, sqrt2, "");
3623
3624 /* ipart = floor(log2(x) + 0.5) */
3625 ipart = lp_build_extract_exponent(bld, x, 0);
3626
3627 return ipart;
3628 }
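/*
 * Why the sqrt(2) scaling rounds (sketch): log2(x * sqrt(2)) = log2(x) + 0.5,
 * so taking the floor-of-log2 exponent of the scaled value yields
 * round(log2(x)). E.g. x = 5.0: log2(5) ~= 2.32, 5*sqrt(2) ~= 7.07 ->
 * exponent 2; x = 6.0: log2(6) ~= 2.58, 6*sqrt(2) ~= 8.49 -> exponent 3.
 */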
3629
3630 LLVMValueRef
3631 lp_build_mod(struct lp_build_context *bld,
3632 LLVMValueRef x,
3633 LLVMValueRef y)
3634 {
3635 LLVMBuilderRef builder = bld->gallivm->builder;
3636 LLVMValueRef res;
3637 const struct lp_type type = bld->type;
3638
3639 assert(lp_check_value(type, x));
3640 assert(lp_check_value(type, y));
3641
3642 if (type.floating)
3643 res = LLVMBuildFRem(builder, x, y, "");
3644 else if (type.sign)
3645 res = LLVMBuildSRem(builder, x, y, "");
3646 else
3647 res = LLVMBuildURem(builder, x, y, "");
3648 return res;
3649 }
3650
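/*
 * Note that LLVM's srem/frem follow C remainder semantics, i.e. the result
 * takes the sign of the dividend, so for signed integer inputs -7 and 3 the
 * result is -1 (not an always-non-negative GLSL-style mod), and frem behaves
 * like fmod(), e.g. fmod(-7.5, 2.0) = -1.5.
 */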
3651
3652 /*
3653 * For floating point inputs it creates and returns a mask
3654 * which is all 1's for channels which are NaN.
3655 * Channels of x which are not NaN will be 0.
3656 */
3657 LLVMValueRef
3658 lp_build_isnan(struct lp_build_context *bld,
3659 LLVMValueRef x)
3660 {
3661 LLVMValueRef mask;
3662 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3663
3664 assert(bld->type.floating);
3665 assert(lp_check_value(bld->type, x));
3666
3667 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3668 "isnotnan");
3669 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3670 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3671 return mask;
3672 }
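/*
 * This works because an ordered equality compare of x with itself is false
 * only for NaN lanes; negating that and sign-extending the i1 result turns
 * it into the usual all-1's / all-0's integer mask, e.g. lanes {1.0, NaN}
 * yield {0x00000000, 0xffffffff} for 32-bit floats.
 */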
3673
3674 /* Returns all 1's for floating point numbers that are
3675 * finite and returns all 0's for -inf,
3676 * +inf and NaN. */
3677 LLVMValueRef
3678 lp_build_isfinite(struct lp_build_context *bld,
3679 LLVMValueRef x)
3680 {
3681 LLVMBuilderRef builder = bld->gallivm->builder;
3682 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3683 struct lp_type int_type = lp_int_type(bld->type);
3684 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3685 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3686 0x7f800000);
3687
3688 if (!bld->type.floating) {
3689 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3690 }
3691 assert(bld->type.floating);
3692 assert(lp_check_value(bld->type, x));
3693 assert(bld->type.width == 32);
3694
3695 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3696 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3697 intx, infornan32);
3698 }
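/*
 * Worked example (32-bit floats): a value is inf or NaN exactly when all
 * exponent bits are set, so 1.0f (0x3f800000) gives 0x3f800000 != 0x7f800000
 * -> finite (mask all 1's), while +inf (0x7f800000) or a NaN (e.g.
 * 0x7fc00000) gives 0x7f800000 == 0x7f800000 -> not finite (mask 0).
 */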
3699
3700 /*
3701 * Returns true if the number is nan or inf and false otherwise.
3702 * The input has to be a floating point vector.
3703 */
3704 LLVMValueRef
3705 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3706 const struct lp_type type,
3707 LLVMValueRef x)
3708 {
3709 LLVMBuilderRef builder = gallivm->builder;
3710 struct lp_type int_type = lp_int_type(type);
3711 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3712 0x7f800000);
3713 LLVMValueRef ret;
3714
3715 assert(type.floating);
3716
3717 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3718 ret = LLVMBuildAnd(builder, ret, const0, "");
3719 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3720 ret, const0);
3721
3722 return ret;
3723 }
3724
3725
3726 LLVMValueRef
3727 lp_build_fpstate_get(struct gallivm_state *gallivm)
3728 {
3729 if (util_get_cpu_caps()->has_sse) {
3730 LLVMBuilderRef builder = gallivm->builder;
3731 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3732 gallivm,
3733 LLVMInt32TypeInContext(gallivm->context),
3734 "mxcsr_ptr");
3735 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3736 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3737 lp_build_intrinsic(builder,
3738 "llvm.x86.sse.stmxcsr",
3739 LLVMVoidTypeInContext(gallivm->context),
3740 &mxcsr_ptr8, 1, 0);
3741 return mxcsr_ptr;
3742 }
3743 return 0;
3744 }
3745
3746 void
3747 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3748 boolean zero)
3749 {
3750 if (util_get_cpu_caps()->has_sse) {
3751 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3752 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3753
3754 LLVMBuilderRef builder = gallivm->builder;
3755 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3756 LLVMValueRef mxcsr =
3757 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3758
3759 if (util_get_cpu_caps()->has_daz) {
3760 /* Enable denormals-are-zero mode */
3761 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3762 }
3763 if (zero) {
3764 mxcsr = LLVMBuildOr(builder, mxcsr,
3765 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3766 } else {
3767 mxcsr = LLVMBuildAnd(builder, mxcsr,
3768 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3769 }
3770
3771 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3772 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3773 }
3774 }
3775
3776 void
3777 lp_build_fpstate_set(struct gallivm_state *gallivm,
3778 LLVMValueRef mxcsr_ptr)
3779 {
3780 if (util_get_cpu_caps()->has_sse) {
3781 LLVMBuilderRef builder = gallivm->builder;
3782 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3783 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3784 lp_build_intrinsic(builder,
3785 "llvm.x86.sse.ldmxcsr",
3786 LLVMVoidTypeInContext(gallivm->context),
3787 &mxcsr_ptr, 1, 0);
3788 }
3789 }
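/*
 * Typical usage pattern of the three helpers above (an illustrative sketch,
 * not a prescribed contract): save MXCSR, force flush-to-zero /
 * denormals-are-zero around generated math that doesn't need denormals,
 * then restore the caller's state:
 *
 *    LLVMValueRef fpstate = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit arithmetic ...
 *    lp_build_fpstate_set(gallivm, fpstate);
 */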
3790