1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include <llvm/Config/llvm-config.h>
51
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
56
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
67
68 #if defined(PIPE_ARCH_SSE)
69 #include <xmmintrin.h>
70 #endif
71
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #endif
75
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #endif
79
80 #define EXP_POLY_DEGREE 5
81
82 #define LOG_POLY_DEGREE 4
83
84
85 /**
86 * Generate min(a, b)
87 * No checks for the special-case values a or b = 1 or 0 are done.
88 * NaNs are handled according to the behavior specified by the
89 * nan_behavior argument.
90 */
91 static LLVMValueRef
92 lp_build_min_simple(struct lp_build_context *bld,
93 LLVMValueRef a,
94 LLVMValueRef b,
95 enum gallivm_nan_behavior nan_behavior)
96 {
97 const struct lp_type type = bld->type;
98 const char *intrinsic = NULL;
99 unsigned intr_size = 0;
100 LLVMValueRef cond;
101
102 assert(lp_check_value(type, a));
103 assert(lp_check_value(type, b));
104
105 /* TODO: optimize the constant case */
106
107 if (type.floating && util_get_cpu_caps()->has_sse) {
108 if (type.width == 32) {
109 if (type.length == 1) {
110 intrinsic = "llvm.x86.sse.min.ss";
111 intr_size = 128;
112 }
113 else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
114 intrinsic = "llvm.x86.sse.min.ps";
115 intr_size = 128;
116 }
117 else {
118 intrinsic = "llvm.x86.avx.min.ps.256";
119 intr_size = 256;
120 }
121 }
122 if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
123 if (type.length == 1) {
124 intrinsic = "llvm.x86.sse2.min.sd";
125 intr_size = 128;
126 }
127 else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
128 intrinsic = "llvm.x86.sse2.min.pd";
129 intr_size = 128;
130 }
131 else {
132 intrinsic = "llvm.x86.avx.min.pd.256";
133 intr_size = 256;
134 }
135 }
136 }
137 else if (type.floating && util_get_cpu_caps()->has_altivec) {
138 if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140 __FUNCTION__);
141 }
142 if (type.width == 32 && type.length == 4) {
143 intrinsic = "llvm.ppc.altivec.vminfp";
144 intr_size = 128;
145 }
146 } else if (util_get_cpu_caps()->has_altivec) {
147 intr_size = 128;
148 if (type.width == 8) {
149 if (!type.sign) {
150 intrinsic = "llvm.ppc.altivec.vminub";
151 } else {
152 intrinsic = "llvm.ppc.altivec.vminsb";
153 }
154 } else if (type.width == 16) {
155 if (!type.sign) {
156 intrinsic = "llvm.ppc.altivec.vminuh";
157 } else {
158 intrinsic = "llvm.ppc.altivec.vminsh";
159 }
160 } else if (type.width == 32) {
161 if (!type.sign) {
162 intrinsic = "llvm.ppc.altivec.vminuw";
163 } else {
164 intrinsic = "llvm.ppc.altivec.vminsw";
165 }
166 }
167 }
168
169 if (intrinsic) {
170 /* We need to handle NaNs for floating point numbers. If one of the
171 * inputs is a NaN the other should be returned (required by both D3D10+
172 * and OpenCL).
173 * The SSE intrinsics return the second operand in case of a NaN by
174 * default, so we need special code to handle those cases.
175 */
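/*
 * Illustrative sketch (not part of the generated code): with the SSE MIN
 * semantics a comparison involving a NaN is false and the second operand is
 * returned, so
 *   min(a = NaN, b)  -> b    (already the "other" operand, nothing to fix)
 *   min(a, b = NaN)  -> NaN  (wrong for GALLIVM_NAN_RETURN_OTHER)
 * hence the select below returns a whenever b is a NaN.
 */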
176 if (util_get_cpu_caps()->has_sse && type.floating &&
177 nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
178 LLVMValueRef isnan, min;
179 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
180 type,
181 intr_size, a, b);
182 isnan = lp_build_isnan(bld, b);
183 return lp_build_select(bld, isnan, a, min);
184 } else {
185 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
186 type,
187 intr_size, a, b);
188 }
189 }
190
191 if (type.floating) {
192 switch (nan_behavior) {
193 case GALLIVM_NAN_RETURN_OTHER: {
194 LLVMValueRef isnan = lp_build_isnan(bld, a);
195 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
196 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
197 return lp_build_select(bld, cond, a, b);
198 }
199 break;
200 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
201 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
202 return lp_build_select(bld, cond, a, b);
203 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
204 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
205 return lp_build_select(bld, cond, b, a);
206 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
207 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
208 return lp_build_select(bld, cond, a, b);
209 break;
210 default:
211 assert(0);
212 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
213 return lp_build_select(bld, cond, a, b);
214 }
215 } else {
216 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
217 return lp_build_select(bld, cond, a, b);
218 }
219 }
220
221
222 LLVMValueRef
223 lp_build_fmuladd(LLVMBuilderRef builder,
224 LLVMValueRef a,
225 LLVMValueRef b,
226 LLVMValueRef c)
227 {
228 LLVMTypeRef type = LLVMTypeOf(a);
229 assert(type == LLVMTypeOf(b));
230 assert(type == LLVMTypeOf(c));
231
232 char intrinsic[32];
233 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
234 LLVMValueRef args[] = { a, b, c };
235 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
236 }
237
238
239 /**
240 * Generate max(a, b)
241 * No checks for the special-case values a or b = 1 or 0 are done.
242 * NaNs are handled according to the behavior specified by the
243 * nan_behavior argument.
244 */
245 static LLVMValueRef
246 lp_build_max_simple(struct lp_build_context *bld,
247 LLVMValueRef a,
248 LLVMValueRef b,
249 enum gallivm_nan_behavior nan_behavior)
250 {
251 const struct lp_type type = bld->type;
252 const char *intrinsic = NULL;
253 unsigned intr_size = 0;
254 LLVMValueRef cond;
255
256 assert(lp_check_value(type, a));
257 assert(lp_check_value(type, b));
258
259 /* TODO: optimize the constant case */
260
261 if (type.floating && util_get_cpu_caps()->has_sse) {
262 if (type.width == 32) {
263 if (type.length == 1) {
264 intrinsic = "llvm.x86.sse.max.ss";
265 intr_size = 128;
266 }
267 else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
268 intrinsic = "llvm.x86.sse.max.ps";
269 intr_size = 128;
270 }
271 else {
272 intrinsic = "llvm.x86.avx.max.ps.256";
273 intr_size = 256;
274 }
275 }
276 if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
277 if (type.length == 1) {
278 intrinsic = "llvm.x86.sse2.max.sd";
279 intr_size = 128;
280 }
281 else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
282 intrinsic = "llvm.x86.sse2.max.pd";
283 intr_size = 128;
284 }
285 else {
286 intrinsic = "llvm.x86.avx.max.pd.256";
287 intr_size = 256;
288 }
289 }
290 }
291 else if (type.floating && util_get_cpu_caps()->has_altivec) {
292 if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
293 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
294 __FUNCTION__);
295 }
296 if (type.width == 32 && type.length == 4) {
297 intrinsic = "llvm.ppc.altivec.vmaxfp";
298 intr_size = 128;
299 }
300 } else if (util_get_cpu_caps()->has_altivec) {
301 intr_size = 128;
302 if (type.width == 8) {
303 if (!type.sign) {
304 intrinsic = "llvm.ppc.altivec.vmaxub";
305 } else {
306 intrinsic = "llvm.ppc.altivec.vmaxsb";
307 }
308 } else if (type.width == 16) {
309 if (!type.sign) {
310 intrinsic = "llvm.ppc.altivec.vmaxuh";
311 } else {
312 intrinsic = "llvm.ppc.altivec.vmaxsh";
313 }
314 } else if (type.width == 32) {
315 if (!type.sign) {
316 intrinsic = "llvm.ppc.altivec.vmaxuw";
317 } else {
318 intrinsic = "llvm.ppc.altivec.vmaxsw";
319 }
320 }
321 }
322
323 if (intrinsic) {
324 if (util_get_cpu_caps()->has_sse && type.floating &&
325 nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
326 LLVMValueRef isnan, max;
327 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
328 type,
329 intr_size, a, b);
330 isnan = lp_build_isnan(bld, b);
331 return lp_build_select(bld, isnan, a, max);
332 } else {
333 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
334 type,
335 intr_size, a, b);
336 }
337 }
338
339 if (type.floating) {
340 switch (nan_behavior) {
341 case GALLIVM_NAN_RETURN_OTHER: {
342 LLVMValueRef isnan = lp_build_isnan(bld, a);
343 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
344 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
345 return lp_build_select(bld, cond, a, b);
346 }
347 break;
348 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
349 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
350 return lp_build_select(bld, cond, a, b);
351 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
352 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
353 return lp_build_select(bld, cond, b, a);
354 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
355 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
356 return lp_build_select(bld, cond, a, b);
357 break;
358 default:
359 assert(0);
360 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
361 return lp_build_select(bld, cond, a, b);
362 }
363 } else {
364 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
365 return lp_build_select(bld, cond, a, b);
366 }
367 }
368
369
370 /**
371 * Generate 1 - a, or ~a depending on bld->type.
372 */
373 LLVMValueRef
374 lp_build_comp(struct lp_build_context *bld,
375 LLVMValueRef a)
376 {
377 LLVMBuilderRef builder = bld->gallivm->builder;
378 const struct lp_type type = bld->type;
379
380 assert(lp_check_value(type, a));
381
382 if (a == bld->one)
383 return bld->zero;
384 if (a == bld->zero)
385 return bld->one;
386
387 if (type.norm && !type.floating && !type.fixed && !type.sign) {
388 if (LLVMIsConstant(a))
389 return LLVMConstNot(a);
390 else
391 return LLVMBuildNot(builder, a, "");
392 }
393
394 if (type.floating)
395 return LLVMBuildFSub(builder, bld->one, a, "");
396 else
397 return LLVMBuildSub(builder, bld->one, a, "");
398 }
399
400
401 /**
402 * Generate a + b
403 */
404 LLVMValueRef
405 lp_build_add(struct lp_build_context *bld,
406 LLVMValueRef a,
407 LLVMValueRef b)
408 {
409 LLVMBuilderRef builder = bld->gallivm->builder;
410 const struct lp_type type = bld->type;
411 LLVMValueRef res;
412
413 assert(lp_check_value(type, a));
414 assert(lp_check_value(type, b));
415
416 if (a == bld->zero)
417 return b;
418 if (b == bld->zero)
419 return a;
420 if (a == bld->undef || b == bld->undef)
421 return bld->undef;
422
423 if (type.norm) {
424 const char *intrinsic = NULL;
425
426 if (!type.sign && (a == bld->one || b == bld->one))
427 return bld->one;
428
429 if (!type.floating && !type.fixed) {
430 if (LLVM_VERSION_MAJOR >= 8) {
431 char intrin[32];
432 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
433 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
434 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
435 }
436 if (type.width * type.length == 128) {
437 if (util_get_cpu_caps()->has_sse2) {
438 if (type.width == 8)
439 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
440 if (type.width == 16)
441 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
442 } else if (util_get_cpu_caps()->has_altivec) {
443 if (type.width == 8)
444 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
445 if (type.width == 16)
446 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
447 }
448 }
449 if (type.width * type.length == 256) {
450 if (util_get_cpu_caps()->has_avx2) {
451 if (type.width == 8)
452 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
453 if (type.width == 16)
454 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
455 }
456 }
457 }
458
459 if (intrinsic)
460 return lp_build_intrinsic_binary(builder, intrinsic,
461 lp_build_vec_type(bld->gallivm, bld->type), a, b);
462 }
463
464 if (type.norm && !type.floating && !type.fixed) {
465 if (type.sign) {
466 uint64_t sign = (uint64_t)1 << (type.width - 1);
467 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
468 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
469 /* a_clamp_max is the maximum a for positive b,
470 a_clamp_min is the minimum a for negative b. */
471 LLVMValueRef a_clamp_max =
472 lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""),
473 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
474 LLVMValueRef a_clamp_min =
475 lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""),
476 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
477 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b,
478 bld->zero), a_clamp_max, a_clamp_min);
479 }
480 }
481
482 if (type.floating)
483 res = LLVMBuildFAdd(builder, a, b, "");
484 else
485 res = LLVMBuildAdd(builder, a, b, "");
486
487 /* clamp to ceiling of 1.0 */
488 if (bld->type.norm && (bld->type.floating || bld->type.fixed))
489 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
490
491 if (type.norm && !type.floating && !type.fixed) {
492 if (!type.sign) {
493 /*
494 * newer llvm versions no longer support the intrinsics, but recognize
495 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
496 * code, it is important we match the pattern llvm uses (and pray llvm
497 * doesn't change it - and hope they decide on the same pattern for
498 * all backends supporting it...).
499 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
500 * interfere with llvm's ability to recognize the pattern but seems
501 * a bit brittle.
502 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
503 */
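/*
 * Scalar sketch of the pattern emitted below (hypothetical helper, for
 * illustration only):
 *
 *    uint8_t sat_add_u8(uint8_t a, uint8_t b)
 *    {
 *       uint8_t res = a + b;
 *       return (a > res) ? 0xff : res;   /* wrapped around -> clamp to all ones */
 *    }
 */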
504 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
505 res = lp_build_select(bld, overflowed,
506 LLVMConstAllOnes(bld->int_vec_type), res);
507 }
508 }
509
510 /* XXX clamp to floor of -1 or 0??? */
511
512 return res;
513 }
514
515
516 /** Return the scalar sum of the elements of a.
517 * Callers should avoid this operation whenever possible.
518 */
519 LLVMValueRef
520 lp_build_horizontal_add(struct lp_build_context *bld,
521 LLVMValueRef a)
522 {
523 LLVMBuilderRef builder = bld->gallivm->builder;
524 const struct lp_type type = bld->type;
525 LLVMValueRef index, res;
526 unsigned i, length;
527 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
528 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
529 LLVMValueRef vecres, elem2;
530
531 assert(lp_check_value(type, a));
532
533 if (type.length == 1) {
534 return a;
535 }
536
537 assert(!bld->type.norm);
538
539 /*
540 * For byte vectors we could do much better with psadbw.
541 * We use repeated shuffles/adds here. Note that with multiple vectors
542 * this can be done more efficiently as outlined in the intel
543 * optimization manual.
544 * Note: could cause data rearrangement if used with smaller element
545 * sizes.
546 */
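/*
 * Sketch of the reduction loop below for a length-4 vector {x0,x1,x2,x3}
 * (illustration only):
 *   iteration 1: {x0,x1} + {x2,x3} -> {x0+x2, x1+x3}
 *   final step:  (x0+x2) + (x1+x3) via two extracts and a scalar add
 */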
547
548 vecres = a;
549 length = type.length / 2;
550 while (length > 1) {
551 LLVMValueRef vec1, vec2;
552 for (i = 0; i < length; i++) {
553 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
554 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
555 }
556 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
557 LLVMConstVector(shuffles1, length), "");
558 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
559 LLVMConstVector(shuffles2, length), "");
560 if (type.floating) {
561 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
562 }
563 else {
564 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
565 }
566 length = length >> 1;
567 }
568
569 /* always have vector of size 2 here */
570 assert(length == 1);
571
572 index = lp_build_const_int32(bld->gallivm, 0);
573 res = LLVMBuildExtractElement(builder, vecres, index, "");
574 index = lp_build_const_int32(bld->gallivm, 1);
575 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
576
577 if (type.floating)
578 res = LLVMBuildFAdd(builder, res, elem2, "");
579 else
580 res = LLVMBuildAdd(builder, res, elem2, "");
581
582 return res;
583 }
584
585
586 /**
587 * Return the horizontal sums of 4 float vectors as a float4 vector.
588 * This uses the technique as outlined in Intel Optimization Manual.
589 */
590 static LLVMValueRef
591 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
592 LLVMValueRef src[4])
593 {
594 struct gallivm_state *gallivm = bld->gallivm;
595 LLVMBuilderRef builder = gallivm->builder;
596 LLVMValueRef shuffles[4];
597 LLVMValueRef tmp[4];
598 LLVMValueRef sumtmp[2], shuftmp[2];
599
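/*
 * Data-movement sketch for inputs a, b, c, d (each of length 4), for
 * illustration only:
 *   tmp[0] = {a0,a1,b0,b1}   tmp[1] = {a2,a3,b2,b3}
 *   tmp[2] = {c0,c1,d0,d1}   tmp[3] = {c2,c3,d2,d3}
 *   sumtmp[0] = tmp[0] + tmp[1]   sumtmp[1] = tmp[2] + tmp[3]
 *   result = even lanes of sumtmp + odd lanes of sumtmp
 *          = {sum(a), sum(b), sum(c), sum(d)}
 */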
600 /* lower half of regs */
601 shuffles[0] = lp_build_const_int32(gallivm, 0);
602 shuffles[1] = lp_build_const_int32(gallivm, 1);
603 shuffles[2] = lp_build_const_int32(gallivm, 4);
604 shuffles[3] = lp_build_const_int32(gallivm, 5);
605 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
606 LLVMConstVector(shuffles, 4), "");
607 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
608 LLVMConstVector(shuffles, 4), "");
609
610 /* upper half of regs */
611 shuffles[0] = lp_build_const_int32(gallivm, 2);
612 shuffles[1] = lp_build_const_int32(gallivm, 3);
613 shuffles[2] = lp_build_const_int32(gallivm, 6);
614 shuffles[3] = lp_build_const_int32(gallivm, 7);
615 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
616 LLVMConstVector(shuffles, 4), "");
617 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
618 LLVMConstVector(shuffles, 4), "");
619
620 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
621 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
622
623 shuffles[0] = lp_build_const_int32(gallivm, 0);
624 shuffles[1] = lp_build_const_int32(gallivm, 2);
625 shuffles[2] = lp_build_const_int32(gallivm, 4);
626 shuffles[3] = lp_build_const_int32(gallivm, 6);
627 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
628 LLVMConstVector(shuffles, 4), "");
629
630 shuffles[0] = lp_build_const_int32(gallivm, 1);
631 shuffles[1] = lp_build_const_int32(gallivm, 3);
632 shuffles[2] = lp_build_const_int32(gallivm, 5);
633 shuffles[3] = lp_build_const_int32(gallivm, 7);
634 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
635 LLVMConstVector(shuffles, 4), "");
636
637 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
638 }
639
640
641 /*
642 * partially horizontally add 2-4 float vectors with length nx4,
643 * i.e. only four adjacent values in each vector will be added,
644 * assuming values are really grouped in 4 which also determines
645 * output order.
646 *
647 * Return a vector of the same length as the initial vectors,
648 * with the excess elements (if any) being undefined.
649 * The element order is independent of number of input vectors.
650 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
651 * the output order thus will be
652 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
653 */
654 LLVMValueRef
655 lp_build_hadd_partial4(struct lp_build_context *bld,
656 LLVMValueRef vectors[],
657 unsigned num_vecs)
658 {
659 struct gallivm_state *gallivm = bld->gallivm;
660 LLVMBuilderRef builder = gallivm->builder;
661 LLVMValueRef ret_vec;
662 LLVMValueRef tmp[4];
663 const char *intrinsic = NULL;
664
665 assert(num_vecs >= 2 && num_vecs <= 4);
666 assert(bld->type.floating);
667
668 /* only use this with at least 2 vectors, as it is sort of expensive
669 * (depending on cpu) and we always need two horizontal adds anyway,
670 * so a shuffle/add approach might be better.
671 */
672
673 tmp[0] = vectors[0];
674 tmp[1] = vectors[1];
675
676 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
677 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
678
679 if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
680 bld->type.length == 4) {
681 intrinsic = "llvm.x86.sse3.hadd.ps";
682 }
683 else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
684 bld->type.length == 8) {
685 intrinsic = "llvm.x86.avx.hadd.ps.256";
686 }
687 if (intrinsic) {
688 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
689 lp_build_vec_type(gallivm, bld->type),
690 tmp[0], tmp[1]);
691 if (num_vecs > 2) {
692 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
693 lp_build_vec_type(gallivm, bld->type),
694 tmp[2], tmp[3]);
695 }
696 else {
697 tmp[1] = tmp[0];
698 }
699 return lp_build_intrinsic_binary(builder, intrinsic,
700 lp_build_vec_type(gallivm, bld->type),
701 tmp[0], tmp[1]);
702 }
703
704 if (bld->type.length == 4) {
705 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
706 }
707 else {
708 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
709 unsigned j;
710 unsigned num_iter = bld->type.length / 4;
711 struct lp_type parttype = bld->type;
712 parttype.length = 4;
713 for (j = 0; j < num_iter; j++) {
714 LLVMValueRef partsrc[4];
715 unsigned i;
716 for (i = 0; i < 4; i++) {
717 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
718 }
719 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
720 }
721 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
722 }
723 return ret_vec;
724 }
725
726
727 /**
728 * Generate a - b
729 */
730 LLVMValueRef
731 lp_build_sub(struct lp_build_context *bld,
732 LLVMValueRef a,
733 LLVMValueRef b)
734 {
735 LLVMBuilderRef builder = bld->gallivm->builder;
736 const struct lp_type type = bld->type;
737 LLVMValueRef res;
738
739 assert(lp_check_value(type, a));
740 assert(lp_check_value(type, b));
741
742 if (b == bld->zero)
743 return a;
744 if (a == bld->undef || b == bld->undef)
745 return bld->undef;
746 if (a == b)
747 return bld->zero;
748
749 if (type.norm) {
750 const char *intrinsic = NULL;
751
752 if (!type.sign && b == bld->one)
753 return bld->zero;
754
755 if (!type.floating && !type.fixed) {
756 if (LLVM_VERSION_MAJOR >= 8) {
757 char intrin[32];
758 intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
759 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
760 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
761 }
762 if (type.width * type.length == 128) {
763 if (util_get_cpu_caps()->has_sse2) {
764 if (type.width == 8)
765 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
766 if (type.width == 16)
767 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
768 } else if (util_get_cpu_caps()->has_altivec) {
769 if (type.width == 8)
770 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
771 if (type.width == 16)
772 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
773 }
774 }
775 if (type.width * type.length == 256) {
776 if (util_get_cpu_caps()->has_avx2) {
777 if (type.width == 8)
778 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
779 if (type.width == 16)
780 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
781 }
782 }
783 }
784
785 if (intrinsic)
786 return lp_build_intrinsic_binary(builder, intrinsic,
787 lp_build_vec_type(bld->gallivm, bld->type), a, b);
788 }
789
790 if (type.norm && !type.floating && !type.fixed) {
791 if (type.sign) {
792 uint64_t sign = (uint64_t)1 << (type.width - 1);
793 LLVMValueRef max_val =
794 lp_build_const_int_vec(bld->gallivm, type, sign - 1);
795 LLVMValueRef min_val =
796 lp_build_const_int_vec(bld->gallivm, type, sign);
797 /* a_clamp_max is the maximum a for negative b,
798 a_clamp_min is the minimum a for positive b. */
799 LLVMValueRef a_clamp_max =
800 lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""),
801 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
802 LLVMValueRef a_clamp_min =
803 lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""),
804 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
805 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b,
806 bld->zero),
807 a_clamp_min, a_clamp_max);
808 } else {
809 /*
810 * This must match the llvm pattern for saturated unsigned sub.
811 * (lp_build_max_simple actually does the job with its current
812 * definition, but we do it explicitly here.)
813 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
814 * interfere with llvm's ability to recognize the pattern but seems
815 * a bit brittle.
816 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
817 */
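/*
 * Scalar sketch of the pattern emitted below (hypothetical helper, for
 * illustration only):
 *
 *    uint8_t sat_sub_u8(uint8_t a, uint8_t b)
 *    {
 *       return (a > b) ? a - b : 0;   /* a is clamped up to b first */
 *    }
 */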
818 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
819 a = lp_build_select(bld, no_ov, a, b);
820 }
821 }
822
823 if (type.floating)
824 res = LLVMBuildFSub(builder, a, b, "");
825 else
826 res = LLVMBuildSub(builder, a, b, "");
827
828 if (bld->type.norm && (bld->type.floating || bld->type.fixed))
829 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
830
831 return res;
832 }
833
834
835 /**
836 * Normalized multiplication.
837 *
838 * There are several approaches for (using 8-bit normalized multiplication as
839 * an example):
840 *
841 * - alpha plus one
842 *
843 * makes the following approximation to the division (Sree)
844 *
845 * a*b/255 ~= (a*(b + 1)) >> 8
846 *
847 * which is the fastest method that satisfies the following OpenGL
848 * criteria of
849 *
850 * 0*0 = 0 and 255*255 = 255
851 *
852 * - geometric series
853 *
854 * takes the geometric series approximation to the division
855 *
856 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
857 *
858 * in this case just the first two terms to fit in 16bit arithmetic
859 *
860 * t/255 ~= (t + (t >> 8)) >> 8
861 *
862 * note that just by itself it doesn't satisfy the OpenGL criteria,
863 * as it yields 255*255 = 254, so the special case b = 255 must be
864 * accounted for, or roundoff must be used.
865 *
866 * - geometric series plus rounding
867 *
868 * when using a geometric series division instead of truncating the result
869 * use roundoff in the approximation (Jim Blinn)
870 *
871 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
872 *
873 * achieving exact results.
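*
* As a worked check of the three approximations above (taking a = b = 255,
* so t = a*b = 65025):
*
*   alpha plus one:        (255 * 256) >> 8              = 255
*   geometric series:      (65025 + (65025 >> 8)) >> 8   = 254
*   series plus rounding:  (65025 + 254 + 0x80) >> 8     = 255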
874 *
875 *
876 *
877 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
878 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
879 * @sa Michael Herf, The "double blend trick", May 2000,
880 * http://www.stereopsis.com/doubleblend.html
881 */
882 LLVMValueRef
883 lp_build_mul_norm(struct gallivm_state *gallivm,
884 struct lp_type wide_type,
885 LLVMValueRef a, LLVMValueRef b)
886 {
887 LLVMBuilderRef builder = gallivm->builder;
888 struct lp_build_context bld;
889 unsigned n;
890 LLVMValueRef half;
891 LLVMValueRef ab;
892
893 assert(!wide_type.floating);
894 assert(lp_check_value(wide_type, a));
895 assert(lp_check_value(wide_type, b));
896
897 lp_build_context_init(&bld, gallivm, wide_type);
898
899 n = wide_type.width / 2;
900 if (wide_type.sign) {
901 --n;
902 }
903
904 /*
905 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
906 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
907 */
908
909 /*
910 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
911 */
912
913 ab = LLVMBuildMul(builder, a, b, "");
914 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
915
916 /*
917 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
918 */
919
920 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
921 if (wide_type.sign) {
922 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
923 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
924 half = lp_build_select(&bld, sign, minus_half, half);
925 }
926 ab = LLVMBuildAdd(builder, ab, half, "");
927
928 /* Final division */
929 ab = lp_build_shr_imm(&bld, ab, n);
930
931 return ab;
932 }
933
934
935 /**
936 * Generate a * b
937 */
938 LLVMValueRef
939 lp_build_mul(struct lp_build_context *bld,
940 LLVMValueRef a,
941 LLVMValueRef b)
942 {
943 LLVMBuilderRef builder = bld->gallivm->builder;
944 const struct lp_type type = bld->type;
945
946 assert(lp_check_value(type, a));
947 assert(lp_check_value(type, b));
948
949 if (a == bld->zero)
950 return bld->zero;
951 if (a == bld->one)
952 return b;
953 if (b == bld->zero)
954 return bld->zero;
955 if (b == bld->one)
956 return a;
957 if (a == bld->undef || b == bld->undef)
958 return bld->undef;
959
960 if (!type.floating && !type.fixed && type.norm) {
961 struct lp_type wide_type = lp_wider_type(type);
962 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
963
964 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
965 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
966
967 /* PMULLW, PSRLW, PADDW */
968 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
969 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
970
971 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
972
973 return ab;
974 }
975
976 LLVMValueRef shift = type.fixed
977 ? lp_build_const_int_vec(bld->gallivm, type, type.width/2) : NULL;
978
979 LLVMValueRef res;
980 if (type.floating)
981 res = LLVMBuildFMul(builder, a, b, "");
982 else
983 res = LLVMBuildMul(builder, a, b, "");
984 if (shift) {
985 if (type.sign)
986 res = LLVMBuildAShr(builder, res, shift, "");
987 else
988 res = LLVMBuildLShr(builder, res, shift, "");
989 }
990
991 return res;
992 }
993
994
995 /*
996 * Widening mul, valid for 32x32 bit -> 64bit only.
997 * Result is low 32bits, high bits returned in res_hi.
998 *
999 * Emits code that is meant to be compiled for the host CPU.
1000 */
1001 LLVMValueRef
1002 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1003 LLVMValueRef a,
1004 LLVMValueRef b,
1005 LLVMValueRef *res_hi)
1006 {
1007 struct gallivm_state *gallivm = bld->gallivm;
1008 LLVMBuilderRef builder = gallivm->builder;
1009
1010 assert(bld->type.width == 32);
1011 assert(bld->type.floating == 0);
1012 assert(bld->type.fixed == 0);
1013 assert(bld->type.norm == 0);
1014
1015 /*
1016 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1017 * for x86 simd is atrocious (even if the high bits weren't required),
1018 * trying to handle real 64bit inputs (which of course can't happen due
1019 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1020 * apparently llvm does not recognize this widening mul). This includes 6
1021 * (instead of 2) pmuludq instructions plus extra adds and shifts.
1022 * The same story applies to signed mul, albeit fixing this requires sse41.
1023 * https://llvm.org/bugs/show_bug.cgi?id=30845
1024 * So, whip up our own code, albeit only for length 4 and 8 (which
1025 * should be good enough)...
1026 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1027 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1028 * for signed), which the fallback code does not, without this llvm
1029 * will likely still produce atrocious code.
1030 */
1031 if (LLVM_VERSION_MAJOR < 7 &&
1032 (bld->type.length == 4 || bld->type.length == 8) &&
1033 ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
1034 util_get_cpu_caps()->has_sse4_1)) {
1035 const char *intrinsic = NULL;
1036 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1037 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1038 struct lp_type type_wide = lp_wider_type(bld->type);
1039 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1040 unsigned i;
1041 for (i = 0; i < bld->type.length; i += 2) {
1042 shuf[i] = lp_build_const_int32(gallivm, i+1);
1043 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1044 }
1045 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1046 aeven = a;
1047 beven = b;
1048 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1049 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1050
1051 if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
1052 if (bld->type.sign) {
1053 intrinsic = "llvm.x86.avx2.pmul.dq";
1054 } else {
1055 intrinsic = "llvm.x86.avx2.pmulu.dq";
1056 }
1057 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1058 wider_type, aeven, beven);
1059 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1060 wider_type, aodd, bodd);
1061 }
1062 else {
1063 /* for consistent naming look elsewhere... */
1064 if (bld->type.sign) {
1065 intrinsic = "llvm.x86.sse41.pmuldq";
1066 } else {
1067 intrinsic = "llvm.x86.sse2.pmulu.dq";
1068 }
1069 /*
1070 * XXX If we only have AVX but not AVX2 this is a pain.
1071 * lp_build_intrinsic_binary_anylength() can't handle it
1072 * (due to src and dst type not being identical).
1073 */
1074 if (bld->type.length == 8) {
1075 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1076 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1077 LLVMValueRef muleven2[2], mulodd2[2];
1078 struct lp_type type_wide_half = type_wide;
1079 LLVMTypeRef wtype_half;
1080 type_wide_half.length = 2;
1081 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1082 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1083 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1084 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1085 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1086 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1087 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1088 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1089 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1090 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1091 wtype_half, aevenlo, bevenlo);
1092 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1093 wtype_half, aoddlo, boddlo);
1094 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1095 wtype_half, aevenhi, bevenhi);
1096 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1097 wtype_half, aoddhi, boddhi);
1098 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1099 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1100
1101 }
1102 else {
1103 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1104 wider_type, aeven, beven);
1105 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1106 wider_type, aodd, bodd);
1107 }
1108 }
1109 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1110 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1111
1112 for (i = 0; i < bld->type.length; i += 2) {
1113 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1114 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1115 }
1116 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1117 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1118
1119 for (i = 0; i < bld->type.length; i += 2) {
1120 shuf[i] = lp_build_const_int32(gallivm, i);
1121 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1122 }
1123 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1124 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1125 }
1126 else {
1127 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1128 }
1129 }
1130
1131
1132 /*
1133 * Widening mul, valid for <= 32 (8, 16, 32) -> 64
1134 * Result is low N bits, high bits returned in res_hi.
1135 *
1136 * Emits generic code.
1137 */
1138 LLVMValueRef
1139 lp_build_mul_32_lohi(struct lp_build_context *bld,
1140 LLVMValueRef a,
1141 LLVMValueRef b,
1142 LLVMValueRef *res_hi)
1143 {
1144 struct gallivm_state *gallivm = bld->gallivm;
1145 LLVMBuilderRef builder = gallivm->builder;
1146 LLVMValueRef tmp, shift, res_lo;
1147 struct lp_type type_tmp;
1148 LLVMTypeRef wide_type, narrow_type;
1149
1150 type_tmp = bld->type;
1151 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1152 if (bld->type.width < 32)
1153 type_tmp.width = 32;
1154 else
1155 type_tmp.width *= 2;
1156 wide_type = lp_build_vec_type(gallivm, type_tmp);
1157 shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);
1158
1159 if (bld->type.sign) {
1160 a = LLVMBuildSExt(builder, a, wide_type, "");
1161 b = LLVMBuildSExt(builder, b, wide_type, "");
1162 } else {
1163 a = LLVMBuildZExt(builder, a, wide_type, "");
1164 b = LLVMBuildZExt(builder, b, wide_type, "");
1165 }
1166 tmp = LLVMBuildMul(builder, a, b, "");
1167
1168 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1169
1170 /* Since we truncate anyway, LShr and AShr are equivalent. */
1171 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1172 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1173
1174 return res_lo;
1175 }
1176
1177
1178 /* a * b + c */
1179 LLVMValueRef
1180 lp_build_mad(struct lp_build_context *bld,
1181 LLVMValueRef a,
1182 LLVMValueRef b,
1183 LLVMValueRef c)
1184 {
1185 const struct lp_type type = bld->type;
1186 if (type.floating) {
1187 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1188 } else {
1189 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1190 }
1191 }
1192
1193
1194 /**
1195 * Small vector x scale multiplication optimization.
1196 */
1197 LLVMValueRef
1198 lp_build_mul_imm(struct lp_build_context *bld,
1199 LLVMValueRef a,
1200 int b)
1201 {
1202 LLVMBuilderRef builder = bld->gallivm->builder;
1203 LLVMValueRef factor;
1204
1205 assert(lp_check_value(bld->type, a));
1206
1207 if (b == 0)
1208 return bld->zero;
1209
1210 if (b == 1)
1211 return a;
1212
1213 if (b == -1)
1214 return lp_build_negate(bld, a);
1215
1216 if (b == 2 && bld->type.floating)
1217 return lp_build_add(bld, a, a);
1218
1219 if (util_is_power_of_two_or_zero(b)) {
1220 unsigned shift = ffs(b) - 1;
1221
1222 if (bld->type.floating) {
1223 #if 0
1224 /*
1225 * Power of two multiplication by directly manipulating the exponent.
1226 *
1227 * XXX: This might not be always faster, it will introduce a small
1228 * error for multiplication by zero, and it will produce wrong results
1229 * for Inf and NaN.
1230 */
1231 unsigned mantissa = lp_mantissa(bld->type);
1232 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1233 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1234 a = LLVMBuildAdd(builder, a, factor, "");
1235 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1236 return a;
1237 #endif
1238 }
1239 else {
1240 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1241 return LLVMBuildShl(builder, a, factor, "");
1242 }
1243 }
1244
1245 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1246 return lp_build_mul(bld, a, factor);
1247 }
1248
1249
1250 /**
1251 * Generate a / b
1252 */
1253 LLVMValueRef
1254 lp_build_div(struct lp_build_context *bld,
1255 LLVMValueRef a,
1256 LLVMValueRef b)
1257 {
1258 LLVMBuilderRef builder = bld->gallivm->builder;
1259 const struct lp_type type = bld->type;
1260
1261 assert(lp_check_value(type, a));
1262 assert(lp_check_value(type, b));
1263
1264 if (a == bld->zero)
1265 return bld->zero;
1266 if (a == bld->one && type.floating)
1267 return lp_build_rcp(bld, b);
1268 if (b == bld->zero)
1269 return bld->undef;
1270 if (b == bld->one)
1271 return a;
1272 if (a == bld->undef || b == bld->undef)
1273 return bld->undef;
1274
1275 /* fast rcp is disabled (just uses div), so it makes no sense to try that */
1276 if (FALSE &&
1277 ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
1278 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
1279 type.floating)
1280 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1281
1282 if (type.floating)
1283 return LLVMBuildFDiv(builder, a, b, "");
1284 else if (type.sign)
1285 return LLVMBuildSDiv(builder, a, b, "");
1286 else
1287 return LLVMBuildUDiv(builder, a, b, "");
1288 }
1289
1290
1291 /**
1292 * Linear interpolation helper.
1293 *
1294 * @param flags LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1295 * interpolating normalized values, encoded in integers twice as wide.
1296 *
1297 * @sa http://www.stereopsis.com/doubleblend.html
1298 */
1299 static inline LLVMValueRef
1300 lp_build_lerp_simple(struct lp_build_context *bld,
1301 LLVMValueRef x,
1302 LLVMValueRef v0,
1303 LLVMValueRef v1,
1304 unsigned flags)
1305 {
1306 unsigned half_width = bld->type.width/2;
1307 LLVMBuilderRef builder = bld->gallivm->builder;
1308 LLVMValueRef delta;
1309 LLVMValueRef res;
1310
1311 assert(lp_check_value(bld->type, x));
1312 assert(lp_check_value(bld->type, v0));
1313 assert(lp_check_value(bld->type, v1));
1314
1315 delta = lp_build_sub(bld, v1, v0);
1316
1317 if (bld->type.floating) {
1318 assert(flags == 0);
1319 return lp_build_mad(bld, x, delta, v0);
1320 }
1321
1322 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1323 if (!bld->type.sign) {
1324 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1325 /*
1326 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1327 * most-significant-bit to the lowest-significant-bit, so that
1328 * later we can just divide by 2**n instead of 2**n - 1.
1329 */
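/*
 * E.g. with a half width of 8: x = 255 becomes 255 + (255 >> 7) = 256,
 * so the later (x * delta) >> 8 reproduces delta exactly at the endpoint,
 * while x = 0 stays 0.
 */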
1330
1331 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1332 }
1333
1334 /* (x * delta) >> n */
1335 /*
1336 * For this multiply, higher internal precision is required to pass
1337 * CTS, the most efficient path to that is pmulhrsw on ssse3 and
1338 * above. This could be opencoded on other arches if conformance was
1339 * required.
1340 */
1341 if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) {
1342 res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1343 res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1344 } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) {
1345 res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1346 res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1347 } else {
1348 res = lp_build_mul(bld, x, delta);
1349 res = lp_build_shr_imm(bld, res, half_width);
1350 }
1351 } else {
1352 /*
1353 * The rescaling trick above doesn't work for signed numbers, so
1354 * use the 2**n - 1 division approximation in lp_build_mul_norm
1355 * instead.
1356 */
1357 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1358 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1359 }
1360 } else {
1361 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1362 res = lp_build_mul(bld, x, delta);
1363 }
1364
1365 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1366 /*
1367 * At this point both res and v0 only use the lower half of the bits,
1368 * the rest is zero. Instead of add / mask, do add with half wide type.
1369 */
1370 struct lp_type narrow_type;
1371 struct lp_build_context narrow_bld;
1372
1373 memset(&narrow_type, 0, sizeof narrow_type);
1374 narrow_type.sign = bld->type.sign;
1375 narrow_type.width = bld->type.width/2;
1376 narrow_type.length = bld->type.length*2;
1377
1378 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1379 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1380 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1381 res = lp_build_add(&narrow_bld, v0, res);
1382 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1383 } else {
1384 res = lp_build_add(bld, v0, res);
1385
1386 if (bld->type.fixed) {
1387 /*
1388 * We need to mask out the high order bits when lerping 8bit
1389 * normalized colors stored on 16bits
1390 */
1391 /* XXX: This step is necessary for lerping 8bit colors stored on
1392 * 16bits, but it will be wrong for true fixed point use cases.
1393 * Basically we need a more powerful lp_type, capable of further
1394 * distinguishing the values interpretation from the value storage.
1395 */
1396 LLVMValueRef low_bits;
1397 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1398 res = LLVMBuildAnd(builder, res, low_bits, "");
1399 }
1400 }
1401
1402 return res;
1403 }
1404
1405
1406 /**
1407 * Linear interpolation.
1408 */
1409 LLVMValueRef
1410 lp_build_lerp(struct lp_build_context *bld,
1411 LLVMValueRef x,
1412 LLVMValueRef v0,
1413 LLVMValueRef v1,
1414 unsigned flags)
1415 {
1416 const struct lp_type type = bld->type;
1417 LLVMValueRef res;
1418
1419 assert(lp_check_value(type, x));
1420 assert(lp_check_value(type, v0));
1421 assert(lp_check_value(type, v1));
1422
1423 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1424
1425 if (type.norm) {
1426 struct lp_type wide_type;
1427 struct lp_build_context wide_bld;
1428 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1429
1430 assert(type.length >= 2);
1431
1432 /*
1433 * Create a wider integer type, enough to hold the
1434 * intermediate result of the multiplication.
1435 */
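/*
 * E.g. an 8-bit x 16 normalized type becomes a 16-bit x 8 wide type
 * (two halves via unpack), so the 8-bit by 8-bit products cannot overflow.
 */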
1436 memset(&wide_type, 0, sizeof wide_type);
1437 wide_type.sign = type.sign;
1438 wide_type.width = type.width*2;
1439 wide_type.length = type.length/2;
1440
1441 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1442
1443 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1444 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1445 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1446
1447 /*
1448 * Lerp both halves.
1449 */
1450
1451 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1452
1453 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1454 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1455
1456 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1457 } else {
1458 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1459 }
1460
1461 return res;
1462 }
1463
1464
1465 /**
1466 * Bilinear interpolation.
1467 *
1468 * Value indices are in v_{yx}.
1469 */
1470 LLVMValueRef
1471 lp_build_lerp_2d(struct lp_build_context *bld,
1472 LLVMValueRef x,
1473 LLVMValueRef y,
1474 LLVMValueRef v00,
1475 LLVMValueRef v01,
1476 LLVMValueRef v10,
1477 LLVMValueRef v11,
1478 unsigned flags)
1479 {
1480 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1481 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1482 return lp_build_lerp(bld, y, v0, v1, flags);
1483 }
1484
1485
1486 LLVMValueRef
1487 lp_build_lerp_3d(struct lp_build_context *bld,
1488 LLVMValueRef x,
1489 LLVMValueRef y,
1490 LLVMValueRef z,
1491 LLVMValueRef v000,
1492 LLVMValueRef v001,
1493 LLVMValueRef v010,
1494 LLVMValueRef v011,
1495 LLVMValueRef v100,
1496 LLVMValueRef v101,
1497 LLVMValueRef v110,
1498 LLVMValueRef v111,
1499 unsigned flags)
1500 {
1501 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1502 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1503 return lp_build_lerp(bld, z, v0, v1, flags);
1504 }
1505
1506
1507 /**
1508 * Generate min(a, b)
1509 * Do checks for special cases but not for nans.
1510 */
1511 LLVMValueRef
1512 lp_build_min(struct lp_build_context *bld,
1513 LLVMValueRef a,
1514 LLVMValueRef b)
1515 {
1516 assert(lp_check_value(bld->type, a));
1517 assert(lp_check_value(bld->type, b));
1518
1519 if (a == bld->undef || b == bld->undef)
1520 return bld->undef;
1521
1522 if (a == b)
1523 return a;
1524
1525 if (bld->type.norm) {
1526 if (!bld->type.sign) {
1527 if (a == bld->zero || b == bld->zero) {
1528 return bld->zero;
1529 }
1530 }
1531 if (a == bld->one)
1532 return b;
1533 if (b == bld->one)
1534 return a;
1535 }
1536
1537 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1538 }
1539
1540
1541 /**
1542 * Generate min(a, b)
1543 * NaNs are handled according to the behavior specified by the
1544 * nan_behavior argument.
1545 */
1546 LLVMValueRef
1547 lp_build_min_ext(struct lp_build_context *bld,
1548 LLVMValueRef a,
1549 LLVMValueRef b,
1550 enum gallivm_nan_behavior nan_behavior)
1551 {
1552 assert(lp_check_value(bld->type, a));
1553 assert(lp_check_value(bld->type, b));
1554
1555 if (a == bld->undef || b == bld->undef)
1556 return bld->undef;
1557
1558 if (a == b)
1559 return a;
1560
1561 if (bld->type.norm) {
1562 if (!bld->type.sign) {
1563 if (a == bld->zero || b == bld->zero) {
1564 return bld->zero;
1565 }
1566 }
1567 if (a == bld->one)
1568 return b;
1569 if (b == bld->one)
1570 return a;
1571 }
1572
1573 return lp_build_min_simple(bld, a, b, nan_behavior);
1574 }
1575
1576
1577 /**
1578 * Generate max(a, b)
1579 * Do checks for special cases, but NaN behavior is undefined.
1580 */
1581 LLVMValueRef
1582 lp_build_max(struct lp_build_context *bld,
1583 LLVMValueRef a,
1584 LLVMValueRef b)
1585 {
1586 assert(lp_check_value(bld->type, a));
1587 assert(lp_check_value(bld->type, b));
1588
1589 if (a == bld->undef || b == bld->undef)
1590 return bld->undef;
1591
1592 if (a == b)
1593 return a;
1594
1595 if (bld->type.norm) {
1596 if (a == bld->one || b == bld->one)
1597 return bld->one;
1598 if (!bld->type.sign) {
1599 if (a == bld->zero) {
1600 return b;
1601 }
1602 if (b == bld->zero) {
1603 return a;
1604 }
1605 }
1606 }
1607
1608 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1609 }
1610
1611
1612 /**
1613 * Generate max(a, b)
1614 * Checks for special cases.
1615 * NaNs are handled according to the behavior specified by the
1616 * nan_behavior argument.
1617 */
1618 LLVMValueRef
1619 lp_build_max_ext(struct lp_build_context *bld,
1620 LLVMValueRef a,
1621 LLVMValueRef b,
1622 enum gallivm_nan_behavior nan_behavior)
1623 {
1624 assert(lp_check_value(bld->type, a));
1625 assert(lp_check_value(bld->type, b));
1626
1627 if (a == bld->undef || b == bld->undef)
1628 return bld->undef;
1629
1630 if (a == b)
1631 return a;
1632
1633 if (bld->type.norm) {
1634 if (a == bld->one || b == bld->one)
1635 return bld->one;
1636 if (!bld->type.sign) {
1637 if (a == bld->zero) {
1638 return b;
1639 }
1640 if (b == bld->zero) {
1641 return a;
1642 }
1643 }
1644 }
1645
1646 return lp_build_max_simple(bld, a, b, nan_behavior);
1647 }
1648
1649
1650 /**
1651 * Generate clamp(a, min, max)
1652 * NaN behavior (for any of a, min, max) is undefined.
1653 * Do checks for special cases.
1654 */
1655 LLVMValueRef
1656 lp_build_clamp(struct lp_build_context *bld,
1657 LLVMValueRef a,
1658 LLVMValueRef min,
1659 LLVMValueRef max)
1660 {
1661 assert(lp_check_value(bld->type, a));
1662 assert(lp_check_value(bld->type, min));
1663 assert(lp_check_value(bld->type, max));
1664
1665 a = lp_build_min(bld, a, max);
1666 a = lp_build_max(bld, a, min);
1667 return a;
1668 }
1669
1670
1671 /**
1672 * Generate clamp(a, 0, 1)
1673 * A NaN will get converted to zero.
1674 */
1675 LLVMValueRef
1676 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1677 LLVMValueRef a)
1678 {
1679 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1680 a = lp_build_min(bld, a, bld->one);
1681 return a;
1682 }
1683
1684
1685 /**
1686 * Generate abs(a)
1687 */
1688 LLVMValueRef
1689 lp_build_abs(struct lp_build_context *bld,
1690 LLVMValueRef a)
1691 {
1692 LLVMBuilderRef builder = bld->gallivm->builder;
1693 const struct lp_type type = bld->type;
1694 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1695
1696 assert(lp_check_value(type, a));
1697
1698 if (!type.sign)
1699 return a;
1700
1701 if (type.floating) {
1702 char intrinsic[32];
1703 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1704 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1705 }
1706
1707 if (type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1708 switch(type.width) {
1709 case 8:
1710 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1711 case 16:
1712 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1713 case 32:
1714 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1715 }
1716 }
1717 else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
1718 switch(type.width) {
1719 case 8:
1720 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1721 case 16:
1722 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1723 case 32:
1724 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1725 }
1726 }
1727
1728 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1729 a, LLVMBuildNeg(builder, a, ""));
1730 }
1731
1732
1733 LLVMValueRef
1734 lp_build_negate(struct lp_build_context *bld,
1735 LLVMValueRef a)
1736 {
1737 LLVMBuilderRef builder = bld->gallivm->builder;
1738
1739 assert(lp_check_value(bld->type, a));
1740
1741 if (bld->type.floating)
1742 a = LLVMBuildFNeg(builder, a, "");
1743 else
1744 a = LLVMBuildNeg(builder, a, "");
1745
1746 return a;
1747 }
1748
1749
1750 /** Return -1, 0 or +1 depending on the sign of a */
1751 LLVMValueRef
1752 lp_build_sgn(struct lp_build_context *bld,
1753 LLVMValueRef a)
1754 {
1755 LLVMBuilderRef builder = bld->gallivm->builder;
1756 const struct lp_type type = bld->type;
1757 LLVMValueRef cond;
1758 LLVMValueRef res;
1759
1760 assert(lp_check_value(type, a));
1761
1762 /* Handle non-zero case */
1763 if (!type.sign) {
1764 /* if not zero then sign must be positive */
1765 res = bld->one;
1766 }
1767 else if (type.floating) {
1768 LLVMTypeRef vec_type;
1769 LLVMTypeRef int_type;
1770 LLVMValueRef mask;
1771 LLVMValueRef sign;
1772 LLVMValueRef one;
1773 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1774
1775 int_type = lp_build_int_vec_type(bld->gallivm, type);
1776 vec_type = lp_build_vec_type(bld->gallivm, type);
1777 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1778
1779 /* Take the sign bit and add it to 1 constant */
1780 sign = LLVMBuildBitCast(builder, a, int_type, "");
1781 sign = LLVMBuildAnd(builder, sign, mask, "");
1782 one = LLVMConstBitCast(bld->one, int_type);
1783 res = LLVMBuildOr(builder, sign, one, "");
1784 res = LLVMBuildBitCast(builder, res, vec_type, "");
1785 }
1786 else
1787 {
1788 /* signed int/norm/fixed point */
1789 /* could use psign with sse3 and appropriate vectors here */
1790 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1791 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1792 res = lp_build_select(bld, cond, bld->one, minus_one);
1793 }
1794
1795 /* Handle zero */
1796 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1797 res = lp_build_select(bld, cond, bld->zero, res);
1798
1799 return res;
1800 }
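/*
 * Scalar reference for the floating-point path above (a sketch, assuming
 * 32-bit IEEE-754 floats; sgn_ref is a hypothetical helper, not part of
 * gallivm):
 *
 *    static inline float sgn_ref(float a)
 *    {
 *       union { float f; uint32_t u; } one = { 1.0f }, v = { a };
 *       one.u |= v.u & 0x80000000u;        // copy a's sign bit onto 1.0
 *       return a == 0.0f ? 0.0f : one.f;   // zero handled by the final select
 *    }
 */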
1801
1802
1803 /**
1804 * Set the sign of float vector 'a' according to 'sign'.
1805 * If sign==0, return abs(a).
1806 * If sign==1, return -abs(a);
1807 * Other values for sign produce undefined results.
1808 */
1809 LLVMValueRef
1810 lp_build_set_sign(struct lp_build_context *bld,
1811 LLVMValueRef a, LLVMValueRef sign)
1812 {
1813 LLVMBuilderRef builder = bld->gallivm->builder;
1814 const struct lp_type type = bld->type;
1815 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1816 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1817 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1818 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1819 ~((unsigned long long) 1 << (type.width - 1)));
1820 LLVMValueRef val, res;
1821
1822 assert(type.floating);
1823 assert(lp_check_value(type, a));
1824
1825 /* val = reinterpret_cast<int>(a) */
1826 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1827 /* val = val & mask */
1828 val = LLVMBuildAnd(builder, val, mask, "");
1829 /* sign = sign << shift */
1830 sign = LLVMBuildShl(builder, sign, shift, "");
1831 /* res = val | sign */
1832 res = LLVMBuildOr(builder, val, sign, "");
1833 /* res = reinterpret_cast<float>(res) */
1834 res = LLVMBuildBitCast(builder, res, vec_type, "");
1835
1836 return res;
1837 }
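/*
 * Scalar reference for the bit manipulation above (a sketch for 32-bit
 * floats; set_sign_ref is a hypothetical helper, and 'sign' must be 0 or 1
 * as documented):
 *
 *    static inline float set_sign_ref(float a, uint32_t sign)
 *    {
 *       union { float f; uint32_t u; } v = { a };
 *       v.u = (v.u & 0x7fffffffu) | (sign << 31);   // clear, then set sign bit
 *       return v.f;
 *    }
 */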
1838
1839
1840 /**
1841 * Convert vector of (or scalar) int to vector of (or scalar) float.
1842 */
1843 LLVMValueRef
1844 lp_build_int_to_float(struct lp_build_context *bld,
1845 LLVMValueRef a)
1846 {
1847 LLVMBuilderRef builder = bld->gallivm->builder;
1848 const struct lp_type type = bld->type;
1849 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1850
1851 assert(type.floating);
1852
1853 return LLVMBuildSIToFP(builder, a, vec_type, "");
1854 }
1855
1856
1857 static boolean
1858 arch_rounding_available(const struct lp_type type)
1859 {
1860 if ((util_get_cpu_caps()->has_sse4_1 &&
1861 (type.length == 1 || type.width*type.length == 128)) ||
1862 (util_get_cpu_caps()->has_avx && type.width*type.length == 256) ||
1863 (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512))
1864 return TRUE;
1865 else if ((util_get_cpu_caps()->has_altivec &&
1866 (type.width == 32 && type.length == 4)))
1867 return TRUE;
1868 else if (util_get_cpu_caps()->has_neon)
1869 return TRUE;
1870 else if (util_get_cpu_caps()->family == CPU_S390X)
1871 return TRUE;
1872
1873 return FALSE;
1874 }
1875
1876 enum lp_build_round_mode
1877 {
1878 LP_BUILD_ROUND_NEAREST = 0,
1879 LP_BUILD_ROUND_FLOOR = 1,
1880 LP_BUILD_ROUND_CEIL = 2,
1881 LP_BUILD_ROUND_TRUNCATE = 3
1882 };
1883
1884
1885 static inline LLVMValueRef
1886 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1887 LLVMValueRef a)
1888 {
1889 LLVMBuilderRef builder = bld->gallivm->builder;
1890 const struct lp_type type = bld->type;
1891 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1892 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1893 const char *intrinsic;
1894 LLVMValueRef res;
1895
1896 assert(type.floating);
1897 /* using the double precision conversions is a bit more complicated */
1898 assert(type.width == 32);
1899
1900 assert(lp_check_value(type, a));
1901 assert(util_get_cpu_caps()->has_sse2);
1902
1903 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1904 if (type.length == 1) {
1905 LLVMTypeRef vec_type;
1906 LLVMValueRef undef;
1907 LLVMValueRef arg;
1908 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1909
1910 vec_type = LLVMVectorType(bld->elem_type, 4);
1911
1912 intrinsic = "llvm.x86.sse.cvtss2si";
1913
1914 undef = LLVMGetUndef(vec_type);
1915
1916 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1917
1918 res = lp_build_intrinsic_unary(builder, intrinsic,
1919 ret_type, arg);
1920 }
1921 else {
1922 if (type.width* type.length == 128) {
1923 intrinsic = "llvm.x86.sse2.cvtps2dq";
1924 }
1925 else {
1926 assert(type.width*type.length == 256);
1927 assert(util_get_cpu_caps()->has_avx);
1928
1929 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1930 }
1931 res = lp_build_intrinsic_unary(builder, intrinsic,
1932 ret_type, a);
1933 }
1934
1935 return res;
1936 }
1937
1938
1939 /* Round the elements of a float vector using the AltiVec vrfi*
1940  * intrinsics, selected according to 'mode'. */
1941 static inline LLVMValueRef
1942 lp_build_round_altivec(struct lp_build_context *bld,
1943 LLVMValueRef a,
1944 enum lp_build_round_mode mode)
1945 {
1946 LLVMBuilderRef builder = bld->gallivm->builder;
1947 const struct lp_type type = bld->type;
1948 const char *intrinsic = NULL;
1949
1950 assert(type.floating);
1951
1952 assert(lp_check_value(type, a));
1953 assert(util_get_cpu_caps()->has_altivec);
1954
1955 (void)type;
1956
1957 switch (mode) {
1958 case LP_BUILD_ROUND_NEAREST:
1959 intrinsic = "llvm.ppc.altivec.vrfin";
1960 break;
1961 case LP_BUILD_ROUND_FLOOR:
1962 intrinsic = "llvm.ppc.altivec.vrfim";
1963 break;
1964 case LP_BUILD_ROUND_CEIL:
1965 intrinsic = "llvm.ppc.altivec.vrfip";
1966 break;
1967 case LP_BUILD_ROUND_TRUNCATE:
1968 intrinsic = "llvm.ppc.altivec.vrfiz";
1969 break;
1970 }
1971
1972 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1973 }
1974
1975
1976 static inline LLVMValueRef
1977 lp_build_round_arch(struct lp_build_context *bld,
1978 LLVMValueRef a,
1979 enum lp_build_round_mode mode)
1980 {
1981 if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon ||
1982 util_get_cpu_caps()->family == CPU_S390X) {
1983 LLVMBuilderRef builder = bld->gallivm->builder;
1984 const struct lp_type type = bld->type;
1985 const char *intrinsic_root;
1986 char intrinsic[32];
1987
1988 assert(type.floating);
1989 assert(lp_check_value(type, a));
1990 (void)type;
1991
1992 switch (mode) {
1993 case LP_BUILD_ROUND_NEAREST:
1994 intrinsic_root = "llvm.nearbyint";
1995 break;
1996 case LP_BUILD_ROUND_FLOOR:
1997 intrinsic_root = "llvm.floor";
1998 break;
1999 case LP_BUILD_ROUND_CEIL:
2000 intrinsic_root = "llvm.ceil";
2001 break;
2002 case LP_BUILD_ROUND_TRUNCATE:
2003 intrinsic_root = "llvm.trunc";
2004 break;
2005 default:
2006 unreachable("unhandled lp_build_round_mode");
2007 }
2008
2009 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2010 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2011 }
2012 else /* (util_get_cpu_caps()->has_altivec) */
2013 return lp_build_round_altivec(bld, a, mode);
2014 }
2015
2016
2017 /**
2018 * Return the integer part of a float (vector) value (== round toward zero).
2019 * The returned value is a float (vector).
2020 * Ex: trunc(-1.5) = -1.0
2021 */
2022 LLVMValueRef
2023 lp_build_trunc(struct lp_build_context *bld,
2024 LLVMValueRef a)
2025 {
2026 LLVMBuilderRef builder = bld->gallivm->builder;
2027 const struct lp_type type = bld->type;
2028
2029 assert(type.floating);
2030 assert(lp_check_value(type, a));
2031
2032 if (type.width == 16) {
2033 char intrinsic[64];
2034 lp_format_intrinsic(intrinsic, 64, "llvm.trunc", bld->vec_type);
2035 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2036 }
2037
2038 if (arch_rounding_available(type)) {
2039 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2040 }
2041 else {
2042 const struct lp_type type = bld->type;
2043 struct lp_type inttype;
2044 struct lp_build_context intbld;
2045 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2046 LLVMValueRef trunc, res, anosign, mask;
2047 LLVMTypeRef int_vec_type = bld->int_vec_type;
2048 LLVMTypeRef vec_type = bld->vec_type;
2049
2050 inttype = type;
2051 inttype.floating = 0;
2052 lp_build_context_init(&intbld, bld->gallivm, inttype);
2053
2054 /* round by truncation */
2055 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2056 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2057
2058 /* mask out sign bit */
2059 anosign = lp_build_abs(bld, a);
2060 /*
2061 * mask out all values if anosign > 2^24
2062 * This should work both for large ints (all rounding is no-op for them
2063 * because such floats are always exact) as well as special cases like
2064 * NaNs, Infs (taking advantage of the fact they use max exponent).
2065 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2066 */
2067 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2068 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2069 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2070 return lp_build_select(bld, mask, a, res);
2071 }
2072 }
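/*
 * Scalar sketch of the emulated path above (assuming 32-bit floats; the
 * vector code performs the magnitude test on integer bit patterns, which
 * also routes NaN/Inf through the "pass a through" branch):
 *
 *    static inline float trunc_ref(float a)
 *    {
 *       // at or above 2^24 a float has no fractional bits, so pass it
 *       // through; otherwise the round trip through int32 truncates
 *       // toward zero.
 *       return fabsf(a) > 0x1p24f ? a : (float)(int32_t)a;
 *    }
 */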
2073
2074
2075 /**
2076 * Return float (vector) rounded to nearest integer (vector). The returned
2077 * value is a float (vector).
2078 * Ex: round(0.9) = 1.0
2079 * Ex: round(-1.5) = -2.0
2080 */
2081 LLVMValueRef
2082 lp_build_round(struct lp_build_context *bld,
2083 LLVMValueRef a)
2084 {
2085 LLVMBuilderRef builder = bld->gallivm->builder;
2086 const struct lp_type type = bld->type;
2087
2088 assert(type.floating);
2089 assert(lp_check_value(type, a));
2090
2091 if (type.width == 16) {
2092 char intrinsic[64];
2093 lp_format_intrinsic(intrinsic, 64, "llvm.round", bld->vec_type);
2094 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2095 }
2096
2097 if (arch_rounding_available(type)) {
2098 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2099 }
2100 else {
2101 const struct lp_type type = bld->type;
2102 struct lp_type inttype;
2103 struct lp_build_context intbld;
2104 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2105 LLVMValueRef res, anosign, mask;
2106 LLVMTypeRef int_vec_type = bld->int_vec_type;
2107 LLVMTypeRef vec_type = bld->vec_type;
2108
2109 inttype = type;
2110 inttype.floating = 0;
2111 lp_build_context_init(&intbld, bld->gallivm, inttype);
2112
2113 res = lp_build_iround(bld, a);
2114 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2115
2116 /* mask out sign bit */
2117 anosign = lp_build_abs(bld, a);
2118 /*
2119 * mask out all values if anosign > 2^24
2120 * This should work both for large ints (all rounding is no-op for them
2121 * because such floats are always exact) as well as special cases like
2122 * NaNs, Infs (taking advantage of the fact they use max exponent).
2123 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2124 */
2125 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2126 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2127 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2128 return lp_build_select(bld, mask, a, res);
2129 }
2130 }
2131
2132
2133 /**
2134 * Return floor of float (vector), result is a float (vector)
2135 * Ex: floor(1.1) = 1.0
2136 * Ex: floor(-1.1) = -2.0
2137 */
2138 LLVMValueRef
2139 lp_build_floor(struct lp_build_context *bld,
2140 LLVMValueRef a)
2141 {
2142 LLVMBuilderRef builder = bld->gallivm->builder;
2143 const struct lp_type type = bld->type;
2144
2145 assert(type.floating);
2146 assert(lp_check_value(type, a));
2147
2148 if (arch_rounding_available(type)) {
2149 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2150 }
2151 else {
2152 const struct lp_type type = bld->type;
2153 struct lp_type inttype;
2154 struct lp_build_context intbld;
2155 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2156 LLVMValueRef trunc, res, anosign, mask;
2157 LLVMTypeRef int_vec_type = bld->int_vec_type;
2158 LLVMTypeRef vec_type = bld->vec_type;
2159
2160 if (type.width != 32) {
2161 char intrinsic[32];
2162 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2163 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2164 }
2165
2166 assert(type.width == 32); /* might want to handle doubles at some point */
2167
2168 inttype = type;
2169 inttype.floating = 0;
2170 lp_build_context_init(&intbld, bld->gallivm, inttype);
2171
2172 /* round by truncation */
2173 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2174 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2175
2176 if (type.sign) {
2177 LLVMValueRef tmp;
2178
2179 /*
2180 * fix values if rounding is wrong (for non-special cases)
2181 * - this is the case if trunc > a
2182 */
2183 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2184 /* tmp = trunc > a ? 1.0 : 0.0 */
2185 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2186 tmp = lp_build_and(&intbld, mask, tmp);
2187 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2188 res = lp_build_sub(bld, res, tmp);
2189 }
2190
2191 /* mask out sign bit */
2192 anosign = lp_build_abs(bld, a);
2193 /*
2194 * mask out all values if anosign > 2^24
2195 * This should work both for large ints (all rounding is no-op for them
2196 * because such floats are always exact) as well as special cases like
2197 * NaNs, Infs (taking advantage of the fact they use max exponent).
2198 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2199 */
2200 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2201 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2202 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2203 return lp_build_select(bld, mask, a, res);
2204 }
2205 }
2206
2207
2208 /**
2209 * Return ceiling of float (vector), returning float (vector).
2210 * Ex: ceil( 1.1) = 2.0
2211 * Ex: ceil(-1.1) = -1.0
2212 */
2213 LLVMValueRef
2214 lp_build_ceil(struct lp_build_context *bld,
2215 LLVMValueRef a)
2216 {
2217 LLVMBuilderRef builder = bld->gallivm->builder;
2218 const struct lp_type type = bld->type;
2219
2220 assert(type.floating);
2221 assert(lp_check_value(type, a));
2222
2223 if (arch_rounding_available(type)) {
2224 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2225 }
2226 else {
2227 const struct lp_type type = bld->type;
2228 struct lp_type inttype;
2229 struct lp_build_context intbld;
2230 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2231 LLVMValueRef trunc, res, anosign, mask, tmp;
2232 LLVMTypeRef int_vec_type = bld->int_vec_type;
2233 LLVMTypeRef vec_type = bld->vec_type;
2234
2235 if (type.width != 32) {
2236 char intrinsic[32];
2237 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2238 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2239 }
2240
2241 assert(type.width == 32); /* might want to handle doubles at some point */
2242
2243 inttype = type;
2244 inttype.floating = 0;
2245 lp_build_context_init(&intbld, bld->gallivm, inttype);
2246
2247 /* round by truncation */
2248 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2249 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2250
2251 /*
2252 * fix values if rounding is wrong (for non-special cases)
2253 * - this is the case if trunc < a
2254 */
2255 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2256 /* tmp = trunc < a ? 1.0 : 0.0 */
2257 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2258 tmp = lp_build_and(&intbld, mask, tmp);
2259 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2260 res = lp_build_add(bld, trunc, tmp);
2261
2262 /* mask out sign bit */
2263 anosign = lp_build_abs(bld, a);
2264 /*
2265 * mask out all values if anosign > 2^24
2266 * This should work both for large ints (all rounding is no-op for them
2267 * because such floats are always exact) as well as special cases like
2268 * NaNs, Infs (taking advantage of the fact they use max exponent).
2269 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2270 */
2271 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2272 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2273 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2274 return lp_build_select(bld, mask, a, res);
2275 }
2276 }
2277
2278
2279 /**
2280 * Return fractional part of 'a' computed as a - floor(a)
2281 * Typically used in texture coord arithmetic.
2282 */
2283 LLVMValueRef
2284 lp_build_fract(struct lp_build_context *bld,
2285 LLVMValueRef a)
2286 {
2287 assert(bld->type.floating);
2288 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2289 }
2290
2291
2292 /**
2293 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2294 * against 0.99999(9). (Will also return that value for NaNs.)
2295 */
2296 static inline LLVMValueRef
2297 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2298 {
2299 LLVMValueRef max;
2300
2301 /* this is the largest number smaller than 1.0 representable as float */
2302 max = lp_build_const_vec(bld->gallivm, bld->type,
2303 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2304 return lp_build_min_ext(bld, fract, max,
2305 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2306 }
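/*
 * For 32-bit floats (23-bit mantissa) the constant above works out to
 * 1.0 - 1/2^24 = 0.99999994, the largest float strictly below 1.0.
 */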
2307
2308
2309 /**
2310 * Same as lp_build_fract, but guarantees that the result is always smaller
2311 * than one. Will also return the smaller-than-one value for infs, NaNs.
2312 */
2313 LLVMValueRef
2314 lp_build_fract_safe(struct lp_build_context *bld,
2315 LLVMValueRef a)
2316 {
2317 return clamp_fract(bld, lp_build_fract(bld, a));
2318 }
2319
2320
2321 /**
2322 * Return the integer part of a float (vector) value (== round toward zero).
2323 * The returned value is an integer (vector).
2324 * Ex: itrunc(-1.5) = -1
2325 */
2326 LLVMValueRef
2327 lp_build_itrunc(struct lp_build_context *bld,
2328 LLVMValueRef a)
2329 {
2330 LLVMBuilderRef builder = bld->gallivm->builder;
2331 const struct lp_type type = bld->type;
2332 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2333
2334 assert(type.floating);
2335 assert(lp_check_value(type, a));
2336
2337 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2338 }
2339
2340
2341 /**
2342 * Return float (vector) rounded to nearest integer (vector). The returned
2343 * value is an integer (vector).
2344 * Ex: iround(0.9) = 1
2345 * Ex: iround(-1.5) = -2
2346 */
2347 LLVMValueRef
2348 lp_build_iround(struct lp_build_context *bld,
2349 LLVMValueRef a)
2350 {
2351 LLVMBuilderRef builder = bld->gallivm->builder;
2352 const struct lp_type type = bld->type;
2353 LLVMTypeRef int_vec_type = bld->int_vec_type;
2354 LLVMValueRef res;
2355
2356 assert(type.floating);
2357
2358 assert(lp_check_value(type, a));
2359
2360 if ((util_get_cpu_caps()->has_sse2 &&
2361 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2362 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2363 return lp_build_iround_nearest_sse2(bld, a);
2364 }
2365 if (arch_rounding_available(type)) {
2366 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2367 }
2368 else {
2369 LLVMValueRef half;
2370
2371 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2372
2373 if (type.sign) {
2374 LLVMTypeRef vec_type = bld->vec_type;
2375 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2376 (unsigned long long)1 << (type.width - 1));
2377 LLVMValueRef sign;
2378
2379 /* get sign bit */
2380 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2381 sign = LLVMBuildAnd(builder, sign, mask, "");
2382
2383 /* sign * 0.5 */
2384 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2385 half = LLVMBuildOr(builder, sign, half, "");
2386 half = LLVMBuildBitCast(builder, half, vec_type, "");
2387 }
2388
2389 res = LLVMBuildFAdd(builder, a, half, "");
2390 }
2391
2392 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2393
2394 return res;
2395 }
2396
2397
2398 /**
2399 * Return floor of float (vector), result is an int (vector)
2400 * Ex: ifloor(1.1) = 1
2401 * Ex: ifloor(-1.1) = -2
2402 */
2403 LLVMValueRef
2404 lp_build_ifloor(struct lp_build_context *bld,
2405 LLVMValueRef a)
2406 {
2407 LLVMBuilderRef builder = bld->gallivm->builder;
2408 const struct lp_type type = bld->type;
2409 LLVMTypeRef int_vec_type = bld->int_vec_type;
2410 LLVMValueRef res;
2411
2412 assert(type.floating);
2413 assert(lp_check_value(type, a));
2414
2415 res = a;
2416 if (type.sign) {
2417 if (arch_rounding_available(type)) {
2418 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2419 }
2420 else {
2421 struct lp_type inttype;
2422 struct lp_build_context intbld;
2423 LLVMValueRef trunc, itrunc, mask;
2424
2425 assert(type.floating);
2426 assert(lp_check_value(type, a));
2427
2428 inttype = type;
2429 inttype.floating = 0;
2430 lp_build_context_init(&intbld, bld->gallivm, inttype);
2431
2432 /* round by truncation */
2433 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2434 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2435
2436 /*
2437 * fix values if rounding is wrong (for non-special cases)
2438 * - this is the case if trunc > a
2439 * The results of doing this with NaNs, very large values etc.
2440 * are undefined but this seems to be the case anyway.
2441 */
2442 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2443 /* cheapie minus one with mask since the mask is minus one / zero */
2444 return lp_build_add(&intbld, itrunc, mask);
2445 }
2446 }
2447
2448 /* convert to int (rounds toward zero) */
2449 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2450
2451 return res;
2452 }
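/*
 * Scalar sketch of the emulated signed path above (the vector code adds
 * the comparison mask directly, exploiting that an all-ones mask is the
 * integer -1):
 *
 *    int32_t i = (int32_t)a;      // round toward zero
 *    if ((float)i > a)            // only possible for negative non-integers
 *       i -= 1;                   // correct the truncation down to floor
 */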
2453
2454
2455 /**
2456 * Return ceiling of float (vector), returning int (vector).
2457 * Ex: iceil( 1.1) = 2
2458 * Ex: iceil(-1.1) = -1
2459 */
2460 LLVMValueRef
2461 lp_build_iceil(struct lp_build_context *bld,
2462 LLVMValueRef a)
2463 {
2464 LLVMBuilderRef builder = bld->gallivm->builder;
2465 const struct lp_type type = bld->type;
2466 LLVMTypeRef int_vec_type = bld->int_vec_type;
2467 LLVMValueRef res;
2468
2469 assert(type.floating);
2470 assert(lp_check_value(type, a));
2471
2472 if (arch_rounding_available(type)) {
2473 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2474 }
2475 else {
2476 struct lp_type inttype;
2477 struct lp_build_context intbld;
2478 LLVMValueRef trunc, itrunc, mask;
2479
2480 assert(type.floating);
2481 assert(lp_check_value(type, a));
2482
2483 inttype = type;
2484 inttype.floating = 0;
2485 lp_build_context_init(&intbld, bld->gallivm, inttype);
2486
2487 /* round by truncation */
2488 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2489 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2490
2491 /*
2492 * fix values if rounding is wrong (for non-special cases)
2493 * - this is the case if trunc < a
2494 * The results of doing this with NaNs, very large values etc.
2495 * are undefined but this seems to be the case anyway.
2496 */
2497 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2498 /* cheapie plus one with mask since the mask is minus one / zero */
2499 return lp_build_sub(&intbld, itrunc, mask);
2500 }
2501
2502 /* convert to int (rounds toward zero) */
2503 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2504
2505 return res;
2506 }
2507
2508
2509 /**
2510 * Combined ifloor() & fract().
2511 *
2512 * Preferred to calling the functions separately, as it will ensure that the
2513 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2514 */
2515 void
2516 lp_build_ifloor_fract(struct lp_build_context *bld,
2517 LLVMValueRef a,
2518 LLVMValueRef *out_ipart,
2519 LLVMValueRef *out_fpart)
2520 {
2521 LLVMBuilderRef builder = bld->gallivm->builder;
2522 const struct lp_type type = bld->type;
2523 LLVMValueRef ipart;
2524
2525 assert(type.floating);
2526 assert(lp_check_value(type, a));
2527
2528 if (arch_rounding_available(type)) {
2529 /*
2530 * floor() is easier.
2531 */
2532
2533 ipart = lp_build_floor(bld, a);
2534 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2535 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2536 }
2537 else {
2538 /*
2539 * ifloor() is easier.
2540 */
2541
2542 *out_ipart = lp_build_ifloor(bld, a);
2543 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2544 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2545 }
2546 }
2547
2548
2549 /**
2550 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2551 * always smaller than one.
2552 */
2553 void
2554 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2555 LLVMValueRef a,
2556 LLVMValueRef *out_ipart,
2557 LLVMValueRef *out_fpart)
2558 {
2559 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2560 *out_fpart = clamp_fract(bld, *out_fpart);
2561 }
2562
2563
2564 LLVMValueRef
2565 lp_build_sqrt(struct lp_build_context *bld,
2566 LLVMValueRef a)
2567 {
2568 LLVMBuilderRef builder = bld->gallivm->builder;
2569 const struct lp_type type = bld->type;
2570 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2571 char intrinsic[32];
2572
2573 assert(lp_check_value(type, a));
2574
2575 assert(type.floating);
2576 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2577
2578 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2579 }
2580
2581
2582 /**
2583 * Do one Newton-Raphson step to improve reciprocal precision:
2584 *
2585 * x_{i+1} = x_i + x_i * (1 - a * x_i)
2586 *
2587 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2588 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2589 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2590 * halo. It would be necessary to clamp the argument to prevent this.
2591 *
2592 * See also:
2593 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2594 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2595 */
2596 static inline LLVMValueRef
2597 lp_build_rcp_refine(struct lp_build_context *bld,
2598 LLVMValueRef a,
2599 LLVMValueRef rcp_a)
2600 {
2601 LLVMBuilderRef builder = bld->gallivm->builder;
2602 LLVMValueRef neg_a;
2603 LLVMValueRef res;
2604
2605 neg_a = LLVMBuildFNeg(builder, a, "");
2606 res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2607 res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2608
2609 return res;
2610 }
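/*
 * The two fused multiply-adds above compute, in scalar terms (a sketch,
 * assuming fmaf() from <math.h>):
 *
 *    float e = fmaf(-a, x, 1.0f);   // e = 1 - a*x
 *    x = fmaf(e, x, x);             // x + x*e = x + x*(1 - a*x)
 */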
2611
2612
2613 LLVMValueRef
2614 lp_build_rcp(struct lp_build_context *bld,
2615 LLVMValueRef a)
2616 {
2617 LLVMBuilderRef builder = bld->gallivm->builder;
2618 const struct lp_type type = bld->type;
2619
2620 assert(lp_check_value(type, a));
2621
2622 if (a == bld->zero)
2623 return bld->undef;
2624 if (a == bld->one)
2625 return bld->one;
2626 if (a == bld->undef)
2627 return bld->undef;
2628
2629 assert(type.floating);
2630
2631 if (LLVMIsConstant(a))
2632 return LLVMBuildFDiv(builder, bld->one, a, "");
2633
2634 /*
2635 * We don't use RCPPS because:
2636 * - it only has 10 bits of precision
2637 * - it doesn't even get the reciprocal of 1.0 exactly
2638 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2639 * - for recent processors the benefit over DIVPS is marginal and case
2640 * dependent
2641 *
2642 * We could still use it on certain processors if benchmarks show that the
2643 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2644 * particular uses that require fewer workarounds.
2645 */
2646
2647 if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2648 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
2649 const unsigned num_iterations = 0;
2650 LLVMValueRef res;
2651 unsigned i;
2652 const char *intrinsic = NULL;
2653
2654 if (type.length == 4) {
2655 intrinsic = "llvm.x86.sse.rcp.ps";
2656 }
2657 else {
2658 intrinsic = "llvm.x86.avx.rcp.ps.256";
2659 }
2660
2661 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2662
2663 for (i = 0; i < num_iterations; ++i) {
2664 res = lp_build_rcp_refine(bld, a, res);
2665 }
2666
2667 return res;
2668 }
2669
2670 return LLVMBuildFDiv(builder, bld->one, a, "");
2671 }
2672
2673
2674 /**
2675 * Do one Newton-Raphson step to improve rsqrt precision:
2676 *
2677 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2678 *
2679 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2680 */
2681 static inline LLVMValueRef
2682 lp_build_rsqrt_refine(struct lp_build_context *bld,
2683 LLVMValueRef a,
2684 LLVMValueRef rsqrt_a)
2685 {
2686 LLVMBuilderRef builder = bld->gallivm->builder;
2687 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2688 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2689 LLVMValueRef res;
2690
2691 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2692 res = LLVMBuildFMul(builder, a, res, "");
2693 res = LLVMBuildFSub(builder, three, res, "");
2694 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2695 res = LLVMBuildFMul(builder, half, res, "");
2696
2697 return res;
2698 }
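/*
 * Scalar equivalent of the refinement above (a sketch):
 *
 *    y = 0.5f * y * (3.0f - a * y * y);
 */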
2699
2700
2701 /**
2702 * Generate 1/sqrt(a).
2703 * Result is undefined for values < 0, infinity for +0.
2704 */
2705 LLVMValueRef
2706 lp_build_rsqrt(struct lp_build_context *bld,
2707 LLVMValueRef a)
2708 {
2709 const struct lp_type type = bld->type;
2710
2711 assert(lp_check_value(type, a));
2712
2713 assert(type.floating);
2714
2715 /*
2716 * This should be faster but all denormals will end up as infinity.
2717 */
2718 if (0 && lp_build_fast_rsqrt_available(type)) {
2719 const unsigned num_iterations = 1;
2720 LLVMValueRef res;
2721 unsigned i;
2722
2723 /* rsqrt(1.0) != 1.0 here */
2724 res = lp_build_fast_rsqrt(bld, a);
2725
2726 if (num_iterations) {
2727 /*
2728 * Newton-Raphson will result in NaN instead of infinity for zero,
2729 * and NaN instead of zero for infinity.
2730 * Also, need to ensure rsqrt(1.0) == 1.0.
2731 * All numbers smaller than FLT_MIN will result in +infinity
2732 * (rsqrtps treats all denormals as zero).
2733 */
2734 LLVMValueRef cmp;
2735 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2736 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2737
2738 for (i = 0; i < num_iterations; ++i) {
2739 res = lp_build_rsqrt_refine(bld, a, res);
2740 }
2741 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2742 res = lp_build_select(bld, cmp, inf, res);
2743 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2744 res = lp_build_select(bld, cmp, bld->zero, res);
2745 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2746 res = lp_build_select(bld, cmp, bld->one, res);
2747 }
2748
2749 return res;
2750 }
2751
2752 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2753 }
2754
2755
2756 /**
2757 * Report whether a fast (inaccurate) rsqrt instruction is available.
2758 * Callers may want to avoid rsqrt_fast when it is not: e.g. x^0.5 can be
2759 * computed as rsqrt_fast(x) * x, but if the fast path is emulated that
2760 * expands to sqrt/div/mul, in which case calling sqrt directly is clearly
2761 * better, skipping both the division and the multiplication.
2762 */
2763 boolean
2764 lp_build_fast_rsqrt_available(struct lp_type type)
2765 {
2766 assert(type.floating);
2767
2768 if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2769 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2770 return true;
2771 }
2772 return false;
2773 }
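/*
 * Sketch of how a caller might act on this query when computing x^0.5
 * (illustrative only, using builders defined in this file):
 *
 *    if (lp_build_fast_rsqrt_available(bld->type))
 *       res = lp_build_mul(bld, x, lp_build_fast_rsqrt(bld, x));
 *    else
 *       res = lp_build_sqrt(bld, x);
 */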
2774
2775
2776 /**
2777 * Generate 1/sqrt(a).
2778 * Result is undefined for values < 0, infinity for +0.
2779 * Precision is limited, only ~10 bits guaranteed
2780 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2781 */
2782 LLVMValueRef
2783 lp_build_fast_rsqrt(struct lp_build_context *bld,
2784 LLVMValueRef a)
2785 {
2786 LLVMBuilderRef builder = bld->gallivm->builder;
2787 const struct lp_type type = bld->type;
2788
2789 assert(lp_check_value(type, a));
2790
2791 if (lp_build_fast_rsqrt_available(type)) {
2792 const char *intrinsic = NULL;
2793
2794 if (type.length == 4) {
2795 intrinsic = "llvm.x86.sse.rsqrt.ps";
2796 }
2797 else {
2798 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2799 }
2800 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2801 }
2802 else {
2803 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2804 }
2805 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2806 }
2807
2808
2809 /**
2810 * Generate sin(a) or cos(a) using polynomial approximation.
2811 * TODO: it might be worth recognizing sin and cos of the same source
2812 * (i.e. the d3d10 sincos opcode). Computing both at the same time would
2813 * be far cheaper than calculating (nearly) everything twice, but it is
2814 * unclear whether that case is common enough to be worth the effort; the
2815 * scs opcode could also benefit from computing both.
2816 */
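/*
 * Outline of the range reduction used below (a sketch of the cephes/
 * sse_mathfun scheme): with j = (int)(|x| * 4/Pi) rounded to an even
 * integer, the remainder r = |x| - j*Pi/4 is computed in three steps
 * (DP1 + DP2 + DP3 ~= -Pi/4, split for extra precision); the low bits of j
 * then select between the sin and cos polynomials and, together with the
 * original sign for sin, determine the sign of the result.
 */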
2817 static LLVMValueRef
2818 lp_build_sin_or_cos(struct lp_build_context *bld,
2819 LLVMValueRef a,
2820 boolean cos)
2821 {
2822 struct gallivm_state *gallivm = bld->gallivm;
2823 LLVMBuilderRef b = gallivm->builder;
2824 struct lp_type int_type = lp_int_type(bld->type);
2825
2826 /*
2827 * take the absolute value,
2828 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2829 */
2830
2831 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2832 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2833
2834 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2835 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2836
2837 /*
2838 * scale by 4/Pi
2839 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2840 */
2841
2842 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2843 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2844
2845 /*
2846 * store the integer part of y in mm0
2847 * emm2 = _mm_cvttps_epi32(y);
2848 */
2849
2850 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2851
2852 /*
2853 * j=(j+1) & (~1) (see the cephes sources)
2854 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2855 */
2856
2857 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2858 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2859 /*
2860 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2861 */
2862 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2863 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2864
2865 /*
2866 * y = _mm_cvtepi32_ps(emm2);
2867 */
2868 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2869
2870 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2871 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2872 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2873 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2874
2875 /*
2876 * Argument used for poly selection and sign bit determination
2877 * is different for sin vs. cos.
2878 */
2879 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2880 emm2_and;
2881
2882 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2883 LLVMBuildNot(b, emm2_2, ""), ""),
2884 const_29, "sign_bit") :
2885 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2886 LLVMBuildShl(b, emm2_add,
2887 const_29, ""), ""),
2888 sign_mask, "sign_bit");
2889
2890 /*
2891 * get the polynomial selection mask
2892 * there is one polynomial for 0 <= x <= Pi/4
2893 * and another one for Pi/4 < x <= Pi/2
2894 * Both branches will be computed.
2895 *
2896 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2897 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2898 */
2899
2900 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2901 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2902 int_type, PIPE_FUNC_EQUAL,
2903 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2904
2905 /*
2906 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2907 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2908 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2909 */
2910 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2911 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2912 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2913
2914 /*
2915 * The magic pass: "Extended precision modular arithmetic"
2916 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2917 */
2918 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2919 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2920 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2921
2922 /*
2923 * Evaluate the first polynomial (0 <= x <= Pi/4)
2924 *
2925 * z = _mm_mul_ps(x,x);
2926 */
2927 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2928
2929 /*
2930 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2931 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2932 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2933 */
2934 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2935 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2936 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2937
2938 /*
2939 * y = *(v4sf*)_ps_coscof_p0;
2940 * y = _mm_mul_ps(y, z);
2941 */
2942 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2943 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2944 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2945 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2946
2947
2948 /*
2949 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2950 * y = _mm_sub_ps(y, tmp);
2951 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2952 */
2953 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2954 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2955 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2956 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2957 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2958
2959 /*
2960 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2961 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2962 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2963 */
2964 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2965 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2966 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2967
2968 /*
2969 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2970 *
2971 * y2 = *(v4sf*)_ps_sincof_p0;
2972 * y2 = _mm_mul_ps(y2, z);
2973 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2974 * y2 = _mm_mul_ps(y2, z);
2975 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2976 * y2 = _mm_mul_ps(y2, z);
2977 * y2 = _mm_mul_ps(y2, x);
2978 * y2 = _mm_add_ps(y2, x);
2979 */
2980
2981 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2982 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2983 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2984 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
2985
2986 /*
2987 * select the correct result from the two polynomials
2988 * xmm3 = poly_mask;
2989 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2990 * y = _mm_andnot_ps(xmm3, y);
2991 * y = _mm_or_ps(y,y2);
2992 */
2993 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2994 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2995 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2996 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2997 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2998 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2999
3000 /*
3001 * update the sign
3002 * y = _mm_xor_ps(y, sign_bit);
3003 */
3004 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3005 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3006
3007 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3008
3009 /* clamp output to be within [-1, 1] */
3010 y_result = lp_build_clamp(bld, y_result,
3011 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3012 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3013 /* If a is -inf, inf or NaN then return NaN */
3014 y_result = lp_build_select(bld, isfinite, y_result,
3015 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3016 return y_result;
3017 }
3018
3019
3020 /**
3021 * Generate sin(a)
3022 */
3023 LLVMValueRef
3024 lp_build_sin(struct lp_build_context *bld,
3025 LLVMValueRef a)
3026 {
3027 const struct lp_type type = bld->type;
3028
3029 if (type.width == 16) {
3030 LLVMBuilderRef builder = bld->gallivm->builder;
3031 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3032 char intrinsic[32];
3033 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sin", vec_type);
3034 LLVMValueRef args[] = { a };
3035 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3036 }
3037
3038 return lp_build_sin_or_cos(bld, a, FALSE);
3039 }
3040
3041
3042 /**
3043 * Generate cos(a)
3044 */
3045 LLVMValueRef
3046 lp_build_cos(struct lp_build_context *bld,
3047 LLVMValueRef a)
3048 {
3049 const struct lp_type type = bld->type;
3050
3051 if (type.width == 16) {
3052 LLVMBuilderRef builder = bld->gallivm->builder;
3053 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3054 char intrinsic[32];
3055 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.cos", vec_type);
3056 LLVMValueRef args[] = { a };
3057 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3058 }
3059
3060 return lp_build_sin_or_cos(bld, a, TRUE);
3061 }
3062
3063
3064 /**
3065 * Generate pow(x, y)
3066 */
3067 LLVMValueRef
3068 lp_build_pow(struct lp_build_context *bld,
3069 LLVMValueRef x,
3070 LLVMValueRef y)
3071 {
3072 /* TODO: optimize the constant case */
3073 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3074 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3075 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3076 __FUNCTION__);
3077 }
3078
3079 LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
3080 LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2_safe(bld, x), y));
3081
3082 res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
3083 return res;
3084 }
3085
3086
3087 /**
3088 * Generate exp(x)
3089 */
3090 LLVMValueRef
3091 lp_build_exp(struct lp_build_context *bld,
3092 LLVMValueRef x)
3093 {
3094 /* log2(e) = 1/log(2) */
3095 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3096 1.4426950408889634);
3097
3098 assert(lp_check_value(bld->type, x));
3099
3100 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3101 }
3102
3103
3104 /**
3105 * Generate log(x)
3106 * Behavior is undefined with infs, 0s and nans
3107 */
3108 LLVMValueRef
3109 lp_build_log(struct lp_build_context *bld,
3110 LLVMValueRef x)
3111 {
3112 /* log(2) */
3113 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3114 0.69314718055994529);
3115
3116 assert(lp_check_value(bld->type, x));
3117
3118 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3119 }
3120
3121
3122 /**
3123 * Generate log(x) that handles edge cases (infs, 0s and nans)
3124 */
3125 LLVMValueRef
3126 lp_build_log_safe(struct lp_build_context *bld,
3127 LLVMValueRef x)
3128 {
3129 /* log(2) */
3130 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3131 0.69314718055994529);
3132
3133 assert(lp_check_value(bld->type, x));
3134
3135 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3136 }
3137
3138
3139 /**
3140 * Generate polynomial.
3141 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3142 */
3143 LLVMValueRef
3144 lp_build_polynomial(struct lp_build_context *bld,
3145 LLVMValueRef x,
3146 const double *coeffs,
3147 unsigned num_coeffs)
3148 {
3149 const struct lp_type type = bld->type;
3150 LLVMValueRef even = NULL, odd = NULL;
3151 LLVMValueRef x2;
3152 unsigned i;
3153
3154 assert(lp_check_value(bld->type, x));
3155
3156 /* TODO: optimize the constant case */
3157 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3158 LLVMIsConstant(x)) {
3159 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3160 __FUNCTION__);
3161 }
3162
3163 /*
3164 * Calculate odd and even terms separately to decrease data dependency
3165 * Ex:
3166 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3167 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3168 */
3169 x2 = lp_build_mul(bld, x, x);
3170
3171 for (i = num_coeffs; i--; ) {
3172 LLVMValueRef coeff;
3173
3174 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3175
3176 if (i % 2 == 0) {
3177 if (even)
3178 even = lp_build_mad(bld, x2, even, coeff);
3179 else
3180 even = coeff;
3181 } else {
3182 if (odd)
3183 odd = lp_build_mad(bld, x2, odd, coeff);
3184 else
3185 odd = coeff;
3186 }
3187 }
3188
3189 if (odd)
3190 return lp_build_mad(bld, odd, x, even);
3191 else if (even)
3192 return even;
3193 else
3194 return bld->undef;
3195 }
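/*
 * Worked example of the even/odd split above, for num_coeffs == 4 and
 * x2 = x*x:
 *
 *    even = c0 + x2*c2
 *    odd  = c1 + x2*c3
 *    res  = odd*x + even = c0 + c1*x + c2*x^2 + c3*x^3
 */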
3196
3197
3198 /**
3199 * Minimax polynomial fit of 2**x, in range [0, 1[
3200 */
3201 static const double lp_build_exp2_polynomial[] = {
3202 #if EXP_POLY_DEGREE == 5
3203 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3204 0.693153073200168932794,
3205 0.240153617044375388211,
3206 0.0558263180532956664775,
3207 0.00898934009049466391101,
3208 0.00187757667519147912699
3209 #elif EXP_POLY_DEGREE == 4
3210 1.00000259337069434683,
3211 0.693003834469974940458,
3212 0.24144275689150793076,
3213 0.0520114606103070150235,
3214 0.0135341679161270268764
3215 #elif EXP_POLY_DEGREE == 3
3216 0.999925218562710312959,
3217 0.695833540494823811697,
3218 0.226067155427249155588,
3219 0.0780245226406372992967
3220 #elif EXP_POLY_DEGREE == 2
3221 1.00172476321474503578,
3222 0.657636275736077639316,
3223 0.33718943461968720704
3224 #else
3225 #error
3226 #endif
3227 };
3228
3229
3230 LLVMValueRef
3231 lp_build_exp2(struct lp_build_context *bld,
3232 LLVMValueRef x)
3233 {
3234 LLVMBuilderRef builder = bld->gallivm->builder;
3235 const struct lp_type type = bld->type;
3236 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3237 LLVMValueRef ipart = NULL;
3238 LLVMValueRef fpart = NULL;
3239 LLVMValueRef expipart = NULL;
3240 LLVMValueRef expfpart = NULL;
3241 LLVMValueRef res = NULL;
3242
3243 if (type.floating && type.width == 16) {
3244 char intrinsic[32];
3245 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.exp2", vec_type);
3246 LLVMValueRef args[] = { x };
3247 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3248 }
3249
3250 assert(lp_check_value(bld->type, x));
3251
3252 /* TODO: optimize the constant case */
3253 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3254 LLVMIsConstant(x)) {
3255 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3256 __FUNCTION__);
3257 }
3258
3259 assert(type.floating && type.width == 32);
3260
3261 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3262 * the result is INF and if it's smaller than -126.9 the result is 0 */
3263 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3264 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3265 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3266 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3267
3268 /* ipart = floor(x) */
3269 /* fpart = x - ipart */
3270 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3271
3272 /* expipart = (float) (1 << ipart) */
3273 expipart = LLVMBuildAdd(builder, ipart,
3274 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3275 expipart = LLVMBuildShl(builder, expipart,
3276 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3277 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3278
3279 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3280 ARRAY_SIZE(lp_build_exp2_polynomial));
3281
3282 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3283
3284 return res;
3285 }
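/*
 * The identity used above, spelled out (a sketch): with i = floor(x) and
 * f = x - i in [0, 1), exp2(x) = 2^i * 2^f; 2^i is assembled directly by
 * writing (i + 127) into the exponent field of an IEEE-754 float, and 2^f
 * is approximated by the minimax polynomial declared earlier.
 */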
3286
3287
3288 /**
3289 * Extract the exponent of an IEEE-754 floating point value.
3290 *
3291 * Optionally apply an integer bias.
3292 *
3293 * Result is an integer value with
3294 *
3295 * ifloor(log2(x)) + bias
3296 */
3297 LLVMValueRef
3298 lp_build_extract_exponent(struct lp_build_context *bld,
3299 LLVMValueRef x,
3300 int bias)
3301 {
3302 LLVMBuilderRef builder = bld->gallivm->builder;
3303 const struct lp_type type = bld->type;
3304 unsigned mantissa = lp_mantissa(type);
3305 LLVMValueRef res;
3306
3307 assert(type.floating);
3308
3309 assert(lp_check_value(bld->type, x));
3310
3311 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3312
3313 res = LLVMBuildLShr(builder, x,
3314 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3315 res = LLVMBuildAnd(builder, res,
3316 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3317 res = LLVMBuildSub(builder, res,
3318 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3319
3320 return res;
3321 }
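/*
 * For 32-bit floats this amounts to ((bits >> 23) & 0xff) - (127 - bias),
 * i.e. the unbiased IEEE-754 exponent plus 'bias'.
 */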
3322
3323
3324 /**
3325 * Extract the mantissa of an IEEE-754 floating point value.
3326 *
3327 * Result is a floating point value with
3328 *
3329 * x / 2^ifloor(log2(x))
3330 */
3331 LLVMValueRef
3332 lp_build_extract_mantissa(struct lp_build_context *bld,
3333 LLVMValueRef x)
3334 {
3335 LLVMBuilderRef builder = bld->gallivm->builder;
3336 const struct lp_type type = bld->type;
3337 unsigned mantissa = lp_mantissa(type);
3338 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3339 (1ULL << mantissa) - 1);
3340 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3341 LLVMValueRef res;
3342
3343 assert(lp_check_value(bld->type, x));
3344
3345 assert(type.floating);
3346
3347 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3348
3349 /* res = x / 2**ipart */
3350 res = LLVMBuildAnd(builder, x, mantmask, "");
3351 res = LLVMBuildOr(builder, res, one, "");
3352 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3353
3354 return res;
3355 }
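/*
 * In IEEE-754 terms the masking above keeps the mantissa bits of x and
 * forces the exponent field to that of 1.0, so the result always lies in
 * [1, 2). Worked example: x = 12.0 = 1.5 * 2^3 yields 1.5.
 */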
3356
3357
3358
3359 /**
3360 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3361 * These coefficients can be generated with
3362 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3363 */
3364 static const double lp_build_log2_polynomial[] = {
3365 #if LOG_POLY_DEGREE == 5
3366 2.88539008148777786488L,
3367 0.961796878841293367824L,
3368 0.577058946784739859012L,
3369 0.412914355135828735411L,
3370 0.308591899232910175289L,
3371 0.352376952300281371868L,
3372 #elif LOG_POLY_DEGREE == 4
3373 2.88539009343309178325L,
3374 0.961791550404184197881L,
3375 0.577440339438736392009L,
3376 0.403343858251329912514L,
3377 0.406718052498846252698L,
3378 #elif LOG_POLY_DEGREE == 3
3379 2.88538959748872753838L,
3380 0.961932915889597772928L,
3381 0.571118517972136195241L,
3382 0.493997535084709500285L,
3383 #else
3384 #error
3385 #endif
3386 };
3387
3388
3389 /**
3390 * See http://www.devmaster.net/forums/showthread.php?p=43580
3391 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3392 * http://www.nezumi.demon.co.uk/consult/logx.htm
3393 *
3394 * If handle_edge_cases is true the function will perform computations
3395 * to match the required D3D10+ behavior for each of the edge cases.
3396 * That means that if input is:
3397 * - less than zero (to and including -inf) then NaN will be returned
3398 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3399 * - +infinity, then +infinity will be returned
3400 * - NaN, then NaN will be returned
3401 *
3402 * Those checks are fairly expensive so if you don't need them make sure
3403 * handle_edge_cases is false.
3404 */
3405 void
3406 lp_build_log2_approx(struct lp_build_context *bld,
3407 LLVMValueRef x,
3408 LLVMValueRef *p_exp,
3409 LLVMValueRef *p_floor_log2,
3410 LLVMValueRef *p_log2,
3411 boolean handle_edge_cases)
3412 {
3413 LLVMBuilderRef builder = bld->gallivm->builder;
3414 const struct lp_type type = bld->type;
3415 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3416 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3417
3418 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3419 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3420 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3421
3422 LLVMValueRef i = NULL;
3423 LLVMValueRef y = NULL;
3424 LLVMValueRef z = NULL;
3425 LLVMValueRef exp = NULL;
3426 LLVMValueRef mant = NULL;
3427 LLVMValueRef logexp = NULL;
3428 LLVMValueRef p_z = NULL;
3429 LLVMValueRef res = NULL;
3430
3431 if (bld->type.width == 16) {
3432 char intrinsic[32];
3433 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.log2", bld->vec_type);
3434 LLVMValueRef args[] = { x };
3435 if (p_log2)
3436 *p_log2 = lp_build_intrinsic(builder, intrinsic, bld->vec_type, args, 1, 0);
3437 return;
3438 }
3439
3440 assert(lp_check_value(bld->type, x));
3441
3442 if (p_exp || p_floor_log2 || p_log2) {
3443 /* TODO: optimize the constant case */
3444 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3445 LLVMIsConstant(x)) {
3446 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3447 __FUNCTION__);
3448 }
3449
3450 assert(type.floating && type.width == 32);
3451
3452 /*
3453 * We don't explicitly handle denormalized numbers. They will yield a
3454 * result in the neighbourhood of -127, which appears to be
3455 * adequate.
3456 */
3457
3458 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3459
3460 /* exp = (float) exponent(x) */
3461 exp = LLVMBuildAnd(builder, i, expmask, "");
3462 }
3463
3464 if (p_floor_log2 || p_log2) {
3465 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3466 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3467 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3468 }
3469
3470 if (p_log2) {
3471 /* mant = 1 + (float) mantissa(x) */
3472 mant = LLVMBuildAnd(builder, i, mantmask, "");
3473 mant = LLVMBuildOr(builder, mant, one, "");
3474 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3475
3476 /* y = (mant - 1) / (mant + 1) */
3477 y = lp_build_div(bld,
3478 lp_build_sub(bld, mant, bld->one),
3479 lp_build_add(bld, mant, bld->one));
3480
3481 /* z = y^2 */
3482 z = lp_build_mul(bld, y, y);
3483
3484 /* compute P(z) */
3485 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3486 ARRAY_SIZE(lp_build_log2_polynomial));
3487
3488 /* y * P(z) + logexp */
3489 res = lp_build_mad(bld, y, p_z, logexp);
3490
3491 if (type.floating && handle_edge_cases) {
3492 LLVMValueRef negmask, infmask, zmask;
3493 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3494 lp_build_const_vec(bld->gallivm, type, 0.0f));
3495 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3496 lp_build_const_vec(bld->gallivm, type, 0.0f));
3497 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3498 lp_build_const_vec(bld->gallivm, type, INFINITY));
3499
3500 /* If x is equal to +inf, make sure we return +inf */
3501 res = lp_build_select(bld, infmask,
3502 lp_build_const_vec(bld->gallivm, type, INFINITY),
3503 res);
3504 /* If x is equal to 0, return -inf */
3505 res = lp_build_select(bld, zmask,
3506 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3507 res);
3508 /* If x is NaN or less than 0, return NaN */
3509 res = lp_build_select(bld, negmask,
3510 lp_build_const_vec(bld->gallivm, type, NAN),
3511 res);
3512 }
3513 }
3514
3515 if (p_exp) {
3516 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3517 *p_exp = exp;
3518 }
3519
3520 if (p_floor_log2)
3521 *p_floor_log2 = logexp;
3522
3523 if (p_log2)
3524 *p_log2 = res;
3525 }
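
/*
 * Scalar outline of the approximation built above for the common 32-bit
 * float path, without the edge-case handling (a sketch for reference,
 * assuming <stdint.h>; log2_approx_ref and eval_log2_poly are the
 * hypothetical helpers sketched in the comments above, not generated code):
 *
 *    static inline float
 *    log2_approx_ref(float x)
 *    {
 *       union { float f; uint32_t i; } u = { x };
 *       float logexp = (float)((int)((u.i & 0x7f800000u) >> 23) - 127);   // floor(log2(x))
 *       union { uint32_t i; float f; } m = { (u.i & 0x007fffffu) | 0x3f800000u };
 *       float y = (m.f - 1.0f) / (m.f + 1.0f);   // maps mantissa [1, 2) to [0, 1/3)
 *       float z = y * y;                         // z in [0, 1/9), the polynomial's fit range
 *       return y * (float)eval_log2_poly(z) + logexp;   // y*P(z) + floor(log2(x))
 *    }
 */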
3526
3527
3528 /*
3529 * log2 implementation which doesn't have special code to
3530 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3531 * the results for those cases are undefined.
3532 */
3533 LLVMValueRef
3534 lp_build_log2(struct lp_build_context *bld,
3535 LLVMValueRef x)
3536 {
3537 LLVMValueRef res;
3538 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3539 return res;
3540 }
3541
3542
3543 /*
3544 * Version of log2 which handles all edge cases.
3545 * Look at documentation of lp_build_log2_approx for
3546 * description of the behavior for each of the edge cases.
3547 */
3548 LLVMValueRef
3549 lp_build_log2_safe(struct lp_build_context *bld,
3550 LLVMValueRef x)
3551 {
3552 LLVMValueRef res;
3553 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3554 return res;
3555 }
3556
3557
3558 /**
3559 * Faster (and less accurate) log2.
3560 *
3561 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3562 *
3563 * Piece-wise linear approximation, with exact results when x is a
3564 * power of two.
3565 *
3566 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3567 */
3568 LLVMValueRef
3569 lp_build_fast_log2(struct lp_build_context *bld,
3570 LLVMValueRef x)
3571 {
3572 LLVMBuilderRef builder = bld->gallivm->builder;
3573 LLVMValueRef ipart;
3574 LLVMValueRef fpart;
3575
3576 assert(lp_check_value(bld->type, x));
3577
3578 assert(bld->type.floating);
3579
3580 /* ipart = floor(log2(x)) - 1 */
3581 ipart = lp_build_extract_exponent(bld, x, -1);
3582 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3583
3584 /* fpart = x / 2**ipart */
3585 fpart = lp_build_extract_mantissa(bld, x);
3586
3587 /* ipart + fpart */
3588 return LLVMBuildFAdd(builder, ipart, fpart, "");
3589 }
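
/*
 * Equivalent scalar computation (a sketch assuming 32-bit floats and
 * <stdint.h>; fast_log2_ref is a hypothetical name):
 *
 *    static inline float
 *    fast_log2_ref(float x)
 *    {
 *       union { float f; uint32_t i; } u = { x };
 *       int ipart = (int)((u.i >> 23) & 0xff) - 127 - 1;   // floor(log2(x)) - 1
 *       union { uint32_t i; float f; } m = { (u.i & 0x007fffffu) | 0x3f800000u };
 *       return (float)ipart + m.f;                         // exact when x is a power of two
 *    }
 */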
3590
3591
3592 /**
3593 * Fast implementation of iround(log2(x)).
3594 *
3595 * Not an approximation -- it should give accurate results all the time.
3596 */
3597 LLVMValueRef
3598 lp_build_ilog2(struct lp_build_context *bld,
3599 LLVMValueRef x)
3600 {
3601 LLVMBuilderRef builder = bld->gallivm->builder;
3602 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3603 LLVMValueRef ipart;
3604
3605 assert(bld->type.floating);
3606
3607 assert(lp_check_value(bld->type, x));
3608
3609 /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3610 x = LLVMBuildFMul(builder, x, sqrt2, "");
3611
3612 /* ipart = floor(log2(x) + 0.5) */
3613 ipart = lp_build_extract_exponent(bld, x, 0);
3614
3615 return ipart;
3616 }
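
/*
 * Scalar sketch of the rounding trick above (assuming 32-bit floats,
 * <stdint.h> and <math.h>; ilog2_ref is a hypothetical name):
 *
 *    static inline int
 *    ilog2_ref(float x)
 *    {
 *       union { float f; uint32_t i; } u = { x * (float)M_SQRT2 };  // shifts log2(x) by +0.5
 *       return (int)((u.i >> 23) & 0xff) - 127;                     // floor(log2(x) + 0.5)
 *    }
 */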
3617
3618 LLVMValueRef
3619 lp_build_mod(struct lp_build_context *bld,
3620 LLVMValueRef x,
3621 LLVMValueRef y)
3622 {
3623 LLVMBuilderRef builder = bld->gallivm->builder;
3624 LLVMValueRef res;
3625 const struct lp_type type = bld->type;
3626
3627 assert(lp_check_value(type, x));
3628 assert(lp_check_value(type, y));
3629
3630 if (type.floating)
3631 res = LLVMBuildFRem(builder, x, y, "");
3632 else if (type.sign)
3633 res = LLVMBuildSRem(builder, x, y, "");
3634 else
3635 res = LLVMBuildURem(builder, x, y, "");
3636 return res;
3637 }
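
/*
 * Note that LLVM's FRem/SRem/URem follow the semantics of C's fmodf() and
 * the % operator, i.e. the remainder takes the sign of the dividend, so a
 * rough scalar equivalent of the three cases above would be:
 *
 *    float    r_f = fmodf(x, y);   // type.floating
 *    int32_t  r_s = x % y;         // signed integers
 *    uint32_t r_u = x % y;         // unsigned integers
 */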
3638
3639
3640 /*
3641 * For floating-point inputs, creates and returns a mask
3642 * which is all 1's for channels of x which are NaN,
3643 * and all 0's for channels which are not NaN.
3644 */
3645 LLVMValueRef
3646 lp_build_isnan(struct lp_build_context *bld,
3647 LLVMValueRef x)
3648 {
3649 LLVMValueRef mask;
3650 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3651
3652 assert(bld->type.floating);
3653 assert(lp_check_value(bld->type, x));
3654
3655 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3656 "isnotnan");
3657 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3658 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3659 return mask;
3660 }
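
/*
 * Scalar equivalent (sketch): NaN is the only value that compares unequal
 * to itself, so per channel this amounts to
 *
 *    int32_t isnan_ref(float x) { return (x == x) ? 0 : ~0; }
 */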
3661
3662
3663 /* Returns all 1's for floating point numbers that are
3664 * finite, and all 0's for -inf, +inf and NaN.
3665 */
3666 LLVMValueRef
3667 lp_build_isfinite(struct lp_build_context *bld,
3668 LLVMValueRef x)
3669 {
3670 LLVMBuilderRef builder = bld->gallivm->builder;
3671 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3672 struct lp_type int_type = lp_int_type(bld->type);
3673 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3674 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3675 0x7f800000);
3676
3677 if (!bld->type.floating) {
3678 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3679 }
3680 assert(bld->type.floating);
3681 assert(lp_check_value(bld->type, x));
3682 assert(bld->type.width == 32);
3683
3684 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3685 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3686 intx, infornan32);
3687 }
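
/*
 * Scalar equivalent (sketch, 32-bit floats with <stdint.h>): a value is
 * finite iff its exponent field is not all ones:
 *
 *    static inline int32_t
 *    isfinite_ref(float x)
 *    {
 *       union { float f; uint32_t i; } u = { x };
 *       return ((u.i & 0x7f800000u) != 0x7f800000u) ? ~0 : 0;
 *    }
 */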
3688
3689
3690 /*
3691 * Returns true if the number is NaN or +/-inf and false otherwise.
3692 * The input has to be a floating point vector.
3693 */
3694 LLVMValueRef
3695 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3696 const struct lp_type type,
3697 LLVMValueRef x)
3698 {
3699 LLVMBuilderRef builder = gallivm->builder;
3700 struct lp_type int_type = lp_int_type(type);
3701 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3702 0x7f800000);
3703 LLVMValueRef ret;
3704
3705 assert(type.floating);
3706
3707 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3708 ret = LLVMBuildAnd(builder, ret, const0, "");
3709 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3710 ret, const0);
3711
3712 return ret;
3713 }
3714
3715
3716 LLVMValueRef
3717 lp_build_fpstate_get(struct gallivm_state *gallivm)
3718 {
3719 if (util_get_cpu_caps()->has_sse) {
3720 LLVMBuilderRef builder = gallivm->builder;
3721 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3722 gallivm,
3723 LLVMInt32TypeInContext(gallivm->context),
3724 "mxcsr_ptr");
3725 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3726 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3727 lp_build_intrinsic(builder,
3728 "llvm.x86.sse.stmxcsr",
3729 LLVMVoidTypeInContext(gallivm->context),
3730 &mxcsr_ptr8, 1, 0);
3731 return mxcsr_ptr;
3732 }
3733 return 0;
3734 }
3735
3736 void
3737 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3738 boolean zero)
3739 {
3740 if (util_get_cpu_caps()->has_sse) {
3741 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3742 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3743
3744 LLVMBuilderRef builder = gallivm->builder;
3745 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3746 LLVMValueRef mxcsr =
3747 LLVMBuildLoad2(builder, LLVMInt32TypeInContext(gallivm->context), mxcsr_ptr, "mxcsr");
3748
3749 if (util_get_cpu_caps()->has_daz) {
3750 /* Enable denormals-are-zero (DAZ) mode */
3751 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3752 }
3753 if (zero) {
3754 mxcsr = LLVMBuildOr(builder, mxcsr,
3755 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3756 } else {
3757 mxcsr = LLVMBuildAnd(builder, mxcsr,
3758 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3759 }
3760
3761 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3762 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3763 }
3764 }
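
/*
 * What the emitted IR does at run time, expressed with the SSE intrinsics
 * from <xmmintrin.h> (a sketch for illustration; set_denorms_zero_ref is a
 * hypothetical name, not part of this file):
 *
 *    static inline void
 *    set_denorms_zero_ref(boolean zero)
 *    {
 *       unsigned daz_ftz = _MM_FLUSH_ZERO_MASK;        // FTZ = 0x8000
 *       if (util_get_cpu_caps()->has_daz)
 *          daz_ftz |= _MM_DENORMALS_ZERO_MASK;         // DAZ = 0x0040
 *       unsigned mxcsr = _mm_getcsr();
 *       _mm_setcsr(zero ? (mxcsr | daz_ftz) : (mxcsr & ~daz_ftz));
 *    }
 */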
3765
3766
3767 void
3768 lp_build_fpstate_set(struct gallivm_state *gallivm,
3769 LLVMValueRef mxcsr_ptr)
3770 {
3771 if (util_get_cpu_caps()->has_sse) {
3772 LLVMBuilderRef builder = gallivm->builder;
3773 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3774 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3775 lp_build_intrinsic(builder,
3776 "llvm.x86.sse.ldmxcsr",
3777 LLVMVoidTypeInContext(gallivm->context),
3778 &mxcsr_ptr, 1, 0);
3779 }
3780 }
3781