1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include <llvm/Config/llvm-config.h>
51
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
56
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
67
68 #if DETECT_ARCH_SSE
69 #include <xmmintrin.h>
70 #endif
71
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #endif
75
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #endif
79
80 #define EXP_POLY_DEGREE 5
81
82 #define LOG_POLY_DEGREE 4
83
84
85 /**
86 * Generate min(a, b)
87 * No checks for special-case values of a or b (0 or 1) are done.
88 * NaNs are handled according to the behavior specified by the
89 * nan_behavior argument.
90 */
91 static LLVMValueRef
92 lp_build_min_simple(struct lp_build_context *bld,
93 LLVMValueRef a,
94 LLVMValueRef b,
95 enum gallivm_nan_behavior nan_behavior)
96 {
97 const struct lp_type type = bld->type;
98 const char *intrinsic = NULL;
99 unsigned intr_size = 0;
100 LLVMValueRef cond;
101
102 assert(lp_check_value(type, a));
103 assert(lp_check_value(type, b));
104
105 /* TODO: optimize the constant case */
106
107 if (type.floating && util_get_cpu_caps()->has_sse) {
108 if (type.width == 32) {
109 if (type.length == 1) {
110 intrinsic = "llvm.x86.sse.min.ss";
111 intr_size = 128;
112 }
113 else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
114 intrinsic = "llvm.x86.sse.min.ps";
115 intr_size = 128;
116 }
117 else {
118 intrinsic = "llvm.x86.avx.min.ps.256";
119 intr_size = 256;
120 }
121 }
122 if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
123 if (type.length == 1) {
124 intrinsic = "llvm.x86.sse2.min.sd";
125 intr_size = 128;
126 }
127 else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
128 intrinsic = "llvm.x86.sse2.min.pd";
129 intr_size = 128;
130 }
131 else {
132 intrinsic = "llvm.x86.avx.min.pd.256";
133 intr_size = 256;
134 }
135 }
136 }
137 else if (type.floating && util_get_cpu_caps()->has_altivec) {
138 if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140 __func__);
141 }
142 if (type.width == 32 && type.length == 4) {
143 intrinsic = "llvm.ppc.altivec.vminfp";
144 intr_size = 128;
145 }
146 } else if (util_get_cpu_caps()->has_altivec) {
147 intr_size = 128;
148 if (type.width == 8) {
149 if (!type.sign) {
150 intrinsic = "llvm.ppc.altivec.vminub";
151 } else {
152 intrinsic = "llvm.ppc.altivec.vminsb";
153 }
154 } else if (type.width == 16) {
155 if (!type.sign) {
156 intrinsic = "llvm.ppc.altivec.vminuh";
157 } else {
158 intrinsic = "llvm.ppc.altivec.vminsh";
159 }
160 } else if (type.width == 32) {
161 if (!type.sign) {
162 intrinsic = "llvm.ppc.altivec.vminuw";
163 } else {
164 intrinsic = "llvm.ppc.altivec.vminsw";
165 }
166 }
167 }
168
169 if (intrinsic) {
170 /* We need to handle NaNs for floating point numbers. If one of the
171 * inputs is NaN the other should be returned (required by both D3D10+
172 * and OpenCL).
173 * The sse intrinsics return the second operand in case of NaN by
174 * default, so we need special code to handle those.
175 */
176 if (util_get_cpu_caps()->has_sse && type.floating &&
177 nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
178 LLVMValueRef isnan, min;
179 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
180 type,
181 intr_size, a, b);
182 isnan = lp_build_isnan(bld, b);
183 return lp_build_select(bld, isnan, a, min);
184 } else {
185 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
186 type,
187 intr_size, a, b);
188 }
189 }
190
191 if (type.floating) {
192 switch (nan_behavior) {
193 case GALLIVM_NAN_RETURN_OTHER: {
194 LLVMValueRef isnan = lp_build_isnan(bld, a);
195 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
196 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
197 return lp_build_select(bld, cond, a, b);
198 }
199 break;
200 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
201 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
202 return lp_build_select(bld, cond, a, b);
203 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
204 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
205 return lp_build_select(bld, cond, b, a);
206 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
207 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
208 return lp_build_select(bld, cond, a, b);
209 break;
210 default:
211 assert(0);
212 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
213 return lp_build_select(bld, cond, a, b);
214 }
215 } else {
216 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
217 return lp_build_select(bld, cond, a, b);
218 }
219 }
220
221
222 LLVMValueRef
223 lp_build_fmuladd(LLVMBuilderRef builder,
224 LLVMValueRef a,
225 LLVMValueRef b,
226 LLVMValueRef c)
227 {
228 LLVMTypeRef type = LLVMTypeOf(a);
229 assert(type == LLVMTypeOf(b));
230 assert(type == LLVMTypeOf(c));
231
232 char intrinsic[32];
233 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
234 LLVMValueRef args[] = { a, b, c };
235 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
236 }
237
238
239 /**
240 * Generate max(a, b)
241 * No checks for special-case values of a or b (0 or 1) are done.
242 * NaNs are handled according to the behavior specified by the
243 * nan_behavior argument.
244 */
245 static LLVMValueRef
246 lp_build_max_simple(struct lp_build_context *bld,
247 LLVMValueRef a,
248 LLVMValueRef b,
249 enum gallivm_nan_behavior nan_behavior)
250 {
251 const struct lp_type type = bld->type;
252 const char *intrinsic = NULL;
253 unsigned intr_size = 0;
254 LLVMValueRef cond;
255
256 assert(lp_check_value(type, a));
257 assert(lp_check_value(type, b));
258
259 /* TODO: optimize the constant case */
260
261 if (type.floating && util_get_cpu_caps()->has_sse) {
262 if (type.width == 32) {
263 if (type.length == 1) {
264 intrinsic = "llvm.x86.sse.max.ss";
265 intr_size = 128;
266 }
267 else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
268 intrinsic = "llvm.x86.sse.max.ps";
269 intr_size = 128;
270 }
271 else {
272 intrinsic = "llvm.x86.avx.max.ps.256";
273 intr_size = 256;
274 }
275 }
276 if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
277 if (type.length == 1) {
278 intrinsic = "llvm.x86.sse2.max.sd";
279 intr_size = 128;
280 }
281 else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
282 intrinsic = "llvm.x86.sse2.max.pd";
283 intr_size = 128;
284 }
285 else {
286 intrinsic = "llvm.x86.avx.max.pd.256";
287 intr_size = 256;
288 }
289 }
290 }
291 else if (type.floating && util_get_cpu_caps()->has_altivec) {
292 if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
293 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
294 __func__);
295 }
296 if (type.width == 32 && type.length == 4) {
297 intrinsic = "llvm.ppc.altivec.vmaxfp";
298 intr_size = 128;
299 }
300 } else if (util_get_cpu_caps()->has_altivec) {
301 intr_size = 128;
302 if (type.width == 8) {
303 if (!type.sign) {
304 intrinsic = "llvm.ppc.altivec.vmaxub";
305 } else {
306 intrinsic = "llvm.ppc.altivec.vmaxsb";
307 }
308 } else if (type.width == 16) {
309 if (!type.sign) {
310 intrinsic = "llvm.ppc.altivec.vmaxuh";
311 } else {
312 intrinsic = "llvm.ppc.altivec.vmaxsh";
313 }
314 } else if (type.width == 32) {
315 if (!type.sign) {
316 intrinsic = "llvm.ppc.altivec.vmaxuw";
317 } else {
318 intrinsic = "llvm.ppc.altivec.vmaxsw";
319 }
320 }
321 }
322
323 if (intrinsic) {
324 if (util_get_cpu_caps()->has_sse && type.floating &&
325 nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
326 LLVMValueRef isnan, max;
327 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
328 type,
329 intr_size, a, b);
330 isnan = lp_build_isnan(bld, b);
331 return lp_build_select(bld, isnan, a, max);
332 } else {
333 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
334 type,
335 intr_size, a, b);
336 }
337 }
338
339 if (type.floating) {
340 switch (nan_behavior) {
341 case GALLIVM_NAN_RETURN_OTHER: {
342 LLVMValueRef isnan = lp_build_isnan(bld, a);
343 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
344 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
345 return lp_build_select(bld, cond, a, b);
346 }
347 break;
348 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
349 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
350 return lp_build_select(bld, cond, a, b);
351 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
352 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
353 return lp_build_select(bld, cond, b, a);
354 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
355 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
356 return lp_build_select(bld, cond, a, b);
357 break;
358 default:
359 assert(0);
360 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
361 return lp_build_select(bld, cond, a, b);
362 }
363 } else {
364 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
365 return lp_build_select(bld, cond, a, b);
366 }
367 }
368
369
370 /**
371 * Generate 1 - a, or ~a depending on bld->type.
372 */
373 LLVMValueRef
374 lp_build_comp(struct lp_build_context *bld,
375 LLVMValueRef a)
376 {
377 LLVMBuilderRef builder = bld->gallivm->builder;
378 const struct lp_type type = bld->type;
379
380 assert(lp_check_value(type, a));
381
382 if (a == bld->one)
383 return bld->zero;
384 if (a == bld->zero)
385 return bld->one;
386
387 if (type.norm && !type.floating && !type.fixed && !type.sign) {
388 if (LLVMIsConstant(a))
389 return LLVMConstNot(a);
390 else
391 return LLVMBuildNot(builder, a, "");
392 }
393
394 if (type.floating)
395 return LLVMBuildFSub(builder, bld->one, a, "");
396 else
397 return LLVMBuildSub(builder, bld->one, a, "");
398 }
399
400
401 /**
402 * Generate a + b
403 */
404 LLVMValueRef
405 lp_build_add(struct lp_build_context *bld,
406 LLVMValueRef a,
407 LLVMValueRef b)
408 {
409 LLVMBuilderRef builder = bld->gallivm->builder;
410 const struct lp_type type = bld->type;
411 LLVMValueRef res;
412
413 assert(lp_check_value(type, a));
414 assert(lp_check_value(type, b));
415
416 if (a == bld->zero)
417 return b;
418 if (b == bld->zero)
419 return a;
420 if (a == bld->undef || b == bld->undef)
421 return bld->undef;
422
423 if (type.norm) {
424 const char *intrinsic = NULL;
425
426 if (!type.sign && (a == bld->one || b == bld->one))
427 return bld->one;
428
429 if (!type.floating && !type.fixed) {
430 if (LLVM_VERSION_MAJOR >= 8) {
431 char intrin[32];
432 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
433 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
434 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
435 }
436 if (type.width * type.length == 128) {
437 if (util_get_cpu_caps()->has_sse2) {
438 if (type.width == 8)
439 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
440 if (type.width == 16)
441 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
442 } else if (util_get_cpu_caps()->has_altivec) {
443 if (type.width == 8)
444 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
445 if (type.width == 16)
446 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
447 }
448 }
449 if (type.width * type.length == 256) {
450 if (util_get_cpu_caps()->has_avx2) {
451 if (type.width == 8)
452 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
453 if (type.width == 16)
454 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
455 }
456 }
457 }
458
459 if (intrinsic)
460 return lp_build_intrinsic_binary(builder, intrinsic,
461 lp_build_vec_type(bld->gallivm, bld->type), a, b);
462 }
463
464 if (type.norm && !type.floating && !type.fixed) {
465 if (type.sign) {
466 uint64_t sign = (uint64_t)1 << (type.width - 1);
467 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
468 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
469 /* a_clamp_max is the maximum a for positive b,
470 a_clamp_min is the minimum a for negative b. */
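/*
 * Scalar sketch of the clamping below (using the max_val/min_val
 * limits defined above):
 *
 *    if (b > 0) a = MIN2(a, max_val - b);
 *    else       a = MAX2(a, min_val - b);
 *
 * so that the plain add afterwards can no longer overflow.
 */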
471 LLVMValueRef a_clamp_max =
472 lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""),
473 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
474 LLVMValueRef a_clamp_min =
475 lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""),
476 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
477 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b,
478 bld->zero), a_clamp_max, a_clamp_min);
479 }
480 }
481
482 if (type.floating)
483 res = LLVMBuildFAdd(builder, a, b, "");
484 else
485 res = LLVMBuildAdd(builder, a, b, "");
486
487 /* clamp to ceiling of 1.0 */
488 if (bld->type.norm && (bld->type.floating || bld->type.fixed))
489 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
490
491 if (type.norm && !type.floating && !type.fixed) {
492 if (!type.sign) {
493 /*
494 * newer llvm versions no longer support the intrinsics, but recognize
495 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
496 * code, it is important we match the pattern llvm uses (and pray llvm
497 * doesn't change it - and hope they decide on the same pattern for
498 * all backends supporting it...).
499 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
500 * interfere with llvm's ability to recognize the pattern but seems
501 * a bit brittle.
502 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
503 */
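/*
 * Scalar equivalent of the pattern (sketch):
 *
 *    res = a + b;
 *    res = (a > res) ? ~0 : res;    (wrapped around, clamp to all ones)
 */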
504 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
505 res = lp_build_select(bld, overflowed,
506 LLVMConstAllOnes(bld->int_vec_type), res);
507 }
508 }
509
510 /* XXX clamp to floor of -1 or 0??? */
511
512 return res;
513 }
514
515
516 /** Return the scalar sum of the elements of a.
517 * Callers should avoid this operation whenever possible.
518 */
519 LLVMValueRef
520 lp_build_horizontal_add(struct lp_build_context *bld,
521 LLVMValueRef a)
522 {
523 LLVMBuilderRef builder = bld->gallivm->builder;
524 const struct lp_type type = bld->type;
525 LLVMValueRef index, res;
526 unsigned i, length;
527 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
528 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
529 LLVMValueRef vecres, elem2;
530
531 assert(lp_check_value(type, a));
532
533 if (type.length == 1) {
534 return a;
535 }
536
537 assert(!bld->type.norm);
538
539 /*
540 * For byte vectors we could do much better with psadbw.
541 * Using repeated shuffle/adds here. Note with multiple vectors
542 * this can be done more efficiently as outlined in the intel
543 * optimization manual.
544 * Note: could cause data rearrangement if used with smaller element
545 * sizes.
546 */
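/*
 * E.g. for a 4-wide vector {a,b,c,d} the loop below does a single
 * shuffle/add step, {a,b} + {c,d} = {a+c, b+d}, and the final two
 * extracts plus add produce the scalar (a+c) + (b+d).
 */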
547
548 vecres = a;
549 length = type.length / 2;
550 while (length > 1) {
551 LLVMValueRef vec1, vec2;
552 for (i = 0; i < length; i++) {
553 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
554 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
555 }
556 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
557 LLVMConstVector(shuffles1, length), "");
558 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
559 LLVMConstVector(shuffles2, length), "");
560 if (type.floating) {
561 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
562 }
563 else {
564 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
565 }
566 length = length >> 1;
567 }
568
569 /* always have vector of size 2 here */
570 assert(length == 1);
571
572 index = lp_build_const_int32(bld->gallivm, 0);
573 res = LLVMBuildExtractElement(builder, vecres, index, "");
574 index = lp_build_const_int32(bld->gallivm, 1);
575 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
576
577 if (type.floating)
578 res = LLVMBuildFAdd(builder, res, elem2, "");
579 else
580 res = LLVMBuildAdd(builder, res, elem2, "");
581
582 return res;
583 }
584
585
586 /**
587 * Return the horizontal sums of 4 float vectors as a float4 vector.
588 * This uses the technique as outlined in Intel Optimization Manual.
589 */
590 static LLVMValueRef
591 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
592 LLVMValueRef src[4])
593 {
594 struct gallivm_state *gallivm = bld->gallivm;
595 LLVMBuilderRef builder = gallivm->builder;
596 LLVMValueRef shuffles[4];
597 LLVMValueRef tmp[4];
598 LLVMValueRef sumtmp[2], shuftmp[2];
599
600 /* lower half of regs */
601 shuffles[0] = lp_build_const_int32(gallivm, 0);
602 shuffles[1] = lp_build_const_int32(gallivm, 1);
603 shuffles[2] = lp_build_const_int32(gallivm, 4);
604 shuffles[3] = lp_build_const_int32(gallivm, 5);
605 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
606 LLVMConstVector(shuffles, 4), "");
607 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
608 LLVMConstVector(shuffles, 4), "");
609
610 /* upper half of regs */
611 shuffles[0] = lp_build_const_int32(gallivm, 2);
612 shuffles[1] = lp_build_const_int32(gallivm, 3);
613 shuffles[2] = lp_build_const_int32(gallivm, 6);
614 shuffles[3] = lp_build_const_int32(gallivm, 7);
615 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
616 LLVMConstVector(shuffles, 4), "");
617 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
618 LLVMConstVector(shuffles, 4), "");
619
620 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
621 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
622
623 shuffles[0] = lp_build_const_int32(gallivm, 0);
624 shuffles[1] = lp_build_const_int32(gallivm, 2);
625 shuffles[2] = lp_build_const_int32(gallivm, 4);
626 shuffles[3] = lp_build_const_int32(gallivm, 6);
627 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
628 LLVMConstVector(shuffles, 4), "");
629
630 shuffles[0] = lp_build_const_int32(gallivm, 1);
631 shuffles[1] = lp_build_const_int32(gallivm, 3);
632 shuffles[2] = lp_build_const_int32(gallivm, 5);
633 shuffles[3] = lp_build_const_int32(gallivm, 7);
634 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
635 LLVMConstVector(shuffles, 4), "");
636
637 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
638 }
639
640
641 /*
642 * partially horizontally add 2-4 float vectors with length nx4,
643 * i.e. only four adjacent values in each vector will be added,
644 * assuming values are really grouped in 4 which also determines
645 * output order.
646 *
647 * Return a vector of the same length as the initial vectors,
648 * with the excess elements (if any) being undefined.
649 * The element order is independent of number of input vectors.
650 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
651 * the output order thus will be
652 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
653 */
654 LLVMValueRef
655 lp_build_hadd_partial4(struct lp_build_context *bld,
656 LLVMValueRef vectors[],
657 unsigned num_vecs)
658 {
659 struct gallivm_state *gallivm = bld->gallivm;
660 LLVMBuilderRef builder = gallivm->builder;
661 LLVMValueRef ret_vec;
662 LLVMValueRef tmp[4];
663 const char *intrinsic = NULL;
664
665 assert(num_vecs >= 2 && num_vecs <= 4);
666 assert(bld->type.floating);
667
668 /* only use this with at least 2 vectors, as it is sort of expensive
669 * (depending on cpu) and we always need two horizontal adds anyway,
670 * so a shuffle/add approach might be better.
671 */
672
673 tmp[0] = vectors[0];
674 tmp[1] = vectors[1];
675
676 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
677 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
678
679 if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
680 bld->type.length == 4) {
681 intrinsic = "llvm.x86.sse3.hadd.ps";
682 }
683 else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
684 bld->type.length == 8) {
685 intrinsic = "llvm.x86.avx.hadd.ps.256";
686 }
687 if (intrinsic) {
688 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
689 lp_build_vec_type(gallivm, bld->type),
690 tmp[0], tmp[1]);
691 if (num_vecs > 2) {
692 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
693 lp_build_vec_type(gallivm, bld->type),
694 tmp[2], tmp[3]);
695 }
696 else {
697 tmp[1] = tmp[0];
698 }
699 return lp_build_intrinsic_binary(builder, intrinsic,
700 lp_build_vec_type(gallivm, bld->type),
701 tmp[0], tmp[1]);
702 }
703
704 if (bld->type.length == 4) {
705 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
706 }
707 else {
708 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
709 unsigned j;
710 unsigned num_iter = bld->type.length / 4;
711 struct lp_type parttype = bld->type;
712 parttype.length = 4;
713 for (j = 0; j < num_iter; j++) {
714 LLVMValueRef partsrc[4];
715 unsigned i;
716 for (i = 0; i < 4; i++) {
717 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
718 }
719 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
720 }
721 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
722 }
723 return ret_vec;
724 }
725
726
727 /**
728 * Generate a - b
729 */
730 LLVMValueRef
731 lp_build_sub(struct lp_build_context *bld,
732 LLVMValueRef a,
733 LLVMValueRef b)
734 {
735 LLVMBuilderRef builder = bld->gallivm->builder;
736 const struct lp_type type = bld->type;
737 LLVMValueRef res;
738
739 assert(lp_check_value(type, a));
740 assert(lp_check_value(type, b));
741
742 if (b == bld->zero)
743 return a;
744 if (a == bld->undef || b == bld->undef)
745 return bld->undef;
746 if (a == b)
747 return bld->zero;
748
749 if (type.norm) {
750 const char *intrinsic = NULL;
751
752 if (!type.sign && b == bld->one)
753 return bld->zero;
754
755 if (!type.floating && !type.fixed) {
756 if (LLVM_VERSION_MAJOR >= 8) {
757 char intrin[32];
758 intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
759 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
760 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
761 }
762 if (type.width * type.length == 128) {
763 if (util_get_cpu_caps()->has_sse2) {
764 if (type.width == 8)
765 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
766 if (type.width == 16)
767 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
768 } else if (util_get_cpu_caps()->has_altivec) {
769 if (type.width == 8)
770 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
771 if (type.width == 16)
772 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
773 }
774 }
775 if (type.width * type.length == 256) {
776 if (util_get_cpu_caps()->has_avx2) {
777 if (type.width == 8)
778 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
779 if (type.width == 16)
780 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
781 }
782 }
783 }
784
785 if (intrinsic)
786 return lp_build_intrinsic_binary(builder, intrinsic,
787 lp_build_vec_type(bld->gallivm, bld->type), a, b);
788 }
789
790 if (type.norm && !type.floating && !type.fixed) {
791 if (type.sign) {
792 uint64_t sign = (uint64_t)1 << (type.width - 1);
793 LLVMValueRef max_val =
794 lp_build_const_int_vec(bld->gallivm, type, sign - 1);
795 LLVMValueRef min_val =
796 lp_build_const_int_vec(bld->gallivm, type, sign);
797 /* a_clamp_max is the maximum a for negative b,
798 a_clamp_min is the minimum a for positive b. */
799 LLVMValueRef a_clamp_max =
800 lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""),
801 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
802 LLVMValueRef a_clamp_min =
803 lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""),
804 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
805 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b,
806 bld->zero),
807 a_clamp_min, a_clamp_max);
808 } else {
809 /*
810 * This must match llvm pattern for saturated unsigned sub.
811 * (lp_build_max_simple actually does the job with its current
812 * definition but do it explicitly here.)
813 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
814 * interfere with llvm's ability to recognize the pattern but seems
815 * a bit brittle.
816 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
817 */
818 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
819 a = lp_build_select(bld, no_ov, a, b);
820 }
821 }
822
823 if (type.floating)
824 res = LLVMBuildFSub(builder, a, b, "");
825 else
826 res = LLVMBuildSub(builder, a, b, "");
827
828 if (bld->type.norm && (bld->type.floating || bld->type.fixed))
829 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
830
831 return res;
832 }
833
834
835 /**
836 * Normalized multiplication.
837 *
838 * There are several approaches for (using 8-bit normalized multiplication as
839 * an example):
840 *
841 * - alpha plus one
842 *
843 * makes the following approximation to the division (Sree)
844 *
845 * a*b/255 ~= (a*(b + 1)) >> 8
846 *
847 * which is the fastest method that satisfies the following OpenGL
848 * criteria of
849 *
850 * 0*0 = 0 and 255*255 = 255
851 *
852 * - geometric series
853 *
854 * takes the geometric series approximation to the division
855 *
856 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
857 *
858 * in this case just the first two terms to fit in 16bit arithmetic
859 *
860 * t/255 ~= (t + (t >> 8)) >> 8
861 *
862 * note that just by itself it doesn't satisfy the OpenGL criteria,
863 * as 255*255 = 254, so the special case b = 255 must be accounted for, or
864 * roundoff must be used.
865 *
866 * - geometric series plus rounding
867 *
868 * when using a geometric series division instead of truncating the result
869 * use roundoff in the approximation (Jim Blinn)
870 *
871 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
872 *
873 * achieving exact results.
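*
*      e.g. a = b = 255:  t = a*b = 65025,
*           (65025 + (65025 >> 8) + 0x80) >> 8 = 65407 >> 8 = 255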
874 *
875 *
876 *
877 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
878 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
879 * @sa Michael Herf, The "double blend trick", May 2000,
880 * http://www.stereopsis.com/doubleblend.html
881 */
882 LLVMValueRef
883 lp_build_mul_norm(struct gallivm_state *gallivm,
884 struct lp_type wide_type,
885 LLVMValueRef a, LLVMValueRef b)
886 {
887 LLVMBuilderRef builder = gallivm->builder;
888 struct lp_build_context bld;
889 unsigned n;
890 LLVMValueRef half;
891 LLVMValueRef ab;
892
893 assert(!wide_type.floating);
894 assert(lp_check_value(wide_type, a));
895 assert(lp_check_value(wide_type, b));
896
897 lp_build_context_init(&bld, gallivm, wide_type);
898
899 n = wide_type.width / 2;
900 if (wide_type.sign) {
901 --n;
902 }
903
904 /*
905 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
906 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
907 */
908
909 /*
910 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
911 */
912
913 ab = LLVMBuildMul(builder, a, b, "");
914 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
915
916 /*
917 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
918 */
919
920 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
921 if (wide_type.sign) {
922 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
923 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
924 half = lp_build_select(&bld, sign, minus_half, half);
925 }
926 ab = LLVMBuildAdd(builder, ab, half, "");
927
928 /* Final division */
929 ab = lp_build_shr_imm(&bld, ab, n);
930
931 return ab;
932 }
933
934
935 /**
936 * Generate a * b
937 */
938 LLVMValueRef
939 lp_build_mul(struct lp_build_context *bld,
940 LLVMValueRef a,
941 LLVMValueRef b)
942 {
943 LLVMBuilderRef builder = bld->gallivm->builder;
944 const struct lp_type type = bld->type;
945
946 assert(lp_check_value(type, a));
947 assert(lp_check_value(type, b));
948
949 if (!type.floating || !type.nan_preserve) {
950 if (a == bld->zero)
951 return bld->zero;
952 if (b == bld->zero)
953 return bld->zero;
954 }
955
956 if (a == bld->one)
957 return b;
958 if (b == bld->one)
959 return a;
960 if (a == bld->undef || b == bld->undef)
961 return bld->undef;
962
963 if (!type.floating && !type.fixed && type.norm) {
964 struct lp_type wide_type = lp_wider_type(type);
965 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
966
967 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
968 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
969
970 /* PMULLW, PSRLW, PADDW */
971 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
972 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
973
974 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
975
976 return ab;
977 }
978
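/*
 * For fixed point the product carries twice the fractional bits, so shift
 * the result back down; e.g. assuming a 32-bit type with width/2 = 16
 * fractional bits, (a * b) >> 16 restores the scale.
 */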
979 LLVMValueRef shift = type.fixed
980 ? lp_build_const_int_vec(bld->gallivm, type, type.width/2) : NULL;
981
982 LLVMValueRef res;
983 if (type.floating)
984 res = LLVMBuildFMul(builder, a, b, "");
985 else
986 res = LLVMBuildMul(builder, a, b, "");
987 if (shift) {
988 if (type.sign)
989 res = LLVMBuildAShr(builder, res, shift, "");
990 else
991 res = LLVMBuildLShr(builder, res, shift, "");
992 }
993
994 return res;
995 }
996
997
998 /*
999 * Widening mul, valid for 32x32 bit -> 64bit only.
1000 * Result is low 32bits, high bits returned in res_hi.
1001 *
1002 * Emits code that is meant to be compiled for the host CPU.
1003 */
1004 LLVMValueRef
1005 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1006 LLVMValueRef a,
1007 LLVMValueRef b,
1008 LLVMValueRef *res_hi)
1009 {
1010 struct gallivm_state *gallivm = bld->gallivm;
1011 LLVMBuilderRef builder = gallivm->builder;
1012
1013 assert(bld->type.width == 32);
1014 assert(bld->type.floating == 0);
1015 assert(bld->type.fixed == 0);
1016 assert(bld->type.norm == 0);
1017
1018 /*
1019 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1020 * for x86 simd is atrocious (even if the high bits weren't required),
1021 * trying to handle real 64bit inputs (which of course can't happen due
1022 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1023 * apparently llvm does not recognize this widening mul). This includes 6
1024 * (instead of 2) pmuludq plus extra adds and shifts
1025 * The same story applies to signed mul, albeit fixing this requires sse41.
1026 * https://llvm.org/bugs/show_bug.cgi?id=30845
1027 * So, whip up our own code, albeit only for length 4 and 8 (which
1028 * should be good enough)...
1029 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1030 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1031 * for signed), which the fallback code does not, without this llvm
1032 * will likely still produce atrocious code.
1033 */
1034 if (LLVM_VERSION_MAJOR < 7 &&
1035 (bld->type.length == 4 || bld->type.length == 8) &&
1036 ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
1037 util_get_cpu_caps()->has_sse4_1)) {
1038 const char *intrinsic = NULL;
1039 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1040 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1041 struct lp_type type_wide = lp_wider_type(bld->type);
1042 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1043 unsigned i;
1044 for (i = 0; i < bld->type.length; i += 2) {
1045 shuf[i] = lp_build_const_int32(gallivm, i+1);
1046 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1047 }
1048 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1049 aeven = a;
1050 beven = b;
1051 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1052 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1053
1054 if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
1055 if (bld->type.sign) {
1056 intrinsic = "llvm.x86.avx2.pmul.dq";
1057 } else {
1058 intrinsic = "llvm.x86.avx2.pmulu.dq";
1059 }
1060 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1061 wider_type, aeven, beven);
1062 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1063 wider_type, aodd, bodd);
1064 }
1065 else {
1066 /* for consistent naming look elsewhere... */
1067 if (bld->type.sign) {
1068 intrinsic = "llvm.x86.sse41.pmuldq";
1069 } else {
1070 intrinsic = "llvm.x86.sse2.pmulu.dq";
1071 }
1072 /*
1073 * XXX If we only have AVX but not AVX2 this is a pain.
1074 * lp_build_intrinsic_binary_anylength() can't handle it
1075 * (due to src and dst type not being identical).
1076 */
1077 if (bld->type.length == 8) {
1078 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1079 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1080 LLVMValueRef muleven2[2], mulodd2[2];
1081 struct lp_type type_wide_half = type_wide;
1082 LLVMTypeRef wtype_half;
1083 type_wide_half.length = 2;
1084 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1085 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1086 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1087 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1088 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1089 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1090 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1091 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1092 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1093 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1094 wtype_half, aevenlo, bevenlo);
1095 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1096 wtype_half, aoddlo, boddlo);
1097 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1098 wtype_half, aevenhi, bevenhi);
1099 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1100 wtype_half, aoddhi, boddhi);
1101 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1102 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1103
1104 }
1105 else {
1106 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1107 wider_type, aeven, beven);
1108 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1109 wider_type, aodd, bodd);
1110 }
1111 }
1112 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1113 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1114
1115 for (i = 0; i < bld->type.length; i += 2) {
1116 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1117 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1118 }
1119 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1120 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1121
1122 for (i = 0; i < bld->type.length; i += 2) {
1123 shuf[i] = lp_build_const_int32(gallivm, i);
1124 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1125 }
1126 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1127 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1128 }
1129 else {
1130 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1131 }
1132 }
1133
1134
1135 /*
1136 * Widening mul, valid for <= 32 (8, 16, 32) -> 64
1137 * Result is low N bits, high bits returned in res_hi.
1138 *
1139 * Emits generic code.
1140 */
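/*
 * Scalar sketch for unsigned 32-bit inputs:
 *
 *    uint64_t t = (uint64_t)a * b;
 *    lo = (uint32_t)t;
 *    hi = (uint32_t)(t >> 32);
 *
 * the code below does the same with (s|z)ext / mul / lshr / trunc on
 * vectors.
 */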
1141 LLVMValueRef
1142 lp_build_mul_32_lohi(struct lp_build_context *bld,
1143 LLVMValueRef a,
1144 LLVMValueRef b,
1145 LLVMValueRef *res_hi)
1146 {
1147 struct gallivm_state *gallivm = bld->gallivm;
1148 LLVMBuilderRef builder = gallivm->builder;
1149 LLVMValueRef tmp, shift, res_lo;
1150 struct lp_type type_tmp;
1151 LLVMTypeRef wide_type, narrow_type;
1152
1153 type_tmp = bld->type;
1154 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1155 if (bld->type.width < 32)
1156 type_tmp.width = 32;
1157 else
1158 type_tmp.width *= 2;
1159 wide_type = lp_build_vec_type(gallivm, type_tmp);
1160 shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);
1161
1162 if (bld->type.sign) {
1163 a = LLVMBuildSExt(builder, a, wide_type, "");
1164 b = LLVMBuildSExt(builder, b, wide_type, "");
1165 } else {
1166 a = LLVMBuildZExt(builder, a, wide_type, "");
1167 b = LLVMBuildZExt(builder, b, wide_type, "");
1168 }
1169 tmp = LLVMBuildMul(builder, a, b, "");
1170
1171 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1172
1173 /* Since we truncate anyway, LShr and AShr are equivalent. */
1174 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1175 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1176
1177 return res_lo;
1178 }
1179
1180
1181 /* a * b + c */
1182 LLVMValueRef
1183 lp_build_mad(struct lp_build_context *bld,
1184 LLVMValueRef a,
1185 LLVMValueRef b,
1186 LLVMValueRef c)
1187 {
1188 const struct lp_type type = bld->type;
1189 if (type.floating) {
1190 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1191 } else {
1192 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1193 }
1194 }
1195
1196
1197 /**
1198 * Small vector x scale multiplication optimization.
1199 */
1200 LLVMValueRef
1201 lp_build_mul_imm(struct lp_build_context *bld,
1202 LLVMValueRef a,
1203 int b)
1204 {
1205 LLVMBuilderRef builder = bld->gallivm->builder;
1206 LLVMValueRef factor;
1207
1208 assert(lp_check_value(bld->type, a));
1209
1210 if (b == 0)
1211 return bld->zero;
1212
1213 if (b == 1)
1214 return a;
1215
1216 if (b == -1)
1217 return lp_build_negate(bld, a);
1218
1219 if (b == 2 && bld->type.floating)
1220 return lp_build_add(bld, a, a);
1221
1222 if (util_is_power_of_two_or_zero(b)) {
1223 unsigned shift = ffs(b) - 1;
1224
1225 if (bld->type.floating) {
1226 #if 0
1227 /*
1228 * Power of two multiplication by directly manipulating the exponent.
1229 *
1230 * XXX: This might not be always faster, it will introduce a small
1231 * error for multiplication by zero, and it will produce wrong results
1232 * for Inf and NaN.
1233 */
1234 unsigned mantissa = lp_mantissa(bld->type);
1235 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1236 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1237 a = LLVMBuildAdd(builder, a, factor, "");
1238 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1239 return a;
1240 #endif
1241 }
1242 else {
1243 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1244 return LLVMBuildShl(builder, a, factor, "");
1245 }
1246 }
1247
1248 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1249 return lp_build_mul(bld, a, factor);
1250 }
1251
1252
1253 /**
1254 * Generate a / b
1255 */
1256 LLVMValueRef
1257 lp_build_div(struct lp_build_context *bld,
1258 LLVMValueRef a,
1259 LLVMValueRef b)
1260 {
1261 LLVMBuilderRef builder = bld->gallivm->builder;
1262 const struct lp_type type = bld->type;
1263
1264 assert(lp_check_value(type, a));
1265 assert(lp_check_value(type, b));
1266
1267 if (a == bld->zero)
1268 return bld->zero;
1269 if (a == bld->one && type.floating)
1270 return lp_build_rcp(bld, b);
1271 if (b == bld->zero)
1272 return bld->undef;
1273 if (b == bld->one)
1274 return a;
1275 if (a == bld->undef || b == bld->undef)
1276 return bld->undef;
1277
1278 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1279 if (false &&
1280 ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
1281 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
1282 type.floating)
1283 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1284
1285 if (type.floating)
1286 return LLVMBuildFDiv(builder, a, b, "");
1287 else if (type.sign)
1288 return LLVMBuildSDiv(builder, a, b, "");
1289 else
1290 return LLVMBuildUDiv(builder, a, b, "");
1291 }
1292
1293
1294 /**
1295 * Linear interpolation helper.
1296 *
1297 * @param flags LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1298 * interpolating normalized values, encoded in normalized integers, twice as wide.
1299 *
1300 * @sa http://www.stereopsis.com/doubleblend.html
1301 */
1302 static inline LLVMValueRef
1303 lp_build_lerp_simple(struct lp_build_context *bld,
1304 LLVMValueRef x,
1305 LLVMValueRef v0,
1306 LLVMValueRef v1,
1307 unsigned flags)
1308 {
1309 unsigned half_width = bld->type.width/2;
1310 LLVMBuilderRef builder = bld->gallivm->builder;
1311 LLVMValueRef delta;
1312 LLVMValueRef res;
1313
1314 assert(lp_check_value(bld->type, x));
1315 assert(lp_check_value(bld->type, v0));
1316 assert(lp_check_value(bld->type, v1));
1317
1318 delta = lp_build_sub(bld, v1, v0);
1319
1320 if (bld->type.floating) {
1321 assert(flags == 0);
1322 return lp_build_mad(bld, x, delta, v0);
1323 }
1324
1325 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1326 if (!bld->type.sign) {
1327 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1328 /*
1329 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1330 * most significant bit to the least significant bit, so that
1331 * later we can just divide by 2**n instead of 2**n - 1.
1332 */
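/*
 * E.g. with n = 8: x = 255 becomes 255 + (255 >> 7) = 256, so a full
 * weight is exactly 2**8 and (x * delta) >> 8 returns delta unchanged,
 * while x = 0 stays 0.
 */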
1333
1334 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1335 }
1336
1337 /* (x * delta) >> n */
1338 /*
1339 * For this multiply, higher internal precision is required to pass
1340 * CTS, the most efficient path to that is pmulhrsw on ssse3 and
1341 * above. This could be opencoded on other arches if conformance was
1342 * required.
1343 */
1344 if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) {
1345 res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1346 res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1347 } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) {
1348 res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1349 res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1350 } else {
1351 res = lp_build_mul(bld, x, delta);
1352 res = lp_build_shr_imm(bld, res, half_width);
1353 }
1354 } else {
1355 /*
1356 * The rescaling trick above doesn't work for signed numbers, so
1357 * use the 2**n - 1 division approximation in lp_build_mul_norm
1358 * instead.
1359 */
1360 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1361 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1362 }
1363 } else {
1364 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1365 res = lp_build_mul(bld, x, delta);
1366 }
1367
1368 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1369 /*
1370 * At this point both res and v0 only use the lower half of the bits,
1371 * the rest is zero. Instead of add / mask, do add with half wide type.
1372 */
1373 struct lp_type narrow_type;
1374 struct lp_build_context narrow_bld;
1375
1376 memset(&narrow_type, 0, sizeof narrow_type);
1377 narrow_type.sign = bld->type.sign;
1378 narrow_type.width = bld->type.width/2;
1379 narrow_type.length = bld->type.length*2;
1380
1381 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1382 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1383 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1384 res = lp_build_add(&narrow_bld, v0, res);
1385 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1386 } else {
1387 res = lp_build_add(bld, v0, res);
1388
1389 if (bld->type.fixed) {
1390 /*
1391 * We need to mask out the high order bits when lerping 8-bit
1392 * normalized colors stored in 16 bits.
1393 */
1394 /* XXX: This step is necessary for lerping 8-bit colors stored in
1395 * 16 bits, but it will be wrong for true fixed point use cases.
1396 * Basically we need a more powerful lp_type, capable of further
1397 * distinguishing the values interpretation from the value storage.
1398 */
1399 LLVMValueRef low_bits;
1400 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1401 res = LLVMBuildAnd(builder, res, low_bits, "");
1402 }
1403 }
1404
1405 return res;
1406 }
1407
1408
1409 /**
1410 * Linear interpolation.
1411 */
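/*
 * Usage sketch (weight, v0, v1 are placeholder values in the same
 * lp_build_context):
 *
 *    LLVMValueRef res = lp_build_lerp(&bld, weight, v0, v1, 0);
 *
 * which for floating point types computes v0 + weight * (v1 - v0);
 * normalized integer types take the widened path below.
 */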
1412 LLVMValueRef
1413 lp_build_lerp(struct lp_build_context *bld,
1414 LLVMValueRef x,
1415 LLVMValueRef v0,
1416 LLVMValueRef v1,
1417 unsigned flags)
1418 {
1419 const struct lp_type type = bld->type;
1420 LLVMValueRef res;
1421
1422 assert(lp_check_value(type, x));
1423 assert(lp_check_value(type, v0));
1424 assert(lp_check_value(type, v1));
1425
1426 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1427
1428 if (type.norm) {
1429 struct lp_type wide_type;
1430 struct lp_build_context wide_bld;
1431 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1432
1433 assert(type.length >= 2);
1434
1435 /*
1436 * Create a wider integer type, enough to hold the
1437 * intermediate result of the multiplication.
1438 */
1439 memset(&wide_type, 0, sizeof wide_type);
1440 wide_type.sign = type.sign;
1441 wide_type.width = type.width*2;
1442 wide_type.length = type.length/2;
1443
1444 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1445
1446 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1447 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1448 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1449
1450 /*
1451 * Lerp both halves.
1452 */
1453
1454 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1455
1456 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1457 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1458
1459 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1460 } else {
1461 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1462 }
1463
1464 return res;
1465 }
1466
1467
1468 /**
1469 * Bilinear interpolation.
1470 *
1471 * Value indices are in v_{yx}, i.e. the first index is y, the second x.
1472 */
1473 LLVMValueRef
1474 lp_build_lerp_2d(struct lp_build_context *bld,
1475 LLVMValueRef x,
1476 LLVMValueRef y,
1477 LLVMValueRef v00,
1478 LLVMValueRef v01,
1479 LLVMValueRef v10,
1480 LLVMValueRef v11,
1481 unsigned flags)
1482 {
1483 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1484 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1485 return lp_build_lerp(bld, y, v0, v1, flags);
1486 }
1487
1488
1489 LLVMValueRef
1490 lp_build_lerp_3d(struct lp_build_context *bld,
1491 LLVMValueRef x,
1492 LLVMValueRef y,
1493 LLVMValueRef z,
1494 LLVMValueRef v000,
1495 LLVMValueRef v001,
1496 LLVMValueRef v010,
1497 LLVMValueRef v011,
1498 LLVMValueRef v100,
1499 LLVMValueRef v101,
1500 LLVMValueRef v110,
1501 LLVMValueRef v111,
1502 unsigned flags)
1503 {
1504 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1505 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1506 return lp_build_lerp(bld, z, v0, v1, flags);
1507 }
1508
1509
1510 /**
1511 * Generate min(a, b)
1512 * Do checks for special cases, but NaN behavior is undefined.
1513 */
1514 LLVMValueRef
1515 lp_build_min(struct lp_build_context *bld,
1516 LLVMValueRef a,
1517 LLVMValueRef b)
1518 {
1519 assert(lp_check_value(bld->type, a));
1520 assert(lp_check_value(bld->type, b));
1521
1522 if (a == bld->undef || b == bld->undef)
1523 return bld->undef;
1524
1525 if (a == b)
1526 return a;
1527
1528 if (bld->type.norm) {
1529 if (!bld->type.sign) {
1530 if (a == bld->zero || b == bld->zero) {
1531 return bld->zero;
1532 }
1533 }
1534 if (a == bld->one)
1535 return b;
1536 if (b == bld->one)
1537 return a;
1538 }
1539
1540 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1541 }
1542
1543
1544 /**
1545 * Generate min(a, b)
1546 * NaNs are handled according to the behavior specified by the
1547 * nan_behavior argument.
1548 */
1549 LLVMValueRef
1550 lp_build_min_ext(struct lp_build_context *bld,
1551 LLVMValueRef a,
1552 LLVMValueRef b,
1553 enum gallivm_nan_behavior nan_behavior)
1554 {
1555 assert(lp_check_value(bld->type, a));
1556 assert(lp_check_value(bld->type, b));
1557
1558 if (a == bld->undef || b == bld->undef)
1559 return bld->undef;
1560
1561 if (a == b)
1562 return a;
1563
1564 if (bld->type.norm) {
1565 if (!bld->type.sign) {
1566 if (a == bld->zero || b == bld->zero) {
1567 return bld->zero;
1568 }
1569 }
1570 if (a == bld->one)
1571 return b;
1572 if (b == bld->one)
1573 return a;
1574 }
1575
1576 return lp_build_min_simple(bld, a, b, nan_behavior);
1577 }
1578
1579
1580 /**
1581 * Generate max(a, b)
1582 * Do checks for special cases, but NaN behavior is undefined.
1583 */
1584 LLVMValueRef
1585 lp_build_max(struct lp_build_context *bld,
1586 LLVMValueRef a,
1587 LLVMValueRef b)
1588 {
1589 assert(lp_check_value(bld->type, a));
1590 assert(lp_check_value(bld->type, b));
1591
1592 if (a == bld->undef || b == bld->undef)
1593 return bld->undef;
1594
1595 if (a == b)
1596 return a;
1597
1598 if (bld->type.norm) {
1599 if (a == bld->one || b == bld->one)
1600 return bld->one;
1601 if (!bld->type.sign) {
1602 if (a == bld->zero) {
1603 return b;
1604 }
1605 if (b == bld->zero) {
1606 return a;
1607 }
1608 }
1609 }
1610
1611 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1612 }
1613
1614
1615 /**
1616 * Generate max(a, b)
1617 * Checks for special cases.
1618 * NaNs are handled according to the behavior specified by the
1619 * nan_behavior argument.
1620 */
1621 LLVMValueRef
1622 lp_build_max_ext(struct lp_build_context *bld,
1623 LLVMValueRef a,
1624 LLVMValueRef b,
1625 enum gallivm_nan_behavior nan_behavior)
1626 {
1627 assert(lp_check_value(bld->type, a));
1628 assert(lp_check_value(bld->type, b));
1629
1630 if (a == bld->undef || b == bld->undef)
1631 return bld->undef;
1632
1633 if (a == b)
1634 return a;
1635
1636 if (bld->type.norm) {
1637 if (a == bld->one || b == bld->one)
1638 return bld->one;
1639 if (!bld->type.sign) {
1640 if (a == bld->zero) {
1641 return b;
1642 }
1643 if (b == bld->zero) {
1644 return a;
1645 }
1646 }
1647 }
1648
1649 return lp_build_max_simple(bld, a, b, nan_behavior);
1650 }
1651
1652
1653 /**
1654 * Generate clamp(a, min, max)
1655 * NaN behavior (for any of a, min, max) is undefined.
1656 * Do checks for special cases.
1657 */
1658 LLVMValueRef
1659 lp_build_clamp(struct lp_build_context *bld,
1660 LLVMValueRef a,
1661 LLVMValueRef min,
1662 LLVMValueRef max)
1663 {
1664 assert(lp_check_value(bld->type, a));
1665 assert(lp_check_value(bld->type, min));
1666 assert(lp_check_value(bld->type, max));
1667
1668 a = lp_build_min(bld, a, max);
1669 a = lp_build_max(bld, a, min);
1670 return a;
1671 }
1672
1673
1674 /**
1675 * Generate clamp(a, 0, 1)
1676 * A NaN will get converted to zero.
1677 */
1678 LLVMValueRef
1679 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1680 LLVMValueRef a)
1681 {
1682 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1683 a = lp_build_min(bld, a, bld->one);
1684 return a;
1685 }
1686
1687
1688 /**
1689 * Generate abs(a)
1690 */
1691 LLVMValueRef
1692 lp_build_abs(struct lp_build_context *bld,
1693 LLVMValueRef a)
1694 {
1695 LLVMBuilderRef builder = bld->gallivm->builder;
1696 const struct lp_type type = bld->type;
1697 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1698
1699 assert(lp_check_value(type, a));
1700
1701 if (!type.sign)
1702 return a;
1703
1704 if (type.floating) {
1705 char intrinsic[32];
1706 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1707 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1708 }
1709
1710 if (type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1711 switch(type.width) {
1712 case 8:
1713 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1714 case 16:
1715 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1716 case 32:
1717 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1718 }
1719 }
1720 else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
1721 switch(type.width) {
1722 case 8:
1723 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1724 case 16:
1725 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1726 case 32:
1727 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1728 }
1729 }
1730
1731 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1732 a, LLVMBuildNeg(builder, a, ""));
1733 }
1734
1735
1736 LLVMValueRef
1737 lp_build_negate(struct lp_build_context *bld,
1738 LLVMValueRef a)
1739 {
1740 LLVMBuilderRef builder = bld->gallivm->builder;
1741
1742 assert(lp_check_value(bld->type, a));
1743
1744 if (bld->type.floating)
1745 a = LLVMBuildFNeg(builder, a, "");
1746 else
1747 a = LLVMBuildNeg(builder, a, "");
1748
1749 return a;
1750 }
1751
1752
1753 /** Return -1, 0 or +1 depending on the sign of a */
1754 LLVMValueRef
1755 lp_build_sgn(struct lp_build_context *bld,
1756 LLVMValueRef a)
1757 {
1758 LLVMBuilderRef builder = bld->gallivm->builder;
1759 const struct lp_type type = bld->type;
1760 LLVMValueRef cond;
1761 LLVMValueRef res;
1762
1763 assert(lp_check_value(type, a));
1764
1765 /* Handle non-zero case */
1766 if (!type.sign) {
1767 /* if not zero then sign must be positive */
1768 res = bld->one;
1769 }
1770 else if (type.floating) {
1771 LLVMTypeRef vec_type;
1772 LLVMTypeRef int_type;
1773 LLVMValueRef mask;
1774 LLVMValueRef sign;
1775 LLVMValueRef one;
1776 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1777
1778 int_type = lp_build_int_vec_type(bld->gallivm, type);
1779 vec_type = lp_build_vec_type(bld->gallivm, type);
1780 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1781
1782       /* Take the sign bit and OR it into the constant 1.0 */
1783 sign = LLVMBuildBitCast(builder, a, int_type, "");
1784 sign = LLVMBuildAnd(builder, sign, mask, "");
1785 one = LLVMConstBitCast(bld->one, int_type);
1786 res = LLVMBuildOr(builder, sign, one, "");
1787 res = LLVMBuildBitCast(builder, res, vec_type, "");
1788 }
1789 else
1790 {
1791 /* signed int/norm/fixed point */
1792 /* could use psign with sse3 and appropriate vectors here */
1793 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1794 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1795 res = lp_build_select(bld, cond, bld->one, minus_one);
1796 }
1797
1798 /* Handle zero */
1799 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1800 res = lp_build_select(bld, cond, bld->zero, res);
1801
1802 return res;
1803 }
1804
1805
1806 /**
1807 * Set the sign of float vector 'a' according to 'sign'.
1808 * If sign==0, return abs(a).
1809  * If sign==1, return -abs(a).
1810 * Other values for sign produce undefined results.
1811 */
1812 LLVMValueRef
1813 lp_build_set_sign(struct lp_build_context *bld,
1814 LLVMValueRef a, LLVMValueRef sign)
1815 {
1816 LLVMBuilderRef builder = bld->gallivm->builder;
1817 const struct lp_type type = bld->type;
1818 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1819 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1820 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1821 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1822 ~((unsigned long long) 1 << (type.width - 1)));
1823 LLVMValueRef val, res;
1824
1825 assert(type.floating);
1826 assert(lp_check_value(type, a));
1827
1828 /* val = reinterpret_cast<int>(a) */
1829 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1830 /* val = val & mask */
1831 val = LLVMBuildAnd(builder, val, mask, "");
1832 /* sign = sign << shift */
1833 sign = LLVMBuildShl(builder, sign, shift, "");
1834 /* res = val | sign */
1835 res = LLVMBuildOr(builder, val, sign, "");
1836 /* res = reinterpret_cast<float>(res) */
1837 res = LLVMBuildBitCast(builder, res, vec_type, "");
1838
1839 return res;
1840 }
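
/*
 * Illustrative scalar sketch of the bit manipulation above (for exposition
 * only, not part of the build), assuming 32-bit floats:
 *
 *    u = bitcast<uint32>(a) & 0x7fffffff;   // clear sign bit -> abs(a)
 *    u |= (uint32)sign << 31;               // inject requested sign
 *    result = bitcast<float>(u);
 *
 * so sign==0 yields abs(a) and sign==1 yields -abs(a).
 */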
1841
1842
1843 /**
1844 * Convert vector of (or scalar) int to vector of (or scalar) float.
1845 */
1846 LLVMValueRef
1847 lp_build_int_to_float(struct lp_build_context *bld,
1848 LLVMValueRef a)
1849 {
1850 LLVMBuilderRef builder = bld->gallivm->builder;
1851 const struct lp_type type = bld->type;
1852 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1853
1854 assert(type.floating);
1855
1856 return LLVMBuildSIToFP(builder, a, vec_type, "");
1857 }
1858
1859
1860 static bool
1861 arch_rounding_available(const struct lp_type type)
1862 {
1863 if ((util_get_cpu_caps()->has_sse4_1 &&
1864 (type.length == 1 || (LLVM_VERSION_MAJOR >= 8 && type.length == 2) ||
1865 type.width * type.length == 128)) ||
1866 (util_get_cpu_caps()->has_avx && type.width * type.length == 256) ||
1867 (util_get_cpu_caps()->has_avx512f && type.width * type.length == 512))
1868 return true;
1869 else if ((util_get_cpu_caps()->has_altivec &&
1870 (type.width == 32 && type.length == 4)))
1871 return true;
1872 else if (util_get_cpu_caps()->has_neon)
1873 return true;
1874 else if (util_get_cpu_caps()->family == CPU_S390X)
1875 return true;
1876
1877 return false;
1878 }
1879
1880 enum lp_build_round_mode
1881 {
1882 LP_BUILD_ROUND_NEAREST = 0,
1883 LP_BUILD_ROUND_FLOOR = 1,
1884 LP_BUILD_ROUND_CEIL = 2,
1885 LP_BUILD_ROUND_TRUNCATE = 3
1886 };
1887
1888
1889 static inline LLVMValueRef
1890 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1891 LLVMValueRef a)
1892 {
1893 LLVMBuilderRef builder = bld->gallivm->builder;
1894 const struct lp_type type = bld->type;
1895 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1896 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1897 const char *intrinsic;
1898 LLVMValueRef res;
1899
1900 assert(type.floating);
1901 /* using the double precision conversions is a bit more complicated */
1902 assert(type.width == 32);
1903
1904 assert(lp_check_value(type, a));
1905 assert(util_get_cpu_caps()->has_sse2);
1906
1907 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1908 if (type.length == 1) {
1909 LLVMTypeRef vec_type;
1910 LLVMValueRef undef;
1911 LLVMValueRef arg;
1912 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1913
1914 vec_type = LLVMVectorType(bld->elem_type, 4);
1915
1916 intrinsic = "llvm.x86.sse.cvtss2si";
1917
1918 undef = LLVMGetUndef(vec_type);
1919
1920 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1921
1922 res = lp_build_intrinsic_unary(builder, intrinsic,
1923 ret_type, arg);
1924 }
1925 else {
1926 if (type.width* type.length == 128) {
1927 intrinsic = "llvm.x86.sse2.cvtps2dq";
1928 }
1929 else {
1930 assert(type.width*type.length == 256);
1931 assert(util_get_cpu_caps()->has_avx);
1932
1933 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1934 }
1935 res = lp_build_intrinsic_unary(builder, intrinsic,
1936 ret_type, a);
1937 }
1938
1939 return res;
1940 }
1941
1942
1943 /* Round a float vector using the PowerPC Altivec vrfi* round-to-integral intrinsics. */
1945 static inline LLVMValueRef
1946 lp_build_round_altivec(struct lp_build_context *bld,
1947 LLVMValueRef a,
1948 enum lp_build_round_mode mode)
1949 {
1950 LLVMBuilderRef builder = bld->gallivm->builder;
1951 const struct lp_type type = bld->type;
1952 const char *intrinsic = NULL;
1953
1954 assert(type.floating);
1955
1956 assert(lp_check_value(type, a));
1957 assert(util_get_cpu_caps()->has_altivec);
1958
1959 (void)type;
1960
1961 switch (mode) {
1962 case LP_BUILD_ROUND_NEAREST:
1963 intrinsic = "llvm.ppc.altivec.vrfin";
1964 break;
1965 case LP_BUILD_ROUND_FLOOR:
1966 intrinsic = "llvm.ppc.altivec.vrfim";
1967 break;
1968 case LP_BUILD_ROUND_CEIL:
1969 intrinsic = "llvm.ppc.altivec.vrfip";
1970 break;
1971 case LP_BUILD_ROUND_TRUNCATE:
1972 intrinsic = "llvm.ppc.altivec.vrfiz";
1973 break;
1974 }
1975
1976 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1977 }
1978
1979
1980 static inline LLVMValueRef
1981 lp_build_round_arch(struct lp_build_context *bld,
1982 LLVMValueRef a,
1983 enum lp_build_round_mode mode)
1984 {
1985 if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon ||
1986 util_get_cpu_caps()->family == CPU_S390X) {
1987 LLVMBuilderRef builder = bld->gallivm->builder;
1988 const struct lp_type type = bld->type;
1989 const char *intrinsic_root;
1990 char intrinsic[32];
1991
1992 assert(type.floating);
1993 assert(lp_check_value(type, a));
1994 (void)type;
1995
1996 switch (mode) {
1997 case LP_BUILD_ROUND_NEAREST:
1998 intrinsic_root = "llvm.nearbyint";
1999 break;
2000 case LP_BUILD_ROUND_FLOOR:
2001 intrinsic_root = "llvm.floor";
2002 break;
2003 case LP_BUILD_ROUND_CEIL:
2004 intrinsic_root = "llvm.ceil";
2005 break;
2006 case LP_BUILD_ROUND_TRUNCATE:
2007 intrinsic_root = "llvm.trunc";
2008 break;
2009 default:
2010 unreachable("unhandled lp_build_round_mode");
2011 }
2012
2013 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2014 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2015 }
2016 else /* (util_get_cpu_caps()->has_altivec) */
2017 return lp_build_round_altivec(bld, a, mode);
2018 }
2019
2020
2021 /**
2022 * Return the integer part of a float (vector) value (== round toward zero).
2023 * The returned value is a float (vector).
2024 * Ex: trunc(-1.5) = -1.0
2025 */
2026 LLVMValueRef
2027 lp_build_trunc(struct lp_build_context *bld,
2028 LLVMValueRef a)
2029 {
2030 LLVMBuilderRef builder = bld->gallivm->builder;
2031 const struct lp_type type = bld->type;
2032
2033 assert(type.floating);
2034 assert(lp_check_value(type, a));
2035
2036 if (type.width == 16) {
2037 char intrinsic[64];
2038 lp_format_intrinsic(intrinsic, 64, "llvm.trunc", bld->vec_type);
2039 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2040 }
2041
2042 if (arch_rounding_available(type)) {
2043 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2044 }
2045 else {
2046 const struct lp_type type = bld->type;
2047 struct lp_type inttype;
2048 struct lp_build_context intbld;
2049 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2050 LLVMValueRef trunc, res, anosign, mask;
2051 LLVMTypeRef int_vec_type = bld->int_vec_type;
2052 LLVMTypeRef vec_type = bld->vec_type;
2053
2054 inttype = type;
2055 inttype.floating = 0;
2056 lp_build_context_init(&intbld, bld->gallivm, inttype);
2057
2058 /* round by truncation */
2059 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2060 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2061
2062 if (type.signed_zero_preserve) {
2063 char intrinsic[64];
2064 lp_format_intrinsic(intrinsic, 64, "llvm.copysign", bld->vec_type);
2065 res = lp_build_intrinsic_binary(builder, intrinsic, vec_type, res, a);
2066 }
2067
2068 /* mask out sign bit */
2069 anosign = lp_build_abs(bld, a);
2070 /*
2071        * mask out all values if anosign > 2^24
2072        * This should work both for large ints (all rounding is a no-op for them
2073        * because such floats are always exact) as well as special cases like
2074        * NaNs and Infs (taking advantage of the fact they use the max exponent).
2075        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2076 */
2077 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2078 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2079 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2080 return lp_build_select(bld, mask, a, res);
2081 }
2082 }
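
/*
 * Minimal scalar sketch of the fallback path above (illustration only, not
 * part of the build), for 32-bit floats:
 *
 *    r = (float)(int)a;                 // round toward zero
 *    result = |a| > 2^24 ? a : r;
 *
 * where the comparison is done on the float bit patterns as integers, so
 * NaNs and Infs (which use the maximum exponent) also take the pass-through
 * path instead of going through the fptosi/sitofp round trip.
 */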
2083
2084
2085 /**
2086 * Return float (vector) rounded to nearest integer (vector). The returned
2087 * value is a float (vector).
2088 * Ex: round(0.9) = 1.0
2089 * Ex: round(-1.5) = -2.0
2090 */
2091 LLVMValueRef
2092 lp_build_round(struct lp_build_context *bld,
2093 LLVMValueRef a)
2094 {
2095 LLVMBuilderRef builder = bld->gallivm->builder;
2096 const struct lp_type type = bld->type;
2097
2098 assert(type.floating);
2099 assert(lp_check_value(type, a));
2100
2101 if (type.width == 16) {
2102 char intrinsic[64];
2103 lp_format_intrinsic(intrinsic, 64, "llvm.round", bld->vec_type);
2104 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2105 }
2106
2107 if (arch_rounding_available(type)) {
2108 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2109 }
2110 else {
2111 const struct lp_type type = bld->type;
2112 struct lp_type inttype;
2113 struct lp_build_context intbld;
2114 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2115 LLVMValueRef res, anosign, mask;
2116 LLVMTypeRef int_vec_type = bld->int_vec_type;
2117 LLVMTypeRef vec_type = bld->vec_type;
2118
2119 inttype = type;
2120 inttype.floating = 0;
2121 lp_build_context_init(&intbld, bld->gallivm, inttype);
2122
2123 res = lp_build_iround(bld, a);
2124 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2125
2126 if (type.signed_zero_preserve) {
2127 LLVMValueRef sign_mask =
2128 lp_build_const_int_vec(bld->gallivm, type, 1llu << (type.width - 1));
2129 LLVMValueRef a_sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2130 a_sign = LLVMBuildAnd(builder, a_sign, sign_mask, "");
2131
2132 res = LLVMBuildBitCast(builder, res, int_vec_type, "");
2133 res = LLVMBuildOr(builder, res, a_sign, "");
2134 res = LLVMBuildBitCast(builder, res, vec_type, "");
2135 }
2136
2137 /* mask out sign bit */
2138 anosign = lp_build_abs(bld, a);
2139 /*
2140        * mask out all values if anosign > 2^24
2141        * This should work both for large ints (all rounding is a no-op for them
2142        * because such floats are always exact) as well as special cases like
2143        * NaNs and Infs (taking advantage of the fact they use the max exponent).
2144        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2145 */
2146 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2147 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2148 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2149 return lp_build_select(bld, mask, a, res);
2150 }
2151 }
2152
2153
2154 /**
2155 * Return floor of float (vector), result is a float (vector)
2156 * Ex: floor(1.1) = 1.0
2157 * Ex: floor(-1.1) = -2.0
2158 */
2159 LLVMValueRef
2160 lp_build_floor(struct lp_build_context *bld,
2161 LLVMValueRef a)
2162 {
2163 LLVMBuilderRef builder = bld->gallivm->builder;
2164 const struct lp_type type = bld->type;
2165
2166 assert(type.floating);
2167 assert(lp_check_value(type, a));
2168
2169 if (arch_rounding_available(type)) {
2170 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2171 }
2172 else {
2173 const struct lp_type type = bld->type;
2174 struct lp_type inttype;
2175 struct lp_build_context intbld;
2176 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2177 LLVMValueRef trunc, res, anosign, mask;
2178 LLVMTypeRef int_vec_type = bld->int_vec_type;
2179 LLVMTypeRef vec_type = bld->vec_type;
2180
2181 if (type.width != 32) {
2182 char intrinsic[32];
2183 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2184 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2185 }
2186
2187 assert(type.width == 32); /* might want to handle doubles at some point */
2188
2189 inttype = type;
2190 inttype.floating = 0;
2191 lp_build_context_init(&intbld, bld->gallivm, inttype);
2192
2193 /* round by truncation */
2194 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2195 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2196
2197 if (type.sign) {
2198 LLVMValueRef tmp;
2199
2200 /*
2201 * fix values if rounding is wrong (for non-special cases)
2202 * - this is the case if trunc > a
2203 */
2204 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2205 /* tmp = trunc > a ? 1.0 : 0.0 */
2206 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2207 tmp = lp_build_and(&intbld, mask, tmp);
2208 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2209 res = lp_build_sub(bld, res, tmp);
2210 }
2211
2212 /* mask out sign bit */
2213 anosign = lp_build_abs(bld, a);
2214 /*
2215        * mask out all values if anosign > 2^24
2216        * This should work both for large ints (all rounding is a no-op for them
2217        * because such floats are always exact) as well as special cases like
2218        * NaNs and Infs (taking advantage of the fact they use the max exponent).
2219        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2220 */
2221 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2222 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2223 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2224 return lp_build_select(bld, mask, a, res);
2225 }
2226 }
2227
2228
2229 /**
2230 * Return ceiling of float (vector), returning float (vector).
2231 * Ex: ceil( 1.1) = 2.0
2232 * Ex: ceil(-1.1) = -1.0
2233 */
2234 LLVMValueRef
2235 lp_build_ceil(struct lp_build_context *bld,
2236 LLVMValueRef a)
2237 {
2238 LLVMBuilderRef builder = bld->gallivm->builder;
2239 const struct lp_type type = bld->type;
2240
2241 assert(type.floating);
2242 assert(lp_check_value(type, a));
2243
2244 if (arch_rounding_available(type)) {
2245 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2246 }
2247 else {
2248 const struct lp_type type = bld->type;
2249 struct lp_type inttype;
2250 struct lp_build_context intbld;
2251 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2252 LLVMValueRef trunc, res, anosign, mask, tmp;
2253 LLVMTypeRef int_vec_type = bld->int_vec_type;
2254 LLVMTypeRef vec_type = bld->vec_type;
2255
2256 if (type.width != 32) {
2257 char intrinsic[32];
2258 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2259 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2260 }
2261
2262 assert(type.width == 32); /* might want to handle doubles at some point */
2263
2264 inttype = type;
2265 inttype.floating = 0;
2266 lp_build_context_init(&intbld, bld->gallivm, inttype);
2267
2268 /* round by truncation */
2269 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2270 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2271
2272 /*
2273 * fix values if rounding is wrong (for non-special cases)
2274 * - this is the case if trunc < a
2275 */
2276 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2277 /* tmp = trunc < a ? 1.0 : 0.0 */
2278 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2279 tmp = lp_build_and(&intbld, mask, tmp);
2280 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2281 res = lp_build_add(bld, trunc, tmp);
2282
2283 /* mask out sign bit */
2284 anosign = lp_build_abs(bld, a);
2285 /*
2286        * mask out all values if anosign > 2^24
2287        * This should work both for large ints (all rounding is a no-op for them
2288        * because such floats are always exact) as well as special cases like
2289        * NaNs and Infs (taking advantage of the fact they use the max exponent).
2290        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2291 */
2292 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2293 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2294 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2295 return lp_build_select(bld, mask, a, res);
2296 }
2297 }
2298
2299
2300 /**
2301 * Return fractional part of 'a' computed as a - floor(a)
2302 * Typically used in texture coord arithmetic.
2303 */
2304 LLVMValueRef
2305 lp_build_fract(struct lp_build_context *bld,
2306 LLVMValueRef a)
2307 {
2308 assert(bld->type.floating);
2309 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2310 }
2311
2312
2313 /**
2314 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2315 * against 0.99999(9). (Will also return that value for NaNs.)
2316 */
2317 static inline LLVMValueRef
2318 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2319 {
2320 LLVMValueRef max;
2321
2322 /* this is the largest number smaller than 1.0 representable as float */
2323 max = lp_build_const_vec(bld->gallivm, bld->type,
2324 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2325 return lp_build_min_ext(bld, fract, max,
2326 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2327 }
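
/*
 * For 32-bit floats (23-bit mantissa) the constant above evaluates to
 * 1.0 - 2^-24 = 0.99999994..., which is the largest float strictly below
 * 1.0, so the clamped fract() result can never reach 1.0 exactly.
 */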
2328
2329
2330 /**
2331 * Same as lp_build_fract, but guarantees that the result is always smaller
2332 * than one. Will also return the smaller-than-one value for infs, NaNs.
2333 */
2334 LLVMValueRef
2335 lp_build_fract_safe(struct lp_build_context *bld,
2336 LLVMValueRef a)
2337 {
2338 return clamp_fract(bld, lp_build_fract(bld, a));
2339 }
2340
2341
2342 /**
2343 * Return the integer part of a float (vector) value (== round toward zero).
2344 * The returned value is an integer (vector).
2345 * Ex: itrunc(-1.5) = -1
2346 */
2347 LLVMValueRef
2348 lp_build_itrunc(struct lp_build_context *bld,
2349 LLVMValueRef a)
2350 {
2351 LLVMBuilderRef builder = bld->gallivm->builder;
2352 const struct lp_type type = bld->type;
2353 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2354
2355 assert(type.floating);
2356 assert(lp_check_value(type, a));
2357
2358 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2359 }
2360
2361
2362 /**
2363 * Return float (vector) rounded to nearest integer (vector). The returned
2364 * value is an integer (vector).
2365 * Ex: iround(0.9) = 1
2366 * Ex: iround(-1.5) = -2
2367 */
2368 LLVMValueRef
2369 lp_build_iround(struct lp_build_context *bld,
2370 LLVMValueRef a)
2371 {
2372 LLVMBuilderRef builder = bld->gallivm->builder;
2373 const struct lp_type type = bld->type;
2374 LLVMTypeRef int_vec_type = bld->int_vec_type;
2375 LLVMValueRef res;
2376
2377 assert(type.floating);
2378
2379 assert(lp_check_value(type, a));
2380
2381 if ((util_get_cpu_caps()->has_sse2 &&
2382 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2383 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2384 return lp_build_iround_nearest_sse2(bld, a);
2385 }
2386 if (arch_rounding_available(type)) {
2387 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2388 }
2389 else {
2390 LLVMValueRef half;
2391
2392 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2393
2394 if (type.sign) {
2395 LLVMTypeRef vec_type = bld->vec_type;
2396 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2397 (unsigned long long)1 << (type.width - 1));
2398 LLVMValueRef sign;
2399
2400 /* get sign bit */
2401 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2402 sign = LLVMBuildAnd(builder, sign, mask, "");
2403
2404 /* sign * 0.5 */
2405 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2406 half = LLVMBuildOr(builder, sign, half, "");
2407 half = LLVMBuildBitCast(builder, half, vec_type, "");
2408 }
2409
2410 res = LLVMBuildFAdd(builder, a, half, "");
2411 }
2412
2413 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2414
2415 return res;
2416 }
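
/*
 * Scalar sketch of the generic fallback above (illustration only):
 *
 *    h = nextafterf(0.5f, 0.0f);        // 0.5 minus one ulp
 *    h = copysignf(h, a);               // round away from zero on both sides
 *    result = (int)(a + h);             // truncation finishes the rounding
 *
 * Using 0.5 minus one ulp rather than exactly 0.5 avoids pushing values just
 * below 0.5 (e.g. 0.49999997f) up to 1 when a + 0.5 is not exactly
 * representable.
 */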
2417
2418
2419 /**
2420 * Return floor of float (vector), result is an int (vector)
2421  * Ex: ifloor(1.1) = 1
2422  * Ex: ifloor(-1.1) = -2
2423 */
2424 LLVMValueRef
2425 lp_build_ifloor(struct lp_build_context *bld,
2426 LLVMValueRef a)
2427 {
2428 LLVMBuilderRef builder = bld->gallivm->builder;
2429 const struct lp_type type = bld->type;
2430 LLVMTypeRef int_vec_type = bld->int_vec_type;
2431 LLVMValueRef res;
2432
2433 assert(type.floating);
2434 assert(lp_check_value(type, a));
2435
2436 res = a;
2437 if (type.sign) {
2438 if (arch_rounding_available(type)) {
2439 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2440 }
2441 else {
2442 struct lp_type inttype;
2443 struct lp_build_context intbld;
2444 LLVMValueRef trunc, itrunc, mask;
2445
2446 assert(type.floating);
2447 assert(lp_check_value(type, a));
2448
2449 inttype = type;
2450 inttype.floating = 0;
2451 lp_build_context_init(&intbld, bld->gallivm, inttype);
2452
2453 /* round by truncation */
2454 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2455 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2456
2457 /*
2458 * fix values if rounding is wrong (for non-special cases)
2459 * - this is the case if trunc > a
2460           * The results of doing this with NaNs, very large values etc.
2461           * are undefined, but the results for such inputs are undefined anyway.
2462 */
2463 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2464 /* cheapie minus one with mask since the mask is minus one / zero */
2465 return lp_build_add(&intbld, itrunc, mask);
2466 }
2467 }
2468
2469 /* round to nearest (toward zero) */
2470 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2471
2472 return res;
2473 }
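
/*
 * Sketch of the trunc+mask trick above (mirrored with a subtract in
 * lp_build_iceil): fptosi rounds toward zero, so for negative non-integral
 * inputs it is one too large.  The comparison mask is all ones, i.e. -1 as a
 * signed integer, exactly in that case, so
 *
 *    ifloor(a) = itrunc(a) + (trunc(a) > a ? -1 : 0)
 *
 * comes out of a single integer add of the mask.
 */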
2474
2475
2476 /**
2477 * Return ceiling of float (vector), returning int (vector).
2478 * Ex: iceil( 1.1) = 2
2479 * Ex: iceil(-1.1) = -1
2480 */
2481 LLVMValueRef
2482 lp_build_iceil(struct lp_build_context *bld,
2483 LLVMValueRef a)
2484 {
2485 LLVMBuilderRef builder = bld->gallivm->builder;
2486 const struct lp_type type = bld->type;
2487 LLVMTypeRef int_vec_type = bld->int_vec_type;
2488 LLVMValueRef res;
2489
2490 assert(type.floating);
2491 assert(lp_check_value(type, a));
2492
2493 if (arch_rounding_available(type)) {
2494 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2495 }
2496 else {
2497 struct lp_type inttype;
2498 struct lp_build_context intbld;
2499 LLVMValueRef trunc, itrunc, mask;
2500
2501 assert(type.floating);
2502 assert(lp_check_value(type, a));
2503
2504 inttype = type;
2505 inttype.floating = 0;
2506 lp_build_context_init(&intbld, bld->gallivm, inttype);
2507
2508 /* round by truncation */
2509 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2510 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2511
2512 /*
2513 * fix values if rounding is wrong (for non-special cases)
2514 * - this is the case if trunc < a
2515        * The results of doing this with NaNs, very large values etc.
2516        * are undefined, but the results for such inputs are undefined anyway.
2517 */
2518 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2519 /* cheapie plus one with mask since the mask is minus one / zero */
2520 return lp_build_sub(&intbld, itrunc, mask);
2521 }
2522
2523 /* round to nearest (toward zero) */
2524 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2525
2526 return res;
2527 }
2528
2529
2530 /**
2531 * Combined ifloor() & fract().
2532 *
2533  * Preferable to calling the functions separately, as it ensures that the
2534  * strategy (floor() vs. ifloor()) resulting in less redundant work is used.
2535 */
2536 void
2537 lp_build_ifloor_fract(struct lp_build_context *bld,
2538 LLVMValueRef a,
2539 LLVMValueRef *out_ipart,
2540 LLVMValueRef *out_fpart)
2541 {
2542 LLVMBuilderRef builder = bld->gallivm->builder;
2543 const struct lp_type type = bld->type;
2544 LLVMValueRef ipart;
2545
2546 assert(type.floating);
2547 assert(lp_check_value(type, a));
2548
2549 if (arch_rounding_available(type)) {
2550 /*
2551 * floor() is easier.
2552 */
2553
2554 ipart = lp_build_floor(bld, a);
2555 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2556 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2557 }
2558 else {
2559 /*
2560 * ifloor() is easier.
2561 */
2562
2563 *out_ipart = lp_build_ifloor(bld, a);
2564 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2565 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2566 }
2567 }
2568
2569
2570 /**
2571 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2572 * always smaller than one.
2573 */
2574 void
2575 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2576 LLVMValueRef a,
2577 LLVMValueRef *out_ipart,
2578 LLVMValueRef *out_fpart)
2579 {
2580 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2581 *out_fpart = clamp_fract(bld, *out_fpart);
2582 }
2583
2584
2585 LLVMValueRef
2586 lp_build_sqrt(struct lp_build_context *bld,
2587 LLVMValueRef a)
2588 {
2589 LLVMBuilderRef builder = bld->gallivm->builder;
2590 const struct lp_type type = bld->type;
2591 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2592 char intrinsic[32];
2593
2594 assert(lp_check_value(type, a));
2595
2596 assert(type.floating);
2597 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2598
2599 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2600 }
2601
2602
2603 /**
2604  * Do one Newton-Raphson step to improve reciprocal precision:
2605 *
2606 * x_{i+1} = x_i + x_i * (1 - a * x_i)
2607 *
2608 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2609 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2610 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2611 * halo. It would be necessary to clamp the argument to prevent this.
2612 *
2613 * See also:
2614 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2615 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2616 */
2617 static inline LLVMValueRef
2618 lp_build_rcp_refine(struct lp_build_context *bld,
2619 LLVMValueRef a,
2620 LLVMValueRef rcp_a)
2621 {
2622 LLVMBuilderRef builder = bld->gallivm->builder;
2623 LLVMValueRef neg_a;
2624 LLVMValueRef res;
2625
2626 neg_a = LLVMBuildFNeg(builder, a, "");
2627 res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2628 res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2629
2630 return res;
2631 }
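
/*
 * Scalar form of the refinement step above, for reference:
 *
 *    rcp_refine(a, x) = x + x * (1 - a * x)        // x ~= 1/a
 *
 * Each Newton-Raphson step roughly doubles the number of correct bits of the
 * initial RCPPS estimate.
 */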
2632
2633
2634 LLVMValueRef
2635 lp_build_rcp(struct lp_build_context *bld,
2636 LLVMValueRef a)
2637 {
2638 LLVMBuilderRef builder = bld->gallivm->builder;
2639 const struct lp_type type = bld->type;
2640
2641 assert(lp_check_value(type, a));
2642
2643 if (a == bld->zero)
2644 return bld->undef;
2645 if (a == bld->one)
2646 return bld->one;
2647 if (a == bld->undef)
2648 return bld->undef;
2649
2650 assert(type.floating);
2651
2652 if (LLVMIsConstant(a))
2653 return LLVMBuildFDiv(builder, bld->one, a, "");
2654
2655 /*
2656     * We don't use RCPPS because:
2657     * - it only has 10 bits of precision
2658     * - it doesn't even get the reciprocal of 1.0 exactly
2659     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2660     * - for recent processors the benefit over DIVPS is marginal and case
2661     *   dependent
2662     *
2663     * We could still use it on certain processors if benchmarks show that the
2664     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2665     * particular uses that require fewer workarounds.
2666 */
2667
2668 if (false && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2669 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
2670 const unsigned num_iterations = 0;
2671 LLVMValueRef res;
2672 unsigned i;
2673 const char *intrinsic = NULL;
2674
2675 if (type.length == 4) {
2676 intrinsic = "llvm.x86.sse.rcp.ps";
2677 }
2678 else {
2679 intrinsic = "llvm.x86.avx.rcp.ps.256";
2680 }
2681
2682 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2683
2684 for (i = 0; i < num_iterations; ++i) {
2685 res = lp_build_rcp_refine(bld, a, res);
2686 }
2687
2688 return res;
2689 }
2690
2691 return LLVMBuildFDiv(builder, bld->one, a, "");
2692 }
2693
2694
2695 /**
2696 * Do one Newton-Raphson step to improve rsqrt precision:
2697 *
2698 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2699 *
2700 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2701 */
2702 static inline LLVMValueRef
2703 lp_build_rsqrt_refine(struct lp_build_context *bld,
2704 LLVMValueRef a,
2705 LLVMValueRef rsqrt_a)
2706 {
2707 LLVMBuilderRef builder = bld->gallivm->builder;
2708 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2709 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2710 LLVMValueRef res;
2711
2712 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2713 res = LLVMBuildFMul(builder, a, res, "");
2714 res = LLVMBuildFSub(builder, three, res, "");
2715 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2716 res = LLVMBuildFMul(builder, half, res, "");
2717
2718 return res;
2719 }
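
/*
 * Scalar form of the rsqrt refinement step above, for reference:
 *
 *    rsqrt_refine(a, x) = 0.5 * x * (3 - a * x * x)    // x ~= 1/sqrt(a)
 *
 * As with rcp, one Newton-Raphson step roughly doubles the precision of the
 * hardware RSQRTPS estimate.
 */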
2720
2721
2722 /**
2723 * Generate 1/sqrt(a).
2724 * Result is undefined for values < 0, infinity for +0.
2725 */
2726 LLVMValueRef
2727 lp_build_rsqrt(struct lp_build_context *bld,
2728 LLVMValueRef a)
2729 {
2730 const struct lp_type type = bld->type;
2731
2732 assert(lp_check_value(type, a));
2733
2734 assert(type.floating);
2735
2736 /*
2737 * This should be faster but all denormals will end up as infinity.
2738 */
2739 if (0 && lp_build_fast_rsqrt_available(type)) {
2740 const unsigned num_iterations = 1;
2741 LLVMValueRef res;
2742 unsigned i;
2743
2744 /* rsqrt(1.0) != 1.0 here */
2745 res = lp_build_fast_rsqrt(bld, a);
2746
2747 if (num_iterations) {
2748 /*
2749 * Newton-Raphson will result in NaN instead of infinity for zero,
2750 * and NaN instead of zero for infinity.
2751 * Also, need to ensure rsqrt(1.0) == 1.0.
2752 * All numbers smaller than FLT_MIN will result in +infinity
2753 * (rsqrtps treats all denormals as zero).
2754 */
2755 LLVMValueRef cmp;
2756 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2757 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2758
2759 for (i = 0; i < num_iterations; ++i) {
2760 res = lp_build_rsqrt_refine(bld, a, res);
2761 }
2762 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2763 res = lp_build_select(bld, cmp, inf, res);
2764 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2765 res = lp_build_select(bld, cmp, bld->zero, res);
2766 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2767 res = lp_build_select(bld, cmp, bld->one, res);
2768 }
2769
2770 return res;
2771 }
2772
2773 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2774 }
2775
2776
2777 /**
2778  * Report whether a fast (inaccurate) rsqrt instruction is available.
2779  * Callers may want to avoid rsqrt_fast if it's not available: e.g. for
2780  * calculating x^0.5 they may do rsqrt_fast(x) * x, but if that falls back
2781  * to the emulation path it becomes sqrt/div/mul, in which case it is
2782  * obviously better to just call sqrt and skip the div and mul.
2783 */
2784 bool
2785 lp_build_fast_rsqrt_available(struct lp_type type)
2786 {
2787 assert(type.floating);
2788
2789 if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2790 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2791 return true;
2792 }
2793 return false;
2794 }
2795
2796
2797 /**
2798 * Generate 1/sqrt(a).
2799 * Result is undefined for values < 0, infinity for +0.
2800 * Precision is limited, only ~10 bits guaranteed
2801 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2802 */
2803 LLVMValueRef
2804 lp_build_fast_rsqrt(struct lp_build_context *bld,
2805 LLVMValueRef a)
2806 {
2807 LLVMBuilderRef builder = bld->gallivm->builder;
2808 const struct lp_type type = bld->type;
2809
2810 assert(lp_check_value(type, a));
2811
2812 if (lp_build_fast_rsqrt_available(type)) {
2813 const char *intrinsic = NULL;
2814
2815 if (type.length == 4) {
2816 intrinsic = "llvm.x86.sse.rsqrt.ps";
2817 }
2818 else {
2819 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2820 }
2821 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2822 }
2823 else {
2824 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __func__);
2825 }
2826 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2827 }
2828
2829
2830 /**
2831 * Generate sin(a) or cos(a) using polynomial approximation.
2832  * TODO: it might be worth recognizing sin and cos of the same source
2833  * (i.e. the d3d10 sincos opcode). Doing both at the same time would be
2834  * far cheaper than calculating (nearly) everything twice, though it is
2835  * unclear whether that case is common enough to be worth bothering with;
2836  * the scs opcode could also benefit from calculating both.
2837 */
2838 static LLVMValueRef
2839 lp_build_sin_or_cos(struct lp_build_context *bld,
2840 LLVMValueRef a,
2841 bool cos)
2842 {
2843 struct gallivm_state *gallivm = bld->gallivm;
2844 LLVMBuilderRef b = gallivm->builder;
2845 struct lp_type int_type = lp_int_type(bld->type);
2846
2847 /*
2848 * take the absolute value,
2849 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2850 */
2851
2852 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2853 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2854
2855 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2856 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2857
2858 /*
2859 * scale by 4/Pi
2860 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2861 */
2862
2863 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2864 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2865
2866 /*
2867 * store the integer part of y in mm0
2868 * emm2 = _mm_cvttps_epi32(y);
2869 */
2870
2871 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2872
2873 /*
2874 * j=(j+1) & (~1) (see the cephes sources)
2875 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2876 */
2877
2878 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2879 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2880 /*
2881 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2882 */
2883 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2884 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2885
2886 /*
2887 * y = _mm_cvtepi32_ps(emm2);
2888 */
2889 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2890
2891 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2892 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2893 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2894 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2895
2896 /*
2897 * Argument used for poly selection and sign bit determination
2898 * is different for sin vs. cos.
2899 */
2900 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2901 emm2_and;
2902
2903 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2904 LLVMBuildNot(b, emm2_2, ""), ""),
2905 const_29, "sign_bit") :
2906 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2907 LLVMBuildShl(b, emm2_add,
2908 const_29, ""), ""),
2909 sign_mask, "sign_bit");
2910
2911 /*
2912     * get the polynomial selection mask
2913     * there is one polynomial for 0 <= x <= Pi/4
2914     * and another one for Pi/4 < x <= Pi/2
2915 * Both branches will be computed.
2916 *
2917 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2918 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2919 */
2920
2921 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2922 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2923 int_type, PIPE_FUNC_EQUAL,
2924 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2925
2926 /*
2927 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2928 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2929 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2930 */
2931 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2932 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2933 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2934
2935 /*
2936 * The magic pass: "Extended precision modular arithmetic"
2937 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2938 */
2939 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2940 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2941 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2942
2943 /*
2944     * Evaluate the first polynomial (0 <= x <= Pi/4)
2945 *
2946 * z = _mm_mul_ps(x,x);
2947 */
2948 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2949
2950 /*
2951 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2952 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2953 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2954 */
2955 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2956 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2957 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2958
2959 /*
2960 * y = *(v4sf*)_ps_coscof_p0;
2961 * y = _mm_mul_ps(y, z);
2962 */
2963 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2964 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2965 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2966 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2967
2968
2969 /*
2970 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2971 * y = _mm_sub_ps(y, tmp);
2972 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2973 */
2974 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2975 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2976 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2977 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2978 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2979
2980 /*
2981 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2982 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2983 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2984 */
2985 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2986 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2987 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2988
2989 /*
2990     * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2991 *
2992 * y2 = *(v4sf*)_ps_sincof_p0;
2993 * y2 = _mm_mul_ps(y2, z);
2994 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2995 * y2 = _mm_mul_ps(y2, z);
2996 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2997 * y2 = _mm_mul_ps(y2, z);
2998 * y2 = _mm_mul_ps(y2, x);
2999 * y2 = _mm_add_ps(y2, x);
3000 */
3001
3002 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3003 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3004 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3005 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3006
3007 /*
3008     * select the correct result from the two polynomials
3009 * xmm3 = poly_mask;
3010 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3011 * y = _mm_andnot_ps(xmm3, y);
3012 * y = _mm_or_ps(y,y2);
3013 */
3014 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3015 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3016 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3017 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3018 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3019 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3020
3021 /*
3022 * update the sign
3023 * y = _mm_xor_ps(y, sign_bit);
3024 */
3025 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3026 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3027
3028 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3029
3030 /* clamp output to be within [-1, 1] */
3031 y_result = lp_build_clamp(bld, y_result,
3032 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3033 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3034 /* If a is -inf, inf or NaN then return NaN */
3035 y_result = lp_build_select(bld, isfinite, y_result,
3036 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3037 return y_result;
3038 }
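
/*
 * In scalar terms the routine above follows the usual Cephes recipe (a rough
 * sketch, not an exact transcription):
 *
 *    j = (int)(|x| * 4/pi);  j = (j + 1) & ~1;     // even multiple of pi/4
 *    r = |x| - j * pi/4;                           // in extended precision
 *    pick the sine or cosine minimax polynomial in r*r based on j & 2;
 *    restore the sign from j (and from the sign of x for sine);
 *
 * plus the clamp and NaN handling added at the end for non-finite inputs.
 */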
3039
3040
3041 /**
3042 * Generate sin(a)
3043 */
3044 LLVMValueRef
3045 lp_build_sin(struct lp_build_context *bld,
3046 LLVMValueRef a)
3047 {
3048 const struct lp_type type = bld->type;
3049
3050 if (type.width == 16) {
3051 LLVMBuilderRef builder = bld->gallivm->builder;
3052 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3053 char intrinsic[32];
3054 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sin", vec_type);
3055 LLVMValueRef args[] = { a };
3056 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3057 }
3058
3059 return lp_build_sin_or_cos(bld, a, false);
3060 }
3061
3062
3063 /**
3064 * Generate cos(a)
3065 */
3066 LLVMValueRef
3067 lp_build_cos(struct lp_build_context *bld,
3068 LLVMValueRef a)
3069 {
3070 const struct lp_type type = bld->type;
3071
3072 if (type.width == 16) {
3073 LLVMBuilderRef builder = bld->gallivm->builder;
3074 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3075 char intrinsic[32];
3076 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.cos", vec_type);
3077 LLVMValueRef args[] = { a };
3078 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3079 }
3080
3081 return lp_build_sin_or_cos(bld, a, true);
3082 }
3083
3084
3085 /**
3086 * Generate pow(x, y)
3087 */
3088 LLVMValueRef
3089 lp_build_pow(struct lp_build_context *bld,
3090 LLVMValueRef x,
3091 LLVMValueRef y)
3092 {
3093 /* TODO: optimize the constant case */
3094 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3095 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3096 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3097 __func__);
3098 }
3099
3100 LLVMValueRef cmp = lp_build_cmp_ordered(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
3101 LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2_safe(bld, x), y));
3102
3103 res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
3104 return res;
3105 }
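
/*
 * The construction above is the usual identity
 *
 *    pow(x, y) = exp2(y * log2(x))
 *
 * using the edge-case-safe log2, with the ordered compare forcing the result
 * to 0.0 whenever x == 0.0.
 */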
3106
3107
3108 /**
3109 * Generate exp(x)
3110 */
3111 LLVMValueRef
3112 lp_build_exp(struct lp_build_context *bld,
3113 LLVMValueRef x)
3114 {
3115 /* log2(e) = 1/log(2) */
3116 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3117 1.4426950408889634);
3118
3119 assert(lp_check_value(bld->type, x));
3120
3121 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3122 }
3123
3124
3125 /**
3126 * Generate log(x)
3127 * Behavior is undefined with infs, 0s and nans
3128 */
3129 LLVMValueRef
3130 lp_build_log(struct lp_build_context *bld,
3131 LLVMValueRef x)
3132 {
3133 /* log(2) */
3134 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3135 0.69314718055994529);
3136
3137 assert(lp_check_value(bld->type, x));
3138
3139 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3140 }
3141
3142
3143 /**
3144 * Generate log(x) that handles edge cases (infs, 0s and nans)
3145 */
3146 LLVMValueRef
3147 lp_build_log_safe(struct lp_build_context *bld,
3148 LLVMValueRef x)
3149 {
3150 /* log(2) */
3151 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3152 0.69314718055994529);
3153
3154 assert(lp_check_value(bld->type, x));
3155
3156 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3157 }
3158
3159
3160 /**
3161 * Generate polynomial.
3162 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3163 */
3164 LLVMValueRef
3165 lp_build_polynomial(struct lp_build_context *bld,
3166 LLVMValueRef x,
3167 const double *coeffs,
3168 unsigned num_coeffs)
3169 {
3170 const struct lp_type type = bld->type;
3171 LLVMValueRef even = NULL, odd = NULL;
3172 LLVMValueRef x2;
3173 unsigned i;
3174
3175 assert(lp_check_value(bld->type, x));
3176
3177 /* TODO: optimize the constant case */
3178 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3179 LLVMIsConstant(x)) {
3180 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3181 __func__);
3182 }
3183
3184 /*
3185     * Calculate odd and even terms separately to decrease data dependency
3186 * Ex:
3187 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3188 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3189 */
3190 x2 = lp_build_mul(bld, x, x);
3191
3192 for (i = num_coeffs; i--; ) {
3193 LLVMValueRef coeff;
3194
3195 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3196
3197 if (i % 2 == 0) {
3198 if (even)
3199 even = lp_build_mad(bld, x2, even, coeff);
3200 else
3201 even = coeff;
3202 } else {
3203 if (odd)
3204 odd = lp_build_mad(bld, x2, odd, coeff);
3205 else
3206 odd = coeff;
3207 }
3208 }
3209
3210 if (odd)
3211 return lp_build_mad(bld, odd, x, even);
3212 else if (even)
3213 return even;
3214 else
3215 return bld->undef;
3216 }
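
/*
 * Scalar sketch of the even/odd split used above (illustration only), e.g.
 * for four coefficients c[0..3]:
 *
 *    x2   = x * x;
 *    even = c[0] + x2 * c[2];
 *    odd  = c[1] + x2 * c[3];
 *    poly = even + x * odd;
 *
 * The two Horner chains can be evaluated in parallel, which roughly halves
 * the dependency chain compared with a single Horner evaluation.
 */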
3217
3218
3219 /**
3220 * Minimax polynomial fit of 2**x, in range [0, 1[
3221 */
3222 static const double lp_build_exp2_polynomial[] = {
3223 #if EXP_POLY_DEGREE == 5
3224 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3225 0.693153073200168932794,
3226 0.240153617044375388211,
3227 0.0558263180532956664775,
3228 0.00898934009049466391101,
3229 0.00187757667519147912699
3230 #elif EXP_POLY_DEGREE == 4
3231 1.00000259337069434683,
3232 0.693003834469974940458,
3233 0.24144275689150793076,
3234 0.0520114606103070150235,
3235 0.0135341679161270268764
3236 #elif EXP_POLY_DEGREE == 3
3237 0.999925218562710312959,
3238 0.695833540494823811697,
3239 0.226067155427249155588,
3240 0.0780245226406372992967
3241 #elif EXP_POLY_DEGREE == 2
3242 1.00172476321474503578,
3243 0.657636275736077639316,
3244 0.33718943461968720704
3245 #else
3246 #error
3247 #endif
3248 };
3249
3250
3251 LLVMValueRef
3252 lp_build_exp2(struct lp_build_context *bld,
3253 LLVMValueRef x)
3254 {
3255 LLVMBuilderRef builder = bld->gallivm->builder;
3256 const struct lp_type type = bld->type;
3257 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3258 LLVMValueRef ipart = NULL;
3259 LLVMValueRef fpart = NULL;
3260 LLVMValueRef expipart = NULL;
3261 LLVMValueRef expfpart = NULL;
3262 LLVMValueRef res = NULL;
3263
3264 if (type.floating && type.width == 16) {
3265 char intrinsic[32];
3266 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.exp2", vec_type);
3267 LLVMValueRef args[] = { x };
3268 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3269 }
3270
3271 assert(lp_check_value(bld->type, x));
3272
3273 /* TODO: optimize the constant case */
3274 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3275 LLVMIsConstant(x)) {
3276 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3277 __func__);
3278 }
3279
3280 assert(type.floating && type.width == 32);
3281
3282    /* We want to preserve NaN and make sure that for exp2, if x > 128
3283     * the result is INF and if x is smaller than -126.9 the result is 0. */
3284 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3285 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3286 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3287 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3288
3289 /* ipart = floor(x) */
3290 /* fpart = x - ipart */
3291 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3292
3293 /* expipart = (float) (1 << ipart) */
3294 expipart = LLVMBuildAdd(builder, ipart,
3295 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3296 expipart = LLVMBuildShl(builder, expipart,
3297 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3298 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3299
3300 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3301 ARRAY_SIZE(lp_build_exp2_polynomial));
3302
3303 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3304
3305 return res;
3306 }
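
/*
 * Scalar sketch of the decomposition used above (assuming 32-bit floats,
 * illustration only):
 *
 *    x     = clamp(x, -126.99999, 128.0);          // NaN preserved
 *    ipart = ifloor(x);
 *    fpart = x - (float)ipart;                     // in [0, 1)
 *    scale = bitcast<float>((ipart + 127) << 23);  // exactly 2^ipart
 *    exp2(x) ~= scale * poly(fpart);               // minimax fit of 2^fpart
 *
 * The integer part goes straight into the exponent field, so only the
 * fractional part needs the polynomial.
 */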
3307
3308
3309 /**
3310  * Extract the exponent of an IEEE-754 floating point value.
3311 *
3312 * Optionally apply an integer bias.
3313 *
3314 * Result is an integer value with
3315 *
3316 * ifloor(log2(x)) + bias
3317 */
3318 LLVMValueRef
3319 lp_build_extract_exponent(struct lp_build_context *bld,
3320 LLVMValueRef x,
3321 int bias)
3322 {
3323 LLVMBuilderRef builder = bld->gallivm->builder;
3324 const struct lp_type type = bld->type;
3325 unsigned mantissa = lp_mantissa(type);
3326 LLVMValueRef res;
3327
3328 assert(type.floating);
3329
3330 assert(lp_check_value(bld->type, x));
3331
3332 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3333
3334 res = LLVMBuildLShr(builder, x,
3335 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3336 res = LLVMBuildAnd(builder, res,
3337 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3338 res = LLVMBuildSub(builder, res,
3339 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3340
3341 return res;
3342 }
3343
3344
3345 /**
3346  * Extract the mantissa of a floating point value.
3347 *
3348 * Result is a floating point value with
3349 *
3350  *    x / 2^floor(log2(x))
3351 */
3352 LLVMValueRef
3353 lp_build_extract_mantissa(struct lp_build_context *bld,
3354 LLVMValueRef x)
3355 {
3356 LLVMBuilderRef builder = bld->gallivm->builder;
3357 const struct lp_type type = bld->type;
3358 unsigned mantissa = lp_mantissa(type);
3359 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3360 (1ULL << mantissa) - 1);
3361 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3362 LLVMValueRef res;
3363
3364 assert(lp_check_value(bld->type, x));
3365
3366 assert(type.floating);
3367
3368 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3369
3370 /* res = x / 2**ipart */
3371 res = LLVMBuildAnd(builder, x, mantmask, "");
3372 res = LLVMBuildOr(builder, res, one, "");
3373 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3374
3375 return res;
3376 }
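
/*
 * Bit-level sketch of the two helpers above for a 32-bit float
 * x = (-1)^s * 2^e * 1.m:
 *
 *    extract_exponent(x) = ((bits >> 23) & 0xff) - 127 + bias
 *    extract_mantissa(x) = bitcast<float>((bits & 0x007fffff) | bits_of(1.0))
 *
 * i.e. the mantissa helper returns 1.m in [1, 2), which equals
 * x / 2^floor(log2(x)) for positive finite x.
 */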
3377
3378
3379
3380 /**
3381  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3382  * These coefficients can be generated with
3383 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3384 */
3385 static const double lp_build_log2_polynomial[] = {
3386 #if LOG_POLY_DEGREE == 5
3387 2.88539008148777786488L,
3388 0.961796878841293367824L,
3389 0.577058946784739859012L,
3390 0.412914355135828735411L,
3391 0.308591899232910175289L,
3392 0.352376952300281371868L,
3393 #elif LOG_POLY_DEGREE == 4
3394 2.88539009343309178325L,
3395 0.961791550404184197881L,
3396 0.577440339438736392009L,
3397 0.403343858251329912514L,
3398 0.406718052498846252698L,
3399 #elif LOG_POLY_DEGREE == 3
3400 2.88538959748872753838L,
3401 0.961932915889597772928L,
3402 0.571118517972136195241L,
3403 0.493997535084709500285L,
3404 #else
3405 #error
3406 #endif
3407 };
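
/*
 * Sketch of how this table is used in lp_build_log2_approx() below: writing
 * x = 2^e * m with m in [1, 2) and substituting y = (m - 1) / (m + 1),
 *
 *    log2(x) = e + log2(m) = e + y * P(y*y)
 *
 * where P is the minimax fit above; y*y stays within [0, 1/9[ because
 * (m - 1) / (m + 1) <= 1/3 on [1, 2).
 */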
3408
3409
3410 /**
3411 * See http://www.devmaster.net/forums/showthread.php?p=43580
3412 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3413 * http://www.nezumi.demon.co.uk/consult/logx.htm
3414 *
3415 * If handle_edge_cases is true the function will perform computations
3416 * to match the required D3D10+ behavior for each of the edge cases.
3417 * That means that if input is:
3418 * - less than zero (to and including -inf) then NaN will be returned
3419 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3420 * - +infinity, then +infinity will be returned
3421 * - NaN, then NaN will be returned
3422 *
3423 * Those checks are fairly expensive so if you don't need them make sure
3424 * handle_edge_cases is false.
3425 */
3426 void
3427 lp_build_log2_approx(struct lp_build_context *bld,
3428 LLVMValueRef x,
3429 LLVMValueRef *p_exp,
3430 LLVMValueRef *p_floor_log2,
3431 LLVMValueRef *p_log2,
3432 bool handle_edge_cases)
3433 {
3434 LLVMBuilderRef builder = bld->gallivm->builder;
3435 const struct lp_type type = bld->type;
3436 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3437 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3438
3439 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3440 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3441 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3442
3443 LLVMValueRef i = NULL;
3444 LLVMValueRef y = NULL;
3445 LLVMValueRef z = NULL;
3446 LLVMValueRef exp = NULL;
3447 LLVMValueRef mant = NULL;
3448 LLVMValueRef logexp = NULL;
3449 LLVMValueRef p_z = NULL;
3450 LLVMValueRef res = NULL;
3451
3452 if (bld->type.width == 16) {
3453 char intrinsic[32];
3454 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.log2", bld->vec_type);
3455 LLVMValueRef args[] = { x };
3456 if (p_log2)
3457 *p_log2 = lp_build_intrinsic(builder, intrinsic, bld->vec_type, args, 1, 0);
3458 return;
3459 }
3460
3461 assert(lp_check_value(bld->type, x));
3462
3463 if (p_exp || p_floor_log2 || p_log2) {
3464 /* TODO: optimize the constant case */
3465 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3466 LLVMIsConstant(x)) {
3467 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3468 __func__);
3469 }
3470
3471 assert(type.floating && type.width == 32);
3472
3473 /*
3474        * We don't explicitly handle denormalized numbers. They will yield
3475        * a result in the neighbourhood of -127, which appears to be
3476        * adequate.
3477 */
3478
3479 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3480
3481       /* exponent bits of x, kept in place; bitcast to float this is 2**floor(log2(x)) */
3482 exp = LLVMBuildAnd(builder, i, expmask, "");
3483 }
3484
3485 if (p_floor_log2 || p_log2) {
3486 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3487 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3488 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3489 }
3490
3491 if (p_log2) {
3492 /* mant = 1 + (float) mantissa(x) */
3493 mant = LLVMBuildAnd(builder, i, mantmask, "");
3494 mant = LLVMBuildOr(builder, mant, one, "");
3495 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3496
3497 /* y = (mant - 1) / (mant + 1) */
3498 y = lp_build_div(bld,
3499 lp_build_sub(bld, mant, bld->one),
3500 lp_build_add(bld, mant, bld->one));
3501
3502 /* z = y^2 */
3503 z = lp_build_mul(bld, y, y);
3504
3505 /* compute P(z) */
3506 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3507 ARRAY_SIZE(lp_build_log2_polynomial));
3508
3509 /* y * P(z) + logexp */
3510 res = lp_build_mad(bld, y, p_z, logexp);
3511
3512 if (type.floating && handle_edge_cases) {
3513 LLVMValueRef negmask, infmask, zmask;
3514 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3515 lp_build_const_vec(bld->gallivm, type, 0.0f));
3516 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3517 lp_build_const_vec(bld->gallivm, type, 0.0f));
3518 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3519 lp_build_const_vec(bld->gallivm, type, INFINITY));
3520
3521          /* If x is equal to inf make sure we return inf */
3522 res = lp_build_select(bld, infmask,
3523 lp_build_const_vec(bld->gallivm, type, INFINITY),
3524 res);
3525          /* If x is equal to 0, return -inf */
3526 res = lp_build_select(bld, zmask,
3527 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3528 res);
3529          /* If x is NaN or less than 0, return NaN */
3530 res = lp_build_select(bld, negmask,
3531 lp_build_const_vec(bld->gallivm, type, NAN),
3532 res);
3533 }
3534 }
3535
3536 if (p_exp) {
3537 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3538 *p_exp = exp;
3539 }
3540
3541 if (p_floor_log2)
3542 *p_floor_log2 = logexp;
3543
3544 if (p_log2)
3545 *p_log2 = res;
3546 }
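
/*
 * Illustrative scalar sketch (not compiled) of the approximation above for a
 * single 32-bit float, assuming IEEE-754 single precision and treating
 * lp_build_log2_polynomial[0] as the constant term (matching the series noted
 * next to the coefficients).  Edge cases (x <= 0, +inf, NaN) are ignored here,
 * as in the handle_edge_cases == false path.
 */
#if 0
static float
ref_log2_approx(float x)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);

   /* floor(log2(x)) for normalized x, from the biased exponent field */
   float logexp = (float)((int32_t)((bits >> 23) & 0xff) - 127);

   /* mantissa with the implicit leading one restored: mant in [1, 2) */
   uint32_t mbits = (bits & 0x007fffff) | 0x3f800000;
   float mant;
   memcpy(&mant, &mbits, sizeof mant);

   /* log2(mant) = y * P(y^2) with y = (mant - 1)/(mant + 1) */
   float y = (mant - 1.0f) / (mant + 1.0f);
   float z = y * y;
   float p = 0.0f;
   for (int i = (int)ARRAY_SIZE(lp_build_log2_polynomial) - 1; i >= 0; i--)
      p = p * z + (float)lp_build_log2_polynomial[i];

   return y * p + logexp;
}
#endif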
3547
3548
3549 /*
3550 * log2 implementation which doesn't have special code to
3551 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3552 * the results for those cases are undefined.
3553 */
3554 LLVMValueRef
3555 lp_build_log2(struct lp_build_context *bld,
3556 LLVMValueRef x)
3557 {
3558 LLVMValueRef res;
3559 lp_build_log2_approx(bld, x, NULL, NULL, &res, false);
3560 return res;
3561 }
3562
3563
3564 /*
3565 * Version of log2 which handles all edge cases.
3566 * Look at documentation of lp_build_log2_approx for
3567 * description of the behavior for each of the edge cases.
3568 */
3569 LLVMValueRef
3570 lp_build_log2_safe(struct lp_build_context *bld,
3571 LLVMValueRef x)
3572 {
3573 LLVMValueRef res;
3574 lp_build_log2_approx(bld, x, NULL, NULL, &res, true);
3575 return res;
3576 }
3577
3578
3579 /**
3580 * Faster (and less accurate) log2.
3581 *
3582  * log2(x) ~= floor(log2(x)) - 1 + x / 2**floor(log2(x))
3583  *
3584  * Piecewise linear approximation, with exact results when x is a
3585 * power of two.
3586 *
3587 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3588 */
3589 LLVMValueRef
3590 lp_build_fast_log2(struct lp_build_context *bld,
3591 LLVMValueRef x)
3592 {
3593 LLVMBuilderRef builder = bld->gallivm->builder;
3594 LLVMValueRef ipart;
3595 LLVMValueRef fpart;
3596
3597 assert(lp_check_value(bld->type, x));
3598
3599 assert(bld->type.floating);
3600
3601 /* ipart = floor(log2(x)) - 1 */
3602 ipart = lp_build_extract_exponent(bld, x, -1);
3603 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3604
3605    /* fpart = x / 2**floor(log2(x)), in [1, 2) */
3606 fpart = lp_build_extract_mantissa(bld, x);
3607
3608 /* ipart + fpart */
3609 return LLVMBuildFAdd(builder, ipart, fpart, "");
3610 }
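
/*
 * Illustrative scalar sketch (not compiled) of the piecewise-linear
 * approximation above, assuming IEEE-754 single precision.  Writing
 * x = 2**e * m with m in [1, 2), the result is (e - 1) + m; it equals
 * log2(x) exactly when m == 1.0 and is off by at most about 0.086 otherwise.
 */
#if 0
static float
ref_fast_log2(float x)
{
   uint32_t bits, mbits;
   float fpart;
   memcpy(&bits, &x, sizeof bits);

   float ipart = (float)((int32_t)((bits >> 23) & 0xff) - 128);   /* floor(log2(x)) - 1 */

   mbits = (bits & 0x007fffff) | 0x3f800000;
   memcpy(&fpart, &mbits, sizeof fpart);                          /* x / 2**floor(log2(x)) */

   return ipart + fpart;   /* e.g. x = 8.0: ipart = 2.0, fpart = 1.0 -> 3.0 */
}
#endif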
3611
3612
3613 /**
3614 * Fast implementation of iround(log2(x)).
3615 *
3616 * Not an approximation -- it should give accurate results all the time.
3617 */
3618 LLVMValueRef
3619 lp_build_ilog2(struct lp_build_context *bld,
3620 LLVMValueRef x)
3621 {
3622 LLVMBuilderRef builder = bld->gallivm->builder;
3623 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3624 LLVMValueRef ipart;
3625
3626 assert(bld->type.floating);
3627
3628 assert(lp_check_value(bld->type, x));
3629
3630    /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3631 x = LLVMBuildFMul(builder, x, sqrt2, "");
3632
3633 /* ipart = floor(log2(x) + 0.5) */
3634 ipart = lp_build_extract_exponent(bld, x, 0);
3635
3636 return ipart;
3637 }
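
/*
 * Illustrative scalar sketch (not compiled) of the rounding trick above:
 * multiplying by sqrt(2) adds 0.5 to log2(x), so the unbiased exponent of
 * the product is floor(log2(x) + 0.5) == iround(log2(x)).  Assumes x > 0
 * and that x * sqrt(2) does not overflow.
 */
#if 0
static int
ref_ilog2(float x)
{
   uint32_t bits;
   float xs = x * (float)M_SQRT2;            /* log2(xs) = log2(x) + 0.5 */
   memcpy(&bits, &xs, sizeof bits);
   return (int)((bits >> 23) & 0xff) - 127;  /* e.g. x = 5.0: log2 ~ 2.32 -> 2 */
}
#endif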
3638
3639 LLVMValueRef
3640 lp_build_mod(struct lp_build_context *bld,
3641 LLVMValueRef x,
3642 LLVMValueRef y)
3643 {
3644 LLVMBuilderRef builder = bld->gallivm->builder;
3645 LLVMValueRef res;
3646 const struct lp_type type = bld->type;
3647
3648 assert(lp_check_value(type, x));
3649 assert(lp_check_value(type, y));
3650
3651 if (type.floating)
3652 res = LLVMBuildFRem(builder, x, y, "");
3653 else if (type.sign)
3654 res = LLVMBuildSRem(builder, x, y, "");
3655 else
3656 res = LLVMBuildURem(builder, x, y, "");
3657 return res;
3658 }
3659
3660
3661 /*
3662  * For floating point inputs, creates and returns a mask
3663  * which is all 1's for channels that are NaN and
3664  * all 0's for channels that are not.
3665 */
3666 LLVMValueRef
3667 lp_build_isnan(struct lp_build_context *bld,
3668 LLVMValueRef x)
3669 {
3670 LLVMValueRef mask;
3671 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3672
3673 assert(bld->type.floating);
3674 assert(lp_check_value(bld->type, x));
3675
3676 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3677 "isnotnan");
3678 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3679 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3680 return mask;
3681 }
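
/*
 * Illustrative scalar sketch (not compiled) of the comparison used above:
 * under an ordered IEEE comparison only NaN compares unequal to itself,
 * so negating "x == x" is true exactly for NaN.  The vector version
 * sign-extends the i1 result into a full-width mask.
 */
#if 0
static int32_t
ref_isnan(float x)
{
   return (x == x) ? 0 : ~0;   /* all 1's for NaN, all 0's otherwise */
}
#endif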
3682
3683
3684 /* Returns all 1's for floating point numbers that are
3685  * finite and returns all 0's for -inf,
3686  * +inf and NaN. */
3687 LLVMValueRef
3688 lp_build_isfinite(struct lp_build_context *bld,
3689 LLVMValueRef x)
3690 {
3691 LLVMBuilderRef builder = bld->gallivm->builder;
3692 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3693 struct lp_type int_type = lp_int_type(bld->type);
3694 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3695 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3696 0x7f800000);
3697
3698 if (!bld->type.floating) {
3699 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3700 }
3701 assert(bld->type.floating);
3702 assert(lp_check_value(bld->type, x));
3703 assert(bld->type.width == 32);
3704
3705 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3706 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3707 intx, infornan32);
3708 }
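
/*
 * Illustrative scalar sketch (not compiled) of the test above, assuming
 * IEEE-754 single precision: a 32-bit float is +/-inf or NaN exactly when
 * all eight exponent bits are set, so "exponent bits != 0x7f800000" means
 * the value is finite.
 */
#if 0
static int32_t
ref_isfinite(float x)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);
   return ((bits & 0x7f800000) != 0x7f800000) ? ~0 : 0;
}
#endif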
3709
3710
3711 /*
3712 * Returns true if the number is nan or inf and false otherwise.
3713 * The input has to be a floating point vector.
3714 */
3715 LLVMValueRef
3716 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3717 const struct lp_type type,
3718 LLVMValueRef x)
3719 {
3720 LLVMBuilderRef builder = gallivm->builder;
3721 struct lp_type int_type = lp_int_type(type);
3722 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3723 0x7f800000);
3724 LLVMValueRef ret;
3725
3726 assert(type.floating);
3727
3728 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3729 ret = LLVMBuildAnd(builder, ret, const0, "");
3730 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3731 ret, const0);
3732
3733 return ret;
3734 }
3735
3736
3737 LLVMValueRef
3738 lp_build_fpstate_get(struct gallivm_state *gallivm)
3739 {
3740 if (util_get_cpu_caps()->has_sse) {
3741 LLVMBuilderRef builder = gallivm->builder;
3742 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3743 gallivm,
3744 LLVMInt32TypeInContext(gallivm->context),
3745 "mxcsr_ptr");
3746 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3747 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3748 lp_build_intrinsic(builder,
3749 "llvm.x86.sse.stmxcsr",
3750 LLVMVoidTypeInContext(gallivm->context),
3751 &mxcsr_ptr8, 1, 0);
3752 return mxcsr_ptr;
3753 }
3754 return 0;
3755 }
3756
3757 void
3758 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3759 bool zero)
3760 {
3761 if (util_get_cpu_caps()->has_sse) {
3762 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3763 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3764
3765 LLVMBuilderRef builder = gallivm->builder;
3766 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3767 LLVMValueRef mxcsr =
3768 LLVMBuildLoad2(builder, LLVMInt32TypeInContext(gallivm->context), mxcsr_ptr, "mxcsr");
3769
3770 if (util_get_cpu_caps()->has_daz) {
3771          /* Enable denormals-are-zero mode */
3772 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3773 }
3774 if (zero) {
3775 mxcsr = LLVMBuildOr(builder, mxcsr,
3776 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3777 } else {
3778 mxcsr = LLVMBuildAnd(builder, mxcsr,
3779 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3780 }
3781
3782 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3783 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3784 }
3785 }
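
/*
 * Host-side sketch (not compiled) of the MXCSR update that the generated
 * code performs, using the SSE intrinsics from <xmmintrin.h>: FTZ (0x8000)
 * flushes denormal results to zero and DAZ (0x0040) treats denormal inputs
 * as zero.  The builders above emit the equivalent stmxcsr/ldmxcsr sequence
 * into the JIT-compiled code instead of touching the compiler's own state.
 */
#if 0
static void
ref_set_denorms_zero(bool zero)
{
   unsigned mxcsr = _mm_getcsr();
   unsigned daz_ftz = _MM_FLUSH_ZERO_MASK;        /* FTZ */
   if (util_get_cpu_caps()->has_daz)
      daz_ftz |= _MM_DENORMALS_ZERO_MASK;         /* DAZ, if supported */
   if (zero)
      mxcsr |= daz_ftz;
   else
      mxcsr &= ~daz_ftz;
   _mm_setcsr(mxcsr);
}
#endif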
3786
3787
3788 void
3789 lp_build_fpstate_set(struct gallivm_state *gallivm,
3790 LLVMValueRef mxcsr_ptr)
3791 {
3792 if (util_get_cpu_caps()->has_sse) {
3793 LLVMBuilderRef builder = gallivm->builder;
3794 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3795 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3796 lp_build_intrinsic(builder,
3797 "llvm.x86.sse.ldmxcsr",
3798 LLVMVoidTypeInContext(gallivm->context),
3799 &mxcsr_ptr, 1, 0);
3800 }
3801 }
3802