1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
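/*
 * Typical usage (an illustrative sketch only, not code from this file):
 * callers initialize an lp_build_context for the element type they work
 * with and then chain the helpers below, e.g.
 *
 *    struct lp_build_context bld;
 *    lp_build_context_init(&bld, gallivm, lp_type_float_vec(32, 128));
 *    LLVMValueRef ab = lp_build_mul(&bld, a, b);
 *    ab = lp_build_clamp(&bld, ab, bld.zero, bld.one);
 *
 * The lp_type constructor chosen above is just an assumption for the
 * example; any properly initialized lp_type works the same way.
 */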
46
47
48 #include <float.h>
49
50 #include <llvm/Config/llvm-config.h>
51
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
56
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
67
68 #if defined(PIPE_ARCH_SSE)
69 #include <xmmintrin.h>
70 #endif
71
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #endif
75
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #endif
79
80 #define EXP_POLY_DEGREE 5
81
82 #define LOG_POLY_DEGREE 4
83
84
85 /**
86 * Generate min(a, b)
87 * No checks for the special case values of a or b = 1 or 0 are done.
88 * NaNs are handled according to the behavior specified by the
89 * nan_behavior argument.
90 */
91 static LLVMValueRef
92 lp_build_min_simple(struct lp_build_context *bld,
93 LLVMValueRef a,
94 LLVMValueRef b,
95 enum gallivm_nan_behavior nan_behavior)
96 {
97 const struct lp_type type = bld->type;
98 const char *intrinsic = NULL;
99 unsigned intr_size = 0;
100 LLVMValueRef cond;
101
102 assert(lp_check_value(type, a));
103 assert(lp_check_value(type, b));
104
105 /* TODO: optimize the constant case */
106
107 if (type.floating && util_cpu_caps.has_sse) {
108 if (type.width == 32) {
109 if (type.length == 1) {
110 intrinsic = "llvm.x86.sse.min.ss";
111 intr_size = 128;
112 }
113 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
114 intrinsic = "llvm.x86.sse.min.ps";
115 intr_size = 128;
116 }
117 else {
118 intrinsic = "llvm.x86.avx.min.ps.256";
119 intr_size = 256;
120 }
121 }
122 if (type.width == 64 && util_cpu_caps.has_sse2) {
123 if (type.length == 1) {
124 intrinsic = "llvm.x86.sse2.min.sd";
125 intr_size = 128;
126 }
127 else if (type.length == 2 || !util_cpu_caps.has_avx) {
128 intrinsic = "llvm.x86.sse2.min.pd";
129 intr_size = 128;
130 }
131 else {
132 intrinsic = "llvm.x86.avx.min.pd.256";
133 intr_size = 256;
134 }
135 }
136 }
137 else if (type.floating && util_cpu_caps.has_altivec) {
138 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
139 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
140 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
141 __FUNCTION__);
142 }
143 if (type.width == 32 && type.length == 4) {
144 intrinsic = "llvm.ppc.altivec.vminfp";
145 intr_size = 128;
146 }
147 } else if (util_cpu_caps.has_altivec) {
148 intr_size = 128;
149 if (type.width == 8) {
150 if (!type.sign) {
151 intrinsic = "llvm.ppc.altivec.vminub";
152 } else {
153 intrinsic = "llvm.ppc.altivec.vminsb";
154 }
155 } else if (type.width == 16) {
156 if (!type.sign) {
157 intrinsic = "llvm.ppc.altivec.vminuh";
158 } else {
159 intrinsic = "llvm.ppc.altivec.vminsh";
160 }
161 } else if (type.width == 32) {
162 if (!type.sign) {
163 intrinsic = "llvm.ppc.altivec.vminuw";
164 } else {
165 intrinsic = "llvm.ppc.altivec.vminsw";
166 }
167 }
168 }
169
170 if (intrinsic) {
171 /* We need to handle NaNs for floating point numbers. If one of the
172 * inputs is NaN the other should be returned (required by both D3D10+
173 * and OpenCL).
174 * The SSE intrinsics return the second operand in case of NaN by
175 * default, so we need special code to handle those.
176 */
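/* Illustrative reference (SSE semantics): minps(NaN, x) = x but
 * minps(x, NaN) = NaN, i.e. the second operand is returned whenever the
 * comparison is unordered. Hence GALLIVM_NAN_RETURN_OTHER only needs to
 * patch up the case where b is NaN, and GALLIVM_NAN_RETURN_NAN the case
 * where a is NaN.
 */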
177 if (util_cpu_caps.has_sse && type.floating &&
178 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
179 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
180 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
181 LLVMValueRef isnan, min;
182 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
183 type,
184 intr_size, a, b);
185 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
186 isnan = lp_build_isnan(bld, b);
187 return lp_build_select(bld, isnan, a, min);
188 } else {
189 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
190 isnan = lp_build_isnan(bld, a);
191 return lp_build_select(bld, isnan, a, min);
192 }
193 } else {
194 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
195 type,
196 intr_size, a, b);
197 }
198 }
199
200 if (type.floating) {
201 switch (nan_behavior) {
202 case GALLIVM_NAN_RETURN_NAN: {
203 LLVMValueRef isnan = lp_build_isnan(bld, b);
204 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
205 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
206 return lp_build_select(bld, cond, a, b);
207 }
208 break;
209 case GALLIVM_NAN_RETURN_OTHER: {
210 LLVMValueRef isnan = lp_build_isnan(bld, a);
211 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
212 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
213 return lp_build_select(bld, cond, a, b);
214 }
215 break;
216 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
217 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
218 return lp_build_select(bld, cond, a, b);
219 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
220 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
221 return lp_build_select(bld, cond, b, a);
222 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
223 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
224 return lp_build_select(bld, cond, a, b);
225 break;
226 default:
227 assert(0);
228 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
229 return lp_build_select(bld, cond, a, b);
230 }
231 } else {
232 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
233 return lp_build_select(bld, cond, a, b);
234 }
235 }
236
237
238 LLVMValueRef
239 lp_build_fmuladd(LLVMBuilderRef builder,
240 LLVMValueRef a,
241 LLVMValueRef b,
242 LLVMValueRef c)
243 {
244 LLVMTypeRef type = LLVMTypeOf(a);
245 assert(type == LLVMTypeOf(b));
246 assert(type == LLVMTypeOf(c));
247
248 char intrinsic[32];
249 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
250 LLVMValueRef args[] = { a, b, c };
251 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
252 }
253
254
255 /**
256 * Generate max(a, b)
257 * No checks for the special case values of a or b = 1 or 0 are done.
258 * NaNs are handled according to the behavior specified by the
259 * nan_behavior argument.
260 */
261 static LLVMValueRef
262 lp_build_max_simple(struct lp_build_context *bld,
263 LLVMValueRef a,
264 LLVMValueRef b,
265 enum gallivm_nan_behavior nan_behavior)
266 {
267 const struct lp_type type = bld->type;
268 const char *intrinsic = NULL;
269 unsigned intr_size = 0;
270 LLVMValueRef cond;
271
272 assert(lp_check_value(type, a));
273 assert(lp_check_value(type, b));
274
275 /* TODO: optimize the constant case */
276
277 if (type.floating && util_cpu_caps.has_sse) {
278 if (type.width == 32) {
279 if (type.length == 1) {
280 intrinsic = "llvm.x86.sse.max.ss";
281 intr_size = 128;
282 }
283 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
284 intrinsic = "llvm.x86.sse.max.ps";
285 intr_size = 128;
286 }
287 else {
288 intrinsic = "llvm.x86.avx.max.ps.256";
289 intr_size = 256;
290 }
291 }
292 if (type.width == 64 && util_cpu_caps.has_sse2) {
293 if (type.length == 1) {
294 intrinsic = "llvm.x86.sse2.max.sd";
295 intr_size = 128;
296 }
297 else if (type.length == 2 || !util_cpu_caps.has_avx) {
298 intrinsic = "llvm.x86.sse2.max.pd";
299 intr_size = 128;
300 }
301 else {
302 intrinsic = "llvm.x86.avx.max.pd.256";
303 intr_size = 256;
304 }
305 }
306 }
307 else if (type.floating && util_cpu_caps.has_altivec) {
308 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
309 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
310 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
311 __FUNCTION__);
312 }
313 if (type.width == 32 && type.length == 4) {
314 intrinsic = "llvm.ppc.altivec.vmaxfp";
315 intr_size = 128;
316 }
317 } else if (util_cpu_caps.has_altivec) {
318 intr_size = 128;
319 if (type.width == 8) {
320 if (!type.sign) {
321 intrinsic = "llvm.ppc.altivec.vmaxub";
322 } else {
323 intrinsic = "llvm.ppc.altivec.vmaxsb";
324 }
325 } else if (type.width == 16) {
326 if (!type.sign) {
327 intrinsic = "llvm.ppc.altivec.vmaxuh";
328 } else {
329 intrinsic = "llvm.ppc.altivec.vmaxsh";
330 }
331 } else if (type.width == 32) {
332 if (!type.sign) {
333 intrinsic = "llvm.ppc.altivec.vmaxuw";
334 } else {
335 intrinsic = "llvm.ppc.altivec.vmaxsw";
336 }
337 }
338 }
339
340 if (intrinsic) {
341 if (util_cpu_caps.has_sse && type.floating &&
342 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
343 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
344 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
345 LLVMValueRef isnan, max;
346 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
347 type,
348 intr_size, a, b);
349 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
350 isnan = lp_build_isnan(bld, b);
351 return lp_build_select(bld, isnan, a, max);
352 } else {
353 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
354 isnan = lp_build_isnan(bld, a);
355 return lp_build_select(bld, isnan, a, max);
356 }
357 } else {
358 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
359 type,
360 intr_size, a, b);
361 }
362 }
363
364 if (type.floating) {
365 switch (nan_behavior) {
366 case GALLIVM_NAN_RETURN_NAN: {
367 LLVMValueRef isnan = lp_build_isnan(bld, b);
368 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
369 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
370 return lp_build_select(bld, cond, a, b);
371 }
372 break;
373 case GALLIVM_NAN_RETURN_OTHER: {
374 LLVMValueRef isnan = lp_build_isnan(bld, a);
375 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
376 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
377 return lp_build_select(bld, cond, a, b);
378 }
379 break;
380 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
381 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
382 return lp_build_select(bld, cond, a, b);
383 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
384 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
385 return lp_build_select(bld, cond, b, a);
386 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
387 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
388 return lp_build_select(bld, cond, a, b);
389 break;
390 default:
391 assert(0);
392 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
393 return lp_build_select(bld, cond, a, b);
394 }
395 } else {
396 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
397 return lp_build_select(bld, cond, a, b);
398 }
399 }
400
401
402 /**
403 * Generate 1 - a, or ~a depending on bld->type.
404 */
405 LLVMValueRef
406 lp_build_comp(struct lp_build_context *bld,
407 LLVMValueRef a)
408 {
409 LLVMBuilderRef builder = bld->gallivm->builder;
410 const struct lp_type type = bld->type;
411
412 assert(lp_check_value(type, a));
413
414 if(a == bld->one)
415 return bld->zero;
416 if(a == bld->zero)
417 return bld->one;
418
419 if(type.norm && !type.floating && !type.fixed && !type.sign) {
420 if(LLVMIsConstant(a))
421 return LLVMConstNot(a);
422 else
423 return LLVMBuildNot(builder, a, "");
424 }
425
426 if(LLVMIsConstant(a))
427 if (type.floating)
428 return LLVMConstFSub(bld->one, a);
429 else
430 return LLVMConstSub(bld->one, a);
431 else
432 if (type.floating)
433 return LLVMBuildFSub(builder, bld->one, a, "");
434 else
435 return LLVMBuildSub(builder, bld->one, a, "");
436 }
437
438
439 /**
440 * Generate a + b
441 */
442 LLVMValueRef
443 lp_build_add(struct lp_build_context *bld,
444 LLVMValueRef a,
445 LLVMValueRef b)
446 {
447 LLVMBuilderRef builder = bld->gallivm->builder;
448 const struct lp_type type = bld->type;
449 LLVMValueRef res;
450
451 assert(lp_check_value(type, a));
452 assert(lp_check_value(type, b));
453
454 if (a == bld->zero)
455 return b;
456 if (b == bld->zero)
457 return a;
458 if (a == bld->undef || b == bld->undef)
459 return bld->undef;
460
461 if (type.norm) {
462 const char *intrinsic = NULL;
463
464 if (!type.sign && (a == bld->one || b == bld->one))
465 return bld->one;
466
467 if (!type.floating && !type.fixed) {
468 if (LLVM_VERSION_MAJOR >= 8) {
469 char intrin[32];
470 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
471 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
472 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
473 }
474 if (type.width * type.length == 128) {
475 if (util_cpu_caps.has_sse2) {
476 if (type.width == 8)
477 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
478 if (type.width == 16)
479 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
480 } else if (util_cpu_caps.has_altivec) {
481 if (type.width == 8)
482 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
483 if (type.width == 16)
484 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
485 }
486 }
487 if (type.width * type.length == 256) {
488 if (util_cpu_caps.has_avx2) {
489 if (type.width == 8)
490 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
491 if (type.width == 16)
492 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
493 }
494 }
495 }
496
497 if (intrinsic)
498 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
499 }
500
501 if(type.norm && !type.floating && !type.fixed) {
502 if (type.sign) {
503 uint64_t sign = (uint64_t)1 << (type.width - 1);
504 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
505 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
506 /* a_clamp_max is the maximum a for positive b,
507 a_clamp_min is the minimum a for negative b. */
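/* Worked example (illustrative, signed 8-bit): for b = 100, a is first
 * clamped to at most 127 - 100 = 27, so a + b cannot exceed 127; for
 * b = -100 it is clamped to at least -128 - (-100) = -28, so a + b cannot
 * underflow -128.
 */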
508 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
509 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
510 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
511 }
512 }
513
514 if(LLVMIsConstant(a) && LLVMIsConstant(b))
515 if (type.floating)
516 res = LLVMConstFAdd(a, b);
517 else
518 res = LLVMConstAdd(a, b);
519 else
520 if (type.floating)
521 res = LLVMBuildFAdd(builder, a, b, "");
522 else
523 res = LLVMBuildAdd(builder, a, b, "");
524
525 /* clamp to ceiling of 1.0 */
526 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
527 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
528
529 if (type.norm && !type.floating && !type.fixed) {
530 if (!type.sign) {
531 /*
532 * newer llvm versions no longer support the intrinsics, but recognize
533 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
534 * code, it is important we match the pattern llvm uses (and pray llvm
535 * doesn't change it - and hope they decide on the same pattern for
536 * all backends supporting it...).
537 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
538 * interfere with llvm's ability to recognize the pattern but seems
539 * a bit brittle.
540 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
541 */
542 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
543 res = lp_build_select(bld, overflowed,
544 LLVMConstAllOnes(bld->int_vec_type), res);
545 }
546 }
547
548 /* XXX clamp to floor of -1 or 0??? */
549
550 return res;
551 }
552
553
554 /** Return the scalar sum of the elements of a.
555 * Callers should avoid this operation whenever possible.
556 */
557 LLVMValueRef
558 lp_build_horizontal_add(struct lp_build_context *bld,
559 LLVMValueRef a)
560 {
561 LLVMBuilderRef builder = bld->gallivm->builder;
562 const struct lp_type type = bld->type;
563 LLVMValueRef index, res;
564 unsigned i, length;
565 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
566 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
567 LLVMValueRef vecres, elem2;
568
569 assert(lp_check_value(type, a));
570
571 if (type.length == 1) {
572 return a;
573 }
574
575 assert(!bld->type.norm);
576
577 /*
578 * For byte vectors we could do much better with psadbw.
579 * Using repeated shuffle/adds here. Note that with multiple vectors
580 * this can be done more efficiently, as outlined in the Intel
581 * optimization manual.
582 * Note: could cause data rearrangement if used with smaller element
583 * sizes.
584 */
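/* Sketch of the reduction below for a length-4 vector {a, b, c, d}
 * (illustrative only):
 *   step 1: {a, b} + {c, d} -> {a+c, b+d}
 *   step 2: extract both lanes and add -> (a+c) + (b+d)
 */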
585
586 vecres = a;
587 length = type.length / 2;
588 while (length > 1) {
589 LLVMValueRef vec1, vec2;
590 for (i = 0; i < length; i++) {
591 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
592 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
593 }
594 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
595 LLVMConstVector(shuffles1, length), "");
596 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
597 LLVMConstVector(shuffles2, length), "");
598 if (type.floating) {
599 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
600 }
601 else {
602 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
603 }
604 length = length >> 1;
605 }
606
607 /* always have vector of size 2 here */
608 assert(length == 1);
609
610 index = lp_build_const_int32(bld->gallivm, 0);
611 res = LLVMBuildExtractElement(builder, vecres, index, "");
612 index = lp_build_const_int32(bld->gallivm, 1);
613 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
614
615 if (type.floating)
616 res = LLVMBuildFAdd(builder, res, elem2, "");
617 else
618 res = LLVMBuildAdd(builder, res, elem2, "");
619
620 return res;
621 }
622
623 /**
624 * Return the horizontal sums of 4 float vectors as a float4 vector.
625 * This uses the technique outlined in the Intel Optimization Manual.
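 * I.e. result lane i holds the sum of the four elements of src[i]:
 * { sum(src[0]), sum(src[1]), sum(src[2]), sum(src[3]) }.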
626 */
627 static LLVMValueRef
628 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
629 LLVMValueRef src[4])
630 {
631 struct gallivm_state *gallivm = bld->gallivm;
632 LLVMBuilderRef builder = gallivm->builder;
633 LLVMValueRef shuffles[4];
634 LLVMValueRef tmp[4];
635 LLVMValueRef sumtmp[2], shuftmp[2];
636
637 /* lower half of regs */
638 shuffles[0] = lp_build_const_int32(gallivm, 0);
639 shuffles[1] = lp_build_const_int32(gallivm, 1);
640 shuffles[2] = lp_build_const_int32(gallivm, 4);
641 shuffles[3] = lp_build_const_int32(gallivm, 5);
642 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
643 LLVMConstVector(shuffles, 4), "");
644 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
645 LLVMConstVector(shuffles, 4), "");
646
647 /* upper half of regs */
648 shuffles[0] = lp_build_const_int32(gallivm, 2);
649 shuffles[1] = lp_build_const_int32(gallivm, 3);
650 shuffles[2] = lp_build_const_int32(gallivm, 6);
651 shuffles[3] = lp_build_const_int32(gallivm, 7);
652 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
653 LLVMConstVector(shuffles, 4), "");
654 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
655 LLVMConstVector(shuffles, 4), "");
656
657 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
658 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
659
660 shuffles[0] = lp_build_const_int32(gallivm, 0);
661 shuffles[1] = lp_build_const_int32(gallivm, 2);
662 shuffles[2] = lp_build_const_int32(gallivm, 4);
663 shuffles[3] = lp_build_const_int32(gallivm, 6);
664 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
665 LLVMConstVector(shuffles, 4), "");
666
667 shuffles[0] = lp_build_const_int32(gallivm, 1);
668 shuffles[1] = lp_build_const_int32(gallivm, 3);
669 shuffles[2] = lp_build_const_int32(gallivm, 5);
670 shuffles[3] = lp_build_const_int32(gallivm, 7);
671 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
672 LLVMConstVector(shuffles, 4), "");
673
674 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
675 }
676
677
678 /*
679 * partially horizontally add 2-4 float vectors with length nx4,
680 * i.e. only four adjacent values in each vector will be added,
681 * assuming values are really grouped in 4 which also determines
682 * output order.
683 *
684 * Return a vector of the same length as the initial vectors,
685 * with the excess elements (if any) being undefined.
686 * The element order is independent of number of input vectors.
687 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
688 * the output order thus will be
689 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
690 */
691 LLVMValueRef
692 lp_build_hadd_partial4(struct lp_build_context *bld,
693 LLVMValueRef vectors[],
694 unsigned num_vecs)
695 {
696 struct gallivm_state *gallivm = bld->gallivm;
697 LLVMBuilderRef builder = gallivm->builder;
698 LLVMValueRef ret_vec;
699 LLVMValueRef tmp[4];
700 const char *intrinsic = NULL;
701
702 assert(num_vecs >= 2 && num_vecs <= 4);
703 assert(bld->type.floating);
704
705 /* only use this with at least 2 vectors, as it is sort of expensive
706 * (depending on cpu) and we always need two horizontal adds anyway,
707 * so a shuffle/add approach might be better.
708 */
709
710 tmp[0] = vectors[0];
711 tmp[1] = vectors[1];
712
713 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
714 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
715
716 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
717 bld->type.length == 4) {
718 intrinsic = "llvm.x86.sse3.hadd.ps";
719 }
720 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
721 bld->type.length == 8) {
722 intrinsic = "llvm.x86.avx.hadd.ps.256";
723 }
724 if (intrinsic) {
725 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
726 lp_build_vec_type(gallivm, bld->type),
727 tmp[0], tmp[1]);
728 if (num_vecs > 2) {
729 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
730 lp_build_vec_type(gallivm, bld->type),
731 tmp[2], tmp[3]);
732 }
733 else {
734 tmp[1] = tmp[0];
735 }
736 return lp_build_intrinsic_binary(builder, intrinsic,
737 lp_build_vec_type(gallivm, bld->type),
738 tmp[0], tmp[1]);
739 }
740
741 if (bld->type.length == 4) {
742 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
743 }
744 else {
745 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
746 unsigned j;
747 unsigned num_iter = bld->type.length / 4;
748 struct lp_type parttype = bld->type;
749 parttype.length = 4;
750 for (j = 0; j < num_iter; j++) {
751 LLVMValueRef partsrc[4];
752 unsigned i;
753 for (i = 0; i < 4; i++) {
754 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
755 }
756 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
757 }
758 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
759 }
760 return ret_vec;
761 }
762
763 /**
764 * Generate a - b
765 */
766 LLVMValueRef
767 lp_build_sub(struct lp_build_context *bld,
768 LLVMValueRef a,
769 LLVMValueRef b)
770 {
771 LLVMBuilderRef builder = bld->gallivm->builder;
772 const struct lp_type type = bld->type;
773 LLVMValueRef res;
774
775 assert(lp_check_value(type, a));
776 assert(lp_check_value(type, b));
777
778 if (b == bld->zero)
779 return a;
780 if (a == bld->undef || b == bld->undef)
781 return bld->undef;
782 if (a == b)
783 return bld->zero;
784
785 if (type.norm) {
786 const char *intrinsic = NULL;
787
788 if (!type.sign && b == bld->one)
789 return bld->zero;
790
791 if (!type.floating && !type.fixed) {
792 if (LLVM_VERSION_MAJOR >= 8) {
793 char intrin[32];
794 intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
795 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
796 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
797 }
798 if (type.width * type.length == 128) {
799 if (util_cpu_caps.has_sse2) {
800 if (type.width == 8)
801 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
802 if (type.width == 16)
803 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
804 } else if (util_cpu_caps.has_altivec) {
805 if (type.width == 8)
806 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
807 if (type.width == 16)
808 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
809 }
810 }
811 if (type.width * type.length == 256) {
812 if (util_cpu_caps.has_avx2) {
813 if (type.width == 8)
814 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
815 if (type.width == 16)
816 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
817 }
818 }
819 }
820
821 if (intrinsic)
822 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
823 }
824
825 if(type.norm && !type.floating && !type.fixed) {
826 if (type.sign) {
827 uint64_t sign = (uint64_t)1 << (type.width - 1);
828 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
829 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
830 /* a_clamp_max is the maximum a for negative b,
831 a_clamp_min is the minimum a for positive b. */
832 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
833 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
834 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
835 } else {
836 /*
837 * This must match the llvm pattern for saturated unsigned sub.
838 * (lp_build_max_simple actually does the job with its current
839 * definition but do it explicitly here.)
840 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
841 * interfere with llvm's ability to recognize the pattern but seems
842 * a bit brittle.
843 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
844 */
845 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
846 a = lp_build_select(bld, no_ov, a, b);
847 }
848 }
849
850 if(LLVMIsConstant(a) && LLVMIsConstant(b))
851 if (type.floating)
852 res = LLVMConstFSub(a, b);
853 else
854 res = LLVMConstSub(a, b);
855 else
856 if (type.floating)
857 res = LLVMBuildFSub(builder, a, b, "");
858 else
859 res = LLVMBuildSub(builder, a, b, "");
860
861 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
862 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
863
864 return res;
865 }
866
867
868
869 /**
870 * Normalized multiplication.
871 *
872 * There are several approaches (using 8-bit normalized multiplication as
873 * an example):
874 *
875 * - alpha plus one
876 *
877 * makes the following approximation to the division (Sree)
878 *
879 * a*b/255 ~= (a*(b + 1)) >> 8
880 *
881 * which is the fastest method that satisfies the following OpenGL criteria of
882 *
883 * 0*0 = 0 and 255*255 = 255
884 *
885 * - geometric series
886 *
887 * takes the geometric series approximation to the division
888 *
889 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
890 *
891 * in this case just the first two terms to fit in 16bit arithmetic
892 *
893 * t/255 ~= (t + (t >> 8)) >> 8
894 *
895 * note that by itself it doesn't satisfy the OpenGL criteria, as
896 * 255*255 = 254, so the special case b = 255 must be accounted for, or
897 * rounding must be used.
898 *
899 * - geometric series plus rounding
900 *
901 * when using the geometric series division, instead of truncating the
902 * result use rounding in the approximation (Jim Blinn)
903 *
904 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
905 *
906 * achieving exact results.
907 *
908 *
909 *
910 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
911 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
912 * @sa Michael Herf, The "double blend trick", May 2000,
913 * http://www.stereopsis.com/doubleblend.html
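 *
 * Worked example of the rounded geometric series (n = 8, values chosen for
 * illustration): a = b = 255 gives t = 65025 and
 * (65025 + (65025 >> 8) + 0x80) >> 8 = (65025 + 254 + 128) >> 8 = 255,
 * while a = 1, b = 255 gives (255 + 0 + 128) >> 8 = 1, as expected.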
914 */
915 LLVMValueRef
916 lp_build_mul_norm(struct gallivm_state *gallivm,
917 struct lp_type wide_type,
918 LLVMValueRef a, LLVMValueRef b)
919 {
920 LLVMBuilderRef builder = gallivm->builder;
921 struct lp_build_context bld;
922 unsigned n;
923 LLVMValueRef half;
924 LLVMValueRef ab;
925
926 assert(!wide_type.floating);
927 assert(lp_check_value(wide_type, a));
928 assert(lp_check_value(wide_type, b));
929
930 lp_build_context_init(&bld, gallivm, wide_type);
931
932 n = wide_type.width / 2;
933 if (wide_type.sign) {
934 --n;
935 }
936
937 /*
938 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
939 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
940 */
941
942 /*
943 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
944 */
945
946 ab = LLVMBuildMul(builder, a, b, "");
947 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
948
949 /*
950 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
951 */
952
953 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
954 if (wide_type.sign) {
955 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
956 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
957 half = lp_build_select(&bld, sign, minus_half, half);
958 }
959 ab = LLVMBuildAdd(builder, ab, half, "");
960
961 /* Final division */
962 ab = lp_build_shr_imm(&bld, ab, n);
963
964 return ab;
965 }
966
967 /**
968 * Generate a * b
969 */
970 LLVMValueRef
971 lp_build_mul(struct lp_build_context *bld,
972 LLVMValueRef a,
973 LLVMValueRef b)
974 {
975 LLVMBuilderRef builder = bld->gallivm->builder;
976 const struct lp_type type = bld->type;
977 LLVMValueRef shift;
978 LLVMValueRef res;
979
980 assert(lp_check_value(type, a));
981 assert(lp_check_value(type, b));
982
983 if(a == bld->zero)
984 return bld->zero;
985 if(a == bld->one)
986 return b;
987 if(b == bld->zero)
988 return bld->zero;
989 if(b == bld->one)
990 return a;
991 if(a == bld->undef || b == bld->undef)
992 return bld->undef;
993
994 if (!type.floating && !type.fixed && type.norm) {
995 struct lp_type wide_type = lp_wider_type(type);
996 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
997
998 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
999 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1000
1001 /* PMULLW, PSRLW, PADDW */
1002 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1003 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1004
1005 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1006
1007 return ab;
1008 }
1009
1010 if(type.fixed)
1011 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1012 else
1013 shift = NULL;
1014
1015 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1016 if (type.floating)
1017 res = LLVMConstFMul(a, b);
1018 else
1019 res = LLVMConstMul(a, b);
1020 if(shift) {
1021 if(type.sign)
1022 res = LLVMConstAShr(res, shift);
1023 else
1024 res = LLVMConstLShr(res, shift);
1025 }
1026 }
1027 else {
1028 if (type.floating)
1029 res = LLVMBuildFMul(builder, a, b, "");
1030 else
1031 res = LLVMBuildMul(builder, a, b, "");
1032 if(shift) {
1033 if(type.sign)
1034 res = LLVMBuildAShr(builder, res, shift, "");
1035 else
1036 res = LLVMBuildLShr(builder, res, shift, "");
1037 }
1038 }
1039
1040 return res;
1041 }
1042
1043 /*
1044 * Widening mul, valid for 32x32 bit -> 64bit only.
1045 * Result is low 32bits, high bits returned in res_hi.
1046 *
1047 * Emits code that is meant to be compiled for the host CPU.
1048 */
1049 LLVMValueRef
1050 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1051 LLVMValueRef a,
1052 LLVMValueRef b,
1053 LLVMValueRef *res_hi)
1054 {
1055 struct gallivm_state *gallivm = bld->gallivm;
1056 LLVMBuilderRef builder = gallivm->builder;
1057
1058 assert(bld->type.width == 32);
1059 assert(bld->type.floating == 0);
1060 assert(bld->type.fixed == 0);
1061 assert(bld->type.norm == 0);
1062
1063 /*
1064 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1065 * for x86 simd is atrocious (even if the high bits weren't required),
1066 * trying to handle real 64bit inputs (which of course can't happen due
1067 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1068 * apparently llvm does not recognize this widening mul). This includes 6
1069 * (instead of 2) pmuludq plus extra adds and shifts.
1070 * The same story applies to signed mul, albeit fixing this requires sse41.
1071 * https://llvm.org/bugs/show_bug.cgi?id=30845
1072 * So, whip up our own code, albeit only for length 4 and 8 (which
1073 * should be good enough)...
1074 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1075 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1076 * for signed), which the fallback code does not, without this llvm
1077 * will likely still produce atrocious code.
1078 */
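/* Lane scheme used below (illustrative, length 4): pmuludq/pmuldq multiply
 * the even lanes {0, 2}; shuffling lanes {1, 3} into the even positions and
 * multiplying again yields the odd products, and the final shuffles
 * interleave the low and high 32-bit halves back into source order.
 */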
1079 if (LLVM_VERSION_MAJOR < 7 &&
1080 (bld->type.length == 4 || bld->type.length == 8) &&
1081 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1082 util_cpu_caps.has_sse4_1)) {
1083 const char *intrinsic = NULL;
1084 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1085 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1086 struct lp_type type_wide = lp_wider_type(bld->type);
1087 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1088 unsigned i;
1089 for (i = 0; i < bld->type.length; i += 2) {
1090 shuf[i] = lp_build_const_int32(gallivm, i+1);
1091 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1092 }
1093 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1094 aeven = a;
1095 beven = b;
1096 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1097 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1098
1099 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1100 if (bld->type.sign) {
1101 intrinsic = "llvm.x86.avx2.pmul.dq";
1102 } else {
1103 intrinsic = "llvm.x86.avx2.pmulu.dq";
1104 }
1105 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1106 wider_type, aeven, beven);
1107 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1108 wider_type, aodd, bodd);
1109 }
1110 else {
1111 /* for consistent naming look elsewhere... */
1112 if (bld->type.sign) {
1113 intrinsic = "llvm.x86.sse41.pmuldq";
1114 } else {
1115 intrinsic = "llvm.x86.sse2.pmulu.dq";
1116 }
1117 /*
1118 * XXX If we only have AVX but not AVX2 this is a pain.
1119 * lp_build_intrinsic_binary_anylength() can't handle it
1120 * (due to src and dst type not being identical).
1121 */
1122 if (bld->type.length == 8) {
1123 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1124 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1125 LLVMValueRef muleven2[2], mulodd2[2];
1126 struct lp_type type_wide_half = type_wide;
1127 LLVMTypeRef wtype_half;
1128 type_wide_half.length = 2;
1129 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1130 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1131 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1132 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1133 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1134 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1135 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1136 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1137 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1138 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1139 wtype_half, aevenlo, bevenlo);
1140 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1141 wtype_half, aoddlo, boddlo);
1142 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1143 wtype_half, aevenhi, bevenhi);
1144 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1145 wtype_half, aoddhi, boddhi);
1146 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1147 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1148
1149 }
1150 else {
1151 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1152 wider_type, aeven, beven);
1153 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1154 wider_type, aodd, bodd);
1155 }
1156 }
1157 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1158 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1159
1160 for (i = 0; i < bld->type.length; i += 2) {
1161 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1162 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1163 }
1164 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1165 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1166
1167 for (i = 0; i < bld->type.length; i += 2) {
1168 shuf[i] = lp_build_const_int32(gallivm, i);
1169 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1170 }
1171 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1172 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1173 }
1174 else {
1175 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1176 }
1177 }
1178
1179
1180 /*
1181 * Widening mul, valid for element widths of 32 bits or less (8, 16, 32).
1182 * Result is low N bits, high bits returned in res_hi.
1183 *
1184 * Emits generic code.
1185 */
1186 LLVMValueRef
1187 lp_build_mul_32_lohi(struct lp_build_context *bld,
1188 LLVMValueRef a,
1189 LLVMValueRef b,
1190 LLVMValueRef *res_hi)
1191 {
1192 struct gallivm_state *gallivm = bld->gallivm;
1193 LLVMBuilderRef builder = gallivm->builder;
1194 LLVMValueRef tmp, shift, res_lo;
1195 struct lp_type type_tmp;
1196 LLVMTypeRef wide_type, narrow_type;
1197
1198 type_tmp = bld->type;
1199 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1200 if (bld->type.width < 32)
1201 type_tmp.width = 32;
1202 else
1203 type_tmp.width *= 2;
1204 wide_type = lp_build_vec_type(gallivm, type_tmp);
1205 shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);
1206
1207 if (bld->type.sign) {
1208 a = LLVMBuildSExt(builder, a, wide_type, "");
1209 b = LLVMBuildSExt(builder, b, wide_type, "");
1210 } else {
1211 a = LLVMBuildZExt(builder, a, wide_type, "");
1212 b = LLVMBuildZExt(builder, b, wide_type, "");
1213 }
1214 tmp = LLVMBuildMul(builder, a, b, "");
1215
1216 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1217
1218 /* Since we truncate anyway, LShr and AShr are equivalent. */
1219 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1220 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1221
1222 return res_lo;
1223 }
1224
1225
1226 /* a * b + c */
1227 LLVMValueRef
1228 lp_build_mad(struct lp_build_context *bld,
1229 LLVMValueRef a,
1230 LLVMValueRef b,
1231 LLVMValueRef c)
1232 {
1233 const struct lp_type type = bld->type;
1234 if (type.floating) {
1235 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1236 } else {
1237 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1238 }
1239 }
1240
1241
1242 /**
1243 * Small vector x scale multiplication optimization.
1244 */
1245 LLVMValueRef
1246 lp_build_mul_imm(struct lp_build_context *bld,
1247 LLVMValueRef a,
1248 int b)
1249 {
1250 LLVMBuilderRef builder = bld->gallivm->builder;
1251 LLVMValueRef factor;
1252
1253 assert(lp_check_value(bld->type, a));
1254
1255 if(b == 0)
1256 return bld->zero;
1257
1258 if(b == 1)
1259 return a;
1260
1261 if(b == -1)
1262 return lp_build_negate(bld, a);
1263
1264 if(b == 2 && bld->type.floating)
1265 return lp_build_add(bld, a, a);
1266
1267 if(util_is_power_of_two_or_zero(b)) {
1268 unsigned shift = ffs(b) - 1;
1269
1270 if(bld->type.floating) {
1271 #if 0
1272 /*
1273 * Power of two multiplication by directly manipulating the exponent.
1274 *
1275 * XXX: This might not be always faster, it will introduce a small error
1276 * for multiplication by zero, and it will produce wrong results
1277 * for Inf and NaN.
1278 */
1279 unsigned mantissa = lp_mantissa(bld->type);
1280 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1281 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1282 a = LLVMBuildAdd(builder, a, factor, "");
1283 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1284 return a;
1285 #endif
1286 }
1287 else {
1288 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1289 return LLVMBuildShl(builder, a, factor, "");
1290 }
1291 }
1292
1293 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1294 return lp_build_mul(bld, a, factor);
1295 }
1296
1297
1298 /**
1299 * Generate a / b
1300 */
1301 LLVMValueRef
1302 lp_build_div(struct lp_build_context *bld,
1303 LLVMValueRef a,
1304 LLVMValueRef b)
1305 {
1306 LLVMBuilderRef builder = bld->gallivm->builder;
1307 const struct lp_type type = bld->type;
1308
1309 assert(lp_check_value(type, a));
1310 assert(lp_check_value(type, b));
1311
1312 if(a == bld->zero)
1313 return bld->zero;
1314 if(a == bld->one && type.floating)
1315 return lp_build_rcp(bld, b);
1316 if(b == bld->zero)
1317 return bld->undef;
1318 if(b == bld->one)
1319 return a;
1320 if(a == bld->undef || b == bld->undef)
1321 return bld->undef;
1322
1323 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1324 if (type.floating)
1325 return LLVMConstFDiv(a, b);
1326 else if (type.sign)
1327 return LLVMConstSDiv(a, b);
1328 else
1329 return LLVMConstUDiv(a, b);
1330 }
1331
1332 /* fast rcp is disabled (it just uses div), so it makes no sense to try that */
1333 if(FALSE &&
1334 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1335 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1336 type.floating)
1337 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1338
1339 if (type.floating)
1340 return LLVMBuildFDiv(builder, a, b, "");
1341 else if (type.sign)
1342 return LLVMBuildSDiv(builder, a, b, "");
1343 else
1344 return LLVMBuildUDiv(builder, a, b, "");
1345 }
1346
1347
1348 /**
1349 * Linear interpolation helper.
1350 *
1351 * @param flags  LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1352 *               interpolating normalized values encoded in integers twice as wide.
1353 *
1354 * @sa http://www.stereopsis.com/doubleblend.html
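 *
 * In all cases the value computed is equivalent to v0 + x * (v1 - v0), with
 * the normalized-integer paths approximating the division by 2**n - 1 as in
 * lp_build_mul_norm().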
1355 */
1356 static inline LLVMValueRef
1357 lp_build_lerp_simple(struct lp_build_context *bld,
1358 LLVMValueRef x,
1359 LLVMValueRef v0,
1360 LLVMValueRef v1,
1361 unsigned flags)
1362 {
1363 unsigned half_width = bld->type.width/2;
1364 LLVMBuilderRef builder = bld->gallivm->builder;
1365 LLVMValueRef delta;
1366 LLVMValueRef res;
1367
1368 assert(lp_check_value(bld->type, x));
1369 assert(lp_check_value(bld->type, v0));
1370 assert(lp_check_value(bld->type, v1));
1371
1372 delta = lp_build_sub(bld, v1, v0);
1373
1374 if (bld->type.floating) {
1375 assert(flags == 0);
1376 return lp_build_mad(bld, x, delta, v0);
1377 }
1378
1379 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1380 if (!bld->type.sign) {
1381 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1382 /*
1383 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1384 * most significant bit to the least significant bit, so that
1385 * later we can just divide by 2**n instead of 2**n - 1.
1386 */
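/* Example (n = 8, illustrative): x = 255 becomes 255 + (255 >> 7) = 256
 * and x = 128 becomes 128 + 1 = 129, so (x * delta) >> 8 closely
 * approximates x * delta / 255 without a divide.
 */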
1387
1388 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1389 }
1390
1391 /* (x * delta) >> n */
1392 res = lp_build_mul(bld, x, delta);
1393 res = lp_build_shr_imm(bld, res, half_width);
1394 } else {
1395 /*
1396 * The rescaling trick above doesn't work for signed numbers, so
1397 * use the 2**n - 1 division approximation in lp_build_mul_norm
1398 * instead.
1399 */
1400 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1401 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1402 }
1403 } else {
1404 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1405 res = lp_build_mul(bld, x, delta);
1406 }
1407
1408 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1409 /*
1410 * At this point both res and v0 only use the lower half of the bits,
1411 * the rest is zero. Instead of add / mask, do add with half wide type.
1412 */
1413 struct lp_type narrow_type;
1414 struct lp_build_context narrow_bld;
1415
1416 memset(&narrow_type, 0, sizeof narrow_type);
1417 narrow_type.sign = bld->type.sign;
1418 narrow_type.width = bld->type.width/2;
1419 narrow_type.length = bld->type.length*2;
1420
1421 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1422 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1423 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1424 res = lp_build_add(&narrow_bld, v0, res);
1425 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1426 } else {
1427 res = lp_build_add(bld, v0, res);
1428
1429 if (bld->type.fixed) {
1430 /*
1431 * We need to mask out the high order bits when lerping 8-bit
1432 * normalized colors stored in 16 bits.
1433 */
1434 /* XXX: This step is necessary for lerping 8-bit colors stored in
1435 * 16 bits, but it will be wrong for true fixed point use cases.
1436 * Basically we need a more powerful lp_type, capable of further
1437 * distinguishing the values interpretation from the value storage.
1438 */
1439 LLVMValueRef low_bits;
1440 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1441 res = LLVMBuildAnd(builder, res, low_bits, "");
1442 }
1443 }
1444
1445 return res;
1446 }
1447
1448
1449 /**
1450 * Linear interpolation.
1451 */
1452 LLVMValueRef
1453 lp_build_lerp(struct lp_build_context *bld,
1454 LLVMValueRef x,
1455 LLVMValueRef v0,
1456 LLVMValueRef v1,
1457 unsigned flags)
1458 {
1459 const struct lp_type type = bld->type;
1460 LLVMValueRef res;
1461
1462 assert(lp_check_value(type, x));
1463 assert(lp_check_value(type, v0));
1464 assert(lp_check_value(type, v1));
1465
1466 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1467
1468 if (type.norm) {
1469 struct lp_type wide_type;
1470 struct lp_build_context wide_bld;
1471 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1472
1473 assert(type.length >= 2);
1474
1475 /*
1476 * Create a wider integer type, enough to hold the
1477 * intermediate result of the multiplication.
1478 */
1479 memset(&wide_type, 0, sizeof wide_type);
1480 wide_type.sign = type.sign;
1481 wide_type.width = type.width*2;
1482 wide_type.length = type.length/2;
1483
1484 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1485
1486 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1487 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1488 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1489
1490 /*
1491 * Lerp both halves.
1492 */
1493
1494 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1495
1496 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1497 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1498
1499 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1500 } else {
1501 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1502 }
1503
1504 return res;
1505 }
1506
1507
1508 /**
1509 * Bilinear interpolation.
1510 *
1511 * Value indices are in v_{yx}.
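 *
 * For example v01 is the value at x = 1, y = 0; the result is
 * lerp(y, lerp(x, v00, v01), lerp(x, v10, v11)).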
1512 */
1513 LLVMValueRef
1514 lp_build_lerp_2d(struct lp_build_context *bld,
1515 LLVMValueRef x,
1516 LLVMValueRef y,
1517 LLVMValueRef v00,
1518 LLVMValueRef v01,
1519 LLVMValueRef v10,
1520 LLVMValueRef v11,
1521 unsigned flags)
1522 {
1523 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1524 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1525 return lp_build_lerp(bld, y, v0, v1, flags);
1526 }
1527
1528
1529 LLVMValueRef
1530 lp_build_lerp_3d(struct lp_build_context *bld,
1531 LLVMValueRef x,
1532 LLVMValueRef y,
1533 LLVMValueRef z,
1534 LLVMValueRef v000,
1535 LLVMValueRef v001,
1536 LLVMValueRef v010,
1537 LLVMValueRef v011,
1538 LLVMValueRef v100,
1539 LLVMValueRef v101,
1540 LLVMValueRef v110,
1541 LLVMValueRef v111,
1542 unsigned flags)
1543 {
1544 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1545 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1546 return lp_build_lerp(bld, z, v0, v1, flags);
1547 }
1548
1549
1550 /**
1551 * Generate min(a, b)
1552 * Do checks for special cases, but not for NaNs.
1553 */
1554 LLVMValueRef
1555 lp_build_min(struct lp_build_context *bld,
1556 LLVMValueRef a,
1557 LLVMValueRef b)
1558 {
1559 assert(lp_check_value(bld->type, a));
1560 assert(lp_check_value(bld->type, b));
1561
1562 if(a == bld->undef || b == bld->undef)
1563 return bld->undef;
1564
1565 if(a == b)
1566 return a;
1567
1568 if (bld->type.norm) {
1569 if (!bld->type.sign) {
1570 if (a == bld->zero || b == bld->zero) {
1571 return bld->zero;
1572 }
1573 }
1574 if(a == bld->one)
1575 return b;
1576 if(b == bld->one)
1577 return a;
1578 }
1579
1580 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1581 }
1582
1583
1584 /**
1585 * Generate min(a, b)
1586 * NaNs are handled according to the behavior specified by the
1587 * nan_behavior argument.
1588 */
1589 LLVMValueRef
1590 lp_build_min_ext(struct lp_build_context *bld,
1591 LLVMValueRef a,
1592 LLVMValueRef b,
1593 enum gallivm_nan_behavior nan_behavior)
1594 {
1595 assert(lp_check_value(bld->type, a));
1596 assert(lp_check_value(bld->type, b));
1597
1598 if(a == bld->undef || b == bld->undef)
1599 return bld->undef;
1600
1601 if(a == b)
1602 return a;
1603
1604 if (bld->type.norm) {
1605 if (!bld->type.sign) {
1606 if (a == bld->zero || b == bld->zero) {
1607 return bld->zero;
1608 }
1609 }
1610 if(a == bld->one)
1611 return b;
1612 if(b == bld->one)
1613 return a;
1614 }
1615
1616 return lp_build_min_simple(bld, a, b, nan_behavior);
1617 }
1618
1619 /**
1620 * Generate max(a, b)
1621 * Do checks for special cases, but NaN behavior is undefined.
1622 */
1623 LLVMValueRef
1624 lp_build_max(struct lp_build_context *bld,
1625 LLVMValueRef a,
1626 LLVMValueRef b)
1627 {
1628 assert(lp_check_value(bld->type, a));
1629 assert(lp_check_value(bld->type, b));
1630
1631 if(a == bld->undef || b == bld->undef)
1632 return bld->undef;
1633
1634 if(a == b)
1635 return a;
1636
1637 if(bld->type.norm) {
1638 if(a == bld->one || b == bld->one)
1639 return bld->one;
1640 if (!bld->type.sign) {
1641 if (a == bld->zero) {
1642 return b;
1643 }
1644 if (b == bld->zero) {
1645 return a;
1646 }
1647 }
1648 }
1649
1650 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1651 }
1652
1653
1654 /**
1655 * Generate max(a, b)
1656 * Checks for special cases.
1657 * NaNs are handled according to the behavior specified by the
1658 * nan_behavior argument.
1659 */
1660 LLVMValueRef
1661 lp_build_max_ext(struct lp_build_context *bld,
1662 LLVMValueRef a,
1663 LLVMValueRef b,
1664 enum gallivm_nan_behavior nan_behavior)
1665 {
1666 assert(lp_check_value(bld->type, a));
1667 assert(lp_check_value(bld->type, b));
1668
1669 if(a == bld->undef || b == bld->undef)
1670 return bld->undef;
1671
1672 if(a == b)
1673 return a;
1674
1675 if(bld->type.norm) {
1676 if(a == bld->one || b == bld->one)
1677 return bld->one;
1678 if (!bld->type.sign) {
1679 if (a == bld->zero) {
1680 return b;
1681 }
1682 if (b == bld->zero) {
1683 return a;
1684 }
1685 }
1686 }
1687
1688 return lp_build_max_simple(bld, a, b, nan_behavior);
1689 }
1690
1691 /**
1692 * Generate clamp(a, min, max)
1693 * NaN behavior (for any of a, min, max) is undefined.
1694 * Do checks for special cases.
1695 */
1696 LLVMValueRef
1697 lp_build_clamp(struct lp_build_context *bld,
1698 LLVMValueRef a,
1699 LLVMValueRef min,
1700 LLVMValueRef max)
1701 {
1702 assert(lp_check_value(bld->type, a));
1703 assert(lp_check_value(bld->type, min));
1704 assert(lp_check_value(bld->type, max));
1705
1706 a = lp_build_min(bld, a, max);
1707 a = lp_build_max(bld, a, min);
1708 return a;
1709 }
1710
1711
1712 /**
1713 * Generate clamp(a, 0, 1)
1714 * A NaN will get converted to zero.
1715 */
1716 LLVMValueRef
1717 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1718 LLVMValueRef a)
1719 {
1720 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1721 a = lp_build_min(bld, a, bld->one);
1722 return a;
1723 }
1724
1725
1726 /**
1727 * Generate abs(a)
1728 */
1729 LLVMValueRef
1730 lp_build_abs(struct lp_build_context *bld,
1731 LLVMValueRef a)
1732 {
1733 LLVMBuilderRef builder = bld->gallivm->builder;
1734 const struct lp_type type = bld->type;
1735 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1736
1737 assert(lp_check_value(type, a));
1738
1739 if(!type.sign)
1740 return a;
1741
1742 if(type.floating) {
1743 char intrinsic[32];
1744 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1745 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1746 }
1747
1748 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1749 switch(type.width) {
1750 case 8:
1751 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1752 case 16:
1753 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1754 case 32:
1755 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1756 }
1757 }
1758 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
1759 switch(type.width) {
1760 case 8:
1761 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1762 case 16:
1763 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1764 case 32:
1765 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1766 }
1767 }
1768
1769 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1770 a, LLVMBuildNeg(builder, a, ""));
1771 }
1772
1773
1774 LLVMValueRef
1775 lp_build_negate(struct lp_build_context *bld,
1776 LLVMValueRef a)
1777 {
1778 LLVMBuilderRef builder = bld->gallivm->builder;
1779
1780 assert(lp_check_value(bld->type, a));
1781
1782 if (bld->type.floating)
1783 a = LLVMBuildFNeg(builder, a, "");
1784 else
1785 a = LLVMBuildNeg(builder, a, "");
1786
1787 return a;
1788 }
1789
1790
1791 /** Return -1, 0 or +1 depending on the sign of a */
1792 LLVMValueRef
1793 lp_build_sgn(struct lp_build_context *bld,
1794 LLVMValueRef a)
1795 {
1796 LLVMBuilderRef builder = bld->gallivm->builder;
1797 const struct lp_type type = bld->type;
1798 LLVMValueRef cond;
1799 LLVMValueRef res;
1800
1801 assert(lp_check_value(type, a));
1802
1803 /* Handle non-zero case */
1804 if(!type.sign) {
1805 /* if not zero then sign must be positive */
1806 res = bld->one;
1807 }
1808 else if(type.floating) {
1809 LLVMTypeRef vec_type;
1810 LLVMTypeRef int_type;
1811 LLVMValueRef mask;
1812 LLVMValueRef sign;
1813 LLVMValueRef one;
1814 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1815
1816 int_type = lp_build_int_vec_type(bld->gallivm, type);
1817 vec_type = lp_build_vec_type(bld->gallivm, type);
1818 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1819
1820 /* Take the sign bit and OR it into the constant 1.0 */
1821 sign = LLVMBuildBitCast(builder, a, int_type, "");
1822 sign = LLVMBuildAnd(builder, sign, mask, "");
1823 one = LLVMConstBitCast(bld->one, int_type);
1824 res = LLVMBuildOr(builder, sign, one, "");
1825 res = LLVMBuildBitCast(builder, res, vec_type, "");
1826 }
1827 else
1828 {
1829 /* signed int/norm/fixed point */
1830 /* could use psign with sse3 and appropriate vectors here */
1831 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1832 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1833 res = lp_build_select(bld, cond, bld->one, minus_one);
1834 }
1835
1836 /* Handle zero */
1837 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1838 res = lp_build_select(bld, cond, bld->zero, res);
1839
1840 return res;
1841 }
1842
1843
1844 /**
1845 * Set the sign of float vector 'a' according to 'sign'.
1846 * If sign==0, return abs(a).
1847 * If sign==1, return -abs(a);
1848 * Other values for sign produce undefined results.
1849 */
1850 LLVMValueRef
1851 lp_build_set_sign(struct lp_build_context *bld,
1852 LLVMValueRef a, LLVMValueRef sign)
1853 {
1854 LLVMBuilderRef builder = bld->gallivm->builder;
1855 const struct lp_type type = bld->type;
1856 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1857 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1858 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1859 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1860 ~((unsigned long long) 1 << (type.width - 1)));
1861 LLVMValueRef val, res;
1862
1863 assert(type.floating);
1864 assert(lp_check_value(type, a));
1865
1866 /* val = reinterpret_cast<int>(a) */
1867 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1868 /* val = val & mask */
1869 val = LLVMBuildAnd(builder, val, mask, "");
1870 /* sign = sign << shift */
1871 sign = LLVMBuildShl(builder, sign, shift, "");
1872 /* res = val | sign */
1873 res = LLVMBuildOr(builder, val, sign, "");
1874 /* res = reinterpret_cast<float>(res) */
1875 res = LLVMBuildBitCast(builder, res, vec_type, "");
1876
1877 return res;
1878 }
1879
1880
1881 /**
1882 * Convert vector of (or scalar) int to vector of (or scalar) float.
1883 */
1884 LLVMValueRef
1885 lp_build_int_to_float(struct lp_build_context *bld,
1886 LLVMValueRef a)
1887 {
1888 LLVMBuilderRef builder = bld->gallivm->builder;
1889 const struct lp_type type = bld->type;
1890 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1891
1892 assert(type.floating);
1893
1894 return LLVMBuildSIToFP(builder, a, vec_type, "");
1895 }
1896
1897 static boolean
1898 arch_rounding_available(const struct lp_type type)
1899 {
1900 if ((util_cpu_caps.has_sse4_1 &&
1901 (type.length == 1 || type.width*type.length == 128)) ||
1902 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
1903 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
1904 return TRUE;
1905 else if ((util_cpu_caps.has_altivec &&
1906 (type.width == 32 && type.length == 4)))
1907 return TRUE;
1908 else if (util_cpu_caps.has_neon)
1909 return TRUE;
1910
1911 return FALSE;
1912 }
1913
1914 enum lp_build_round_mode
1915 {
1916 LP_BUILD_ROUND_NEAREST = 0,
1917 LP_BUILD_ROUND_FLOOR = 1,
1918 LP_BUILD_ROUND_CEIL = 2,
1919 LP_BUILD_ROUND_TRUNCATE = 3
1920 };
1921
1922 static inline LLVMValueRef
1923 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1924 LLVMValueRef a)
1925 {
1926 LLVMBuilderRef builder = bld->gallivm->builder;
1927 const struct lp_type type = bld->type;
1928 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1929 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1930 const char *intrinsic;
1931 LLVMValueRef res;
1932
1933 assert(type.floating);
1934 /* using the double precision conversions is a bit more complicated */
1935 assert(type.width == 32);
1936
1937 assert(lp_check_value(type, a));
1938 assert(util_cpu_caps.has_sse2);
1939
1940 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1941 if (type.length == 1) {
1942 LLVMTypeRef vec_type;
1943 LLVMValueRef undef;
1944 LLVMValueRef arg;
1945 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1946
1947 vec_type = LLVMVectorType(bld->elem_type, 4);
1948
1949 intrinsic = "llvm.x86.sse.cvtss2si";
1950
1951 undef = LLVMGetUndef(vec_type);
1952
1953 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1954
1955 res = lp_build_intrinsic_unary(builder, intrinsic,
1956 ret_type, arg);
1957 }
1958 else {
1959 if (type.width* type.length == 128) {
1960 intrinsic = "llvm.x86.sse2.cvtps2dq";
1961 }
1962 else {
1963 assert(type.width*type.length == 256);
1964 assert(util_cpu_caps.has_avx);
1965
1966 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1967 }
1968 res = lp_build_intrinsic_unary(builder, intrinsic,
1969 ret_type, a);
1970 }
1971
1972 return res;
1973 }
1974
1975
1976 /*
1977  * Round 'a' according to 'mode' using AltiVec rounding intrinsics. */
1978 static inline LLVMValueRef
1979 lp_build_round_altivec(struct lp_build_context *bld,
1980 LLVMValueRef a,
1981 enum lp_build_round_mode mode)
1982 {
1983 LLVMBuilderRef builder = bld->gallivm->builder;
1984 const struct lp_type type = bld->type;
1985 const char *intrinsic = NULL;
1986
1987 assert(type.floating);
1988
1989 assert(lp_check_value(type, a));
1990 assert(util_cpu_caps.has_altivec);
1991
1992 (void)type;
1993
1994 switch (mode) {
1995 case LP_BUILD_ROUND_NEAREST:
1996 intrinsic = "llvm.ppc.altivec.vrfin";
1997 break;
1998 case LP_BUILD_ROUND_FLOOR:
1999 intrinsic = "llvm.ppc.altivec.vrfim";
2000 break;
2001 case LP_BUILD_ROUND_CEIL:
2002 intrinsic = "llvm.ppc.altivec.vrfip";
2003 break;
2004 case LP_BUILD_ROUND_TRUNCATE:
2005 intrinsic = "llvm.ppc.altivec.vrfiz";
2006 break;
2007 }
2008
2009 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2010 }
2011
2012 static inline LLVMValueRef
2013 lp_build_round_arch(struct lp_build_context *bld,
2014 LLVMValueRef a,
2015 enum lp_build_round_mode mode)
2016 {
2017 if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2018 LLVMBuilderRef builder = bld->gallivm->builder;
2019 const struct lp_type type = bld->type;
2020 const char *intrinsic_root;
2021 char intrinsic[32];
2022
2023 assert(type.floating);
2024 assert(lp_check_value(type, a));
2025 (void)type;
2026
2027 switch (mode) {
2028 case LP_BUILD_ROUND_NEAREST:
2029 intrinsic_root = "llvm.nearbyint";
2030 break;
2031 case LP_BUILD_ROUND_FLOOR:
2032 intrinsic_root = "llvm.floor";
2033 break;
2034 case LP_BUILD_ROUND_CEIL:
2035 intrinsic_root = "llvm.ceil";
2036 break;
2037 case LP_BUILD_ROUND_TRUNCATE:
2038 intrinsic_root = "llvm.trunc";
2039 break;
2040 }
2041
2042 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2043 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2044 }
2045 else /* (util_cpu_caps.has_altivec) */
2046 return lp_build_round_altivec(bld, a, mode);
2047 }
2048
2049 /**
2050 * Return the integer part of a float (vector) value (== round toward zero).
2051 * The returned value is a float (vector).
2052 * Ex: trunc(-1.5) = -1.0
2053 */
2054 LLVMValueRef
2055 lp_build_trunc(struct lp_build_context *bld,
2056 LLVMValueRef a)
2057 {
2058 LLVMBuilderRef builder = bld->gallivm->builder;
2059 const struct lp_type type = bld->type;
2060
2061 assert(type.floating);
2062 assert(lp_check_value(type, a));
2063
2064 if (arch_rounding_available(type)) {
2065 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2066 }
2067 else {
2068 const struct lp_type type = bld->type;
2069 struct lp_type inttype;
2070 struct lp_build_context intbld;
2071 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2072 LLVMValueRef trunc, res, anosign, mask;
2073 LLVMTypeRef int_vec_type = bld->int_vec_type;
2074 LLVMTypeRef vec_type = bld->vec_type;
2075
2076 inttype = type;
2077 inttype.floating = 0;
2078 lp_build_context_init(&intbld, bld->gallivm, inttype);
2079
2080 /* round by truncation */
2081 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2082 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2083
2084 /* mask out sign bit */
2085 anosign = lp_build_abs(bld, a);
2086 /*
2087 * mask out all values if anosign > 2^24
2088 * This should work both for large ints (all rounding is a no-op for them
2089 * because such floats are always exact) as well as special cases like
2090 * NaNs, Infs (taking advantage of the fact they use max exponent).
2091 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2092 */
2093 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2094 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2095 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2096 return lp_build_select(bld, mask, a, res);
2097 }
2098 }
2099
2100
2101 /**
2102 * Return float (vector) rounded to nearest integer (vector). The returned
2103 * value is a float (vector).
2104 * Ex: round(0.9) = 1.0
2105 * Ex: round(-1.5) = -2.0
2106 */
2107 LLVMValueRef
2108 lp_build_round(struct lp_build_context *bld,
2109 LLVMValueRef a)
2110 {
2111 LLVMBuilderRef builder = bld->gallivm->builder;
2112 const struct lp_type type = bld->type;
2113
2114 assert(type.floating);
2115 assert(lp_check_value(type, a));
2116
2117 if (arch_rounding_available(type)) {
2118 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2119 }
2120 else {
2121 const struct lp_type type = bld->type;
2122 struct lp_type inttype;
2123 struct lp_build_context intbld;
2124 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2125 LLVMValueRef res, anosign, mask;
2126 LLVMTypeRef int_vec_type = bld->int_vec_type;
2127 LLVMTypeRef vec_type = bld->vec_type;
2128
2129 inttype = type;
2130 inttype.floating = 0;
2131 lp_build_context_init(&intbld, bld->gallivm, inttype);
2132
2133 res = lp_build_iround(bld, a);
2134 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2135
2136 /* mask out sign bit */
2137 anosign = lp_build_abs(bld, a);
2138 /*
2139 * mask out all values if anosign > 2^24
2140 * This should work both for large ints (all rounding is a no-op for them
2141 * because such floats are always exact) as well as special cases like
2142 * NaNs, Infs (taking advantage of the fact they use max exponent).
2143 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2144 */
2145 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2146 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2147 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2148 return lp_build_select(bld, mask, a, res);
2149 }
2150 }
2151
2152
2153 /**
2154 * Return floor of float (vector), result is a float (vector)
2155 * Ex: floor(1.1) = 1.0
2156 * Ex: floor(-1.1) = -2.0
2157 */
2158 LLVMValueRef
2159 lp_build_floor(struct lp_build_context *bld,
2160 LLVMValueRef a)
2161 {
2162 LLVMBuilderRef builder = bld->gallivm->builder;
2163 const struct lp_type type = bld->type;
2164
2165 assert(type.floating);
2166 assert(lp_check_value(type, a));
2167
2168 if (arch_rounding_available(type)) {
2169 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2170 }
2171 else {
2172 const struct lp_type type = bld->type;
2173 struct lp_type inttype;
2174 struct lp_build_context intbld;
2175 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2176 LLVMValueRef trunc, res, anosign, mask;
2177 LLVMTypeRef int_vec_type = bld->int_vec_type;
2178 LLVMTypeRef vec_type = bld->vec_type;
2179
2180 if (type.width != 32) {
2181 char intrinsic[32];
2182 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2183 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2184 }
2185
2186 assert(type.width == 32); /* might want to handle doubles at some point */
2187
2188 inttype = type;
2189 inttype.floating = 0;
2190 lp_build_context_init(&intbld, bld->gallivm, inttype);
2191
2192 /* round by truncation */
2193 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2194 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2195
2196 if (type.sign) {
2197 LLVMValueRef tmp;
2198
2199 /*
2200 * fix values if rounding is wrong (for non-special cases)
2201 * - this is the case if trunc > a
2202 */
2203 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2204 /* tmp = trunc > a ? 1.0 : 0.0 */
2205 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2206 tmp = lp_build_and(&intbld, mask, tmp);
2207 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2208 res = lp_build_sub(bld, res, tmp);
2209 }
2210
2211 /* mask out sign bit */
2212 anosign = lp_build_abs(bld, a);
2213 /*
2214 * mask out all values if anosign > 2^24
2215 * This should work both for large ints (all rounding is a no-op for them
2216 * because such floats are always exact) as well as special cases like
2217 * NaNs, Infs (taking advantage of the fact they use max exponent).
2218 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2219 */
2220 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2221 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2222 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2223 return lp_build_select(bld, mask, a, res);
2224 }
2225 }
2226
2227
2228 /**
2229 * Return ceiling of float (vector), returning float (vector).
2230 * Ex: ceil( 1.1) = 2.0
2231 * Ex: ceil(-1.1) = -1.0
2232 */
2233 LLVMValueRef
2234 lp_build_ceil(struct lp_build_context *bld,
2235 LLVMValueRef a)
2236 {
2237 LLVMBuilderRef builder = bld->gallivm->builder;
2238 const struct lp_type type = bld->type;
2239
2240 assert(type.floating);
2241 assert(lp_check_value(type, a));
2242
2243 if (arch_rounding_available(type)) {
2244 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2245 }
2246 else {
2247 const struct lp_type type = bld->type;
2248 struct lp_type inttype;
2249 struct lp_build_context intbld;
2250 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2251 LLVMValueRef trunc, res, anosign, mask, tmp;
2252 LLVMTypeRef int_vec_type = bld->int_vec_type;
2253 LLVMTypeRef vec_type = bld->vec_type;
2254
2255 if (type.width != 32) {
2256 char intrinsic[32];
2257 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2258 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2259 }
2260
2261 assert(type.width == 32); /* might want to handle doubles at some point */
2262
2263 inttype = type;
2264 inttype.floating = 0;
2265 lp_build_context_init(&intbld, bld->gallivm, inttype);
2266
2267 /* round by truncation */
2268 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2269 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2270
2271 /*
2272 * fix values if rounding is wrong (for non-special cases)
2273 * - this is the case if trunc < a
2274 */
2275 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2276 /* tmp = trunc < a ? 1.0 : 0.0 */
2277 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2278 tmp = lp_build_and(&intbld, mask, tmp);
2279 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2280 res = lp_build_add(bld, trunc, tmp);
2281
2282 /* mask out sign bit */
2283 anosign = lp_build_abs(bld, a);
2284 /*
2285 * mask out all values if anosign > 2^24
2286 * This should work both for large ints (all rounding is a no-op for them
2287 * because such floats are always exact) as well as special cases like
2288 * NaNs, Infs (taking advantage of the fact they use max exponent).
2289 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2290 */
2291 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2292 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2293 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2294 return lp_build_select(bld, mask, a, res);
2295 }
2296 }
2297
2298
2299 /**
2300 * Return fractional part of 'a' computed as a - floor(a)
2301 * Typically used in texture coord arithmetic.
2302 */
2303 LLVMValueRef
2304 lp_build_fract(struct lp_build_context *bld,
2305 LLVMValueRef a)
2306 {
2307 assert(bld->type.floating);
2308 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2309 }
2310
2311
2312 /**
2313 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2314 * against 0.99999(9). (Will also return that value for NaNs.)
2315 */
2316 static inline LLVMValueRef
2317 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2318 {
2319 LLVMValueRef max;
2320
2321 /* this is the largest number smaller than 1.0 representable as float */
2322 max = lp_build_const_vec(bld->gallivm, bld->type,
2323 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2324 return lp_build_min_ext(bld, fract, max,
2325 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2326 }
2327
2328
2329 /**
2330 * Same as lp_build_fract, but guarantees that the result is always smaller
2331 * than one. Will also return the smaller-than-one value for infs, NaNs.
2332 */
2333 LLVMValueRef
2334 lp_build_fract_safe(struct lp_build_context *bld,
2335 LLVMValueRef a)
2336 {
2337 return clamp_fract(bld, lp_build_fract(bld, a));
2338 }
2339
2340
2341 /**
2342 * Return the integer part of a float (vector) value (== round toward zero).
2343 * The returned value is an integer (vector).
2344 * Ex: itrunc(-1.5) = -1
2345 */
2346 LLVMValueRef
2347 lp_build_itrunc(struct lp_build_context *bld,
2348 LLVMValueRef a)
2349 {
2350 LLVMBuilderRef builder = bld->gallivm->builder;
2351 const struct lp_type type = bld->type;
2352 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2353
2354 assert(type.floating);
2355 assert(lp_check_value(type, a));
2356
2357 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2358 }
2359
2360
2361 /**
2362 * Return float (vector) rounded to nearest integer (vector). The returned
2363 * value is an integer (vector).
2364 * Ex: iround(0.9) = 1
2365 * Ex: iround(-1.5) = -2
2366 */
2367 LLVMValueRef
2368 lp_build_iround(struct lp_build_context *bld,
2369 LLVMValueRef a)
2370 {
2371 LLVMBuilderRef builder = bld->gallivm->builder;
2372 const struct lp_type type = bld->type;
2373 LLVMTypeRef int_vec_type = bld->int_vec_type;
2374 LLVMValueRef res;
2375
2376 assert(type.floating);
2377
2378 assert(lp_check_value(type, a));
2379
2380 if ((util_cpu_caps.has_sse2 &&
2381 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2382 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2383 return lp_build_iround_nearest_sse2(bld, a);
2384 }
2385 if (arch_rounding_available(type)) {
2386 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2387 }
2388 else {
2389 LLVMValueRef half;
2390
2391 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2392
2393 if (type.sign) {
2394 LLVMTypeRef vec_type = bld->vec_type;
2395 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2396 (unsigned long long)1 << (type.width - 1));
2397 LLVMValueRef sign;
2398
2399 /* get sign bit */
2400 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2401 sign = LLVMBuildAnd(builder, sign, mask, "");
2402
2403 /* sign * 0.5 */
2404 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2405 half = LLVMBuildOr(builder, sign, half, "");
2406 half = LLVMBuildBitCast(builder, half, vec_type, "");
2407 }
2408
2409 res = LLVMBuildFAdd(builder, a, half, "");
2410 }
2411
2412 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2413
2414 return res;
2415 }
2416
2417
2418 /**
2419 * Return floor of float (vector), result is an int (vector)
2420 * Ex: ifloor(1.1) = 1
2421 * Ex: ifloor(-1.1) = -2
2422 */
2423 LLVMValueRef
2424 lp_build_ifloor(struct lp_build_context *bld,
2425 LLVMValueRef a)
2426 {
2427 LLVMBuilderRef builder = bld->gallivm->builder;
2428 const struct lp_type type = bld->type;
2429 LLVMTypeRef int_vec_type = bld->int_vec_type;
2430 LLVMValueRef res;
2431
2432 assert(type.floating);
2433 assert(lp_check_value(type, a));
2434
2435 res = a;
2436 if (type.sign) {
2437 if (arch_rounding_available(type)) {
2438 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2439 }
2440 else {
2441 struct lp_type inttype;
2442 struct lp_build_context intbld;
2443 LLVMValueRef trunc, itrunc, mask;
2444
2445 assert(type.floating);
2446 assert(lp_check_value(type, a));
2447
2448 inttype = type;
2449 inttype.floating = 0;
2450 lp_build_context_init(&intbld, bld->gallivm, inttype);
2451
2452 /* round by truncation */
2453 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2454 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2455
2456 /*
2457 * fix values if rounding is wrong (for non-special cases)
2458 * - this is the case if trunc > a
2459 * The results of doing this with NaNs, very large values etc.
2460 * are undefined but this seems to be the case anyway.
2461 */
2462 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2463 /* cheapie minus one with mask since the mask is minus one / zero */
2464 return lp_build_add(&intbld, itrunc, mask);
2465 }
2466 }
2467
2468 /* convert to integer (truncation toward zero) */
2469 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2470
2471 return res;
2472 }
2473
2474
2475 /**
2476 * Return ceiling of float (vector), returning int (vector).
2477 * Ex: iceil( 1.1) = 2
2478 * Ex: iceil(-1.1) = -1
2479 */
2480 LLVMValueRef
2481 lp_build_iceil(struct lp_build_context *bld,
2482 LLVMValueRef a)
2483 {
2484 LLVMBuilderRef builder = bld->gallivm->builder;
2485 const struct lp_type type = bld->type;
2486 LLVMTypeRef int_vec_type = bld->int_vec_type;
2487 LLVMValueRef res;
2488
2489 assert(type.floating);
2490 assert(lp_check_value(type, a));
2491
2492 if (arch_rounding_available(type)) {
2493 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2494 }
2495 else {
2496 struct lp_type inttype;
2497 struct lp_build_context intbld;
2498 LLVMValueRef trunc, itrunc, mask;
2499
2500 assert(type.floating);
2501 assert(lp_check_value(type, a));
2502
2503 inttype = type;
2504 inttype.floating = 0;
2505 lp_build_context_init(&intbld, bld->gallivm, inttype);
2506
2507 /* round by truncation */
2508 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2509 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2510
2511 /*
2512 * fix values if rounding is wrong (for non-special cases)
2513 * - this is the case if trunc < a
2514 * The results of doing this with NaNs, very large values etc.
2515 * are undefined but this seems to be the case anyway.
2516 */
2517 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2518 /* cheapie plus one with mask since the mask is minus one / zero */
2519 return lp_build_sub(&intbld, itrunc, mask);
2520 }
2521
2522 /* convert to integer (truncation toward zero) */
2523 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2524
2525 return res;
2526 }
2527
2528
2529 /**
2530 * Combined ifloor() & fract().
2531 *
2532 * Preferred to calling the functions separately, as it will ensure that the
2533 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2534 */
2535 void
2536 lp_build_ifloor_fract(struct lp_build_context *bld,
2537 LLVMValueRef a,
2538 LLVMValueRef *out_ipart,
2539 LLVMValueRef *out_fpart)
2540 {
2541 LLVMBuilderRef builder = bld->gallivm->builder;
2542 const struct lp_type type = bld->type;
2543 LLVMValueRef ipart;
2544
2545 assert(type.floating);
2546 assert(lp_check_value(type, a));
2547
2548 if (arch_rounding_available(type)) {
2549 /*
2550 * floor() is easier.
2551 */
2552
2553 ipart = lp_build_floor(bld, a);
2554 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2555 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2556 }
2557 else {
2558 /*
2559 * ifloor() is easier.
2560 */
2561
2562 *out_ipart = lp_build_ifloor(bld, a);
2563 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2564 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2565 }
2566 }
2567
2568
2569 /**
2570 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2571 * always smaller than one.
2572 */
2573 void
2574 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2575 LLVMValueRef a,
2576 LLVMValueRef *out_ipart,
2577 LLVMValueRef *out_fpart)
2578 {
2579 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2580 *out_fpart = clamp_fract(bld, *out_fpart);
2581 }
2582
2583
2584 LLVMValueRef
2585 lp_build_sqrt(struct lp_build_context *bld,
2586 LLVMValueRef a)
2587 {
2588 LLVMBuilderRef builder = bld->gallivm->builder;
2589 const struct lp_type type = bld->type;
2590 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2591 char intrinsic[32];
2592
2593 assert(lp_check_value(type, a));
2594
2595 assert(type.floating);
2596 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2597
2598 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2599 }
2600
2601
2602 /**
2603 * Do one Newton-Raphson step to improve reciprocal precision:
2604 *
2605 * x_{i+1} = x_i + x_i * (1 - a * x_i)
2606 *
2607 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2608 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2609 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2610 * halo. It would be necessary to clamp the argument to prevent this.
2611 *
2612 * See also:
2613 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2614 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2615 */
2616 static inline LLVMValueRef
2617 lp_build_rcp_refine(struct lp_build_context *bld,
2618 LLVMValueRef a,
2619 LLVMValueRef rcp_a)
2620 {
2621 LLVMBuilderRef builder = bld->gallivm->builder;
2622 LLVMValueRef neg_a;
2623 LLVMValueRef res;
2624
2625 neg_a = LLVMBuildFNeg(builder, a, "");
2626 res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2627 res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2628
2629 return res;
2630 }
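
/*
 * Illustrative scalar equivalent of the refinement step built above (a
 * sketch only, not compiled; the generated IR uses the fused
 * lp_build_fmuladd form). Given an initial estimate x of 1/a:
 *
 *    static float rcp_refine_ref(float a, float x)
 *    {
 *       return x * (2.0f - a * x);   // algebraically x + x * (1 - a * x)
 *    }
 *
 * Each Newton-Raphson step roughly doubles the number of correct bits of
 * the estimate.
 */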
2631
2632
2633 LLVMValueRef
2634 lp_build_rcp(struct lp_build_context *bld,
2635 LLVMValueRef a)
2636 {
2637 LLVMBuilderRef builder = bld->gallivm->builder;
2638 const struct lp_type type = bld->type;
2639
2640 assert(lp_check_value(type, a));
2641
2642 if(a == bld->zero)
2643 return bld->undef;
2644 if(a == bld->one)
2645 return bld->one;
2646 if(a == bld->undef)
2647 return bld->undef;
2648
2649 assert(type.floating);
2650
2651 if(LLVMIsConstant(a))
2652 return LLVMConstFDiv(bld->one, a);
2653
2654 /*
2655 * We don't use RCPPS because:
2656 * - it only has 10 bits of precision
2657 * - it doesn't even get the reciprocal of 1.0 exactly
2658 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2659 * - for recent processors the benefit over DIVPS is marginal, and case
2660 * dependent
2661 *
2662 * We could still use it on certain processors if benchmarks show that the
2663 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2664 * particular uses that require fewer workarounds.
2665 */
2666
2667 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2668 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2669 const unsigned num_iterations = 0;
2670 LLVMValueRef res;
2671 unsigned i;
2672 const char *intrinsic = NULL;
2673
2674 if (type.length == 4) {
2675 intrinsic = "llvm.x86.sse.rcp.ps";
2676 }
2677 else {
2678 intrinsic = "llvm.x86.avx.rcp.ps.256";
2679 }
2680
2681 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2682
2683 for (i = 0; i < num_iterations; ++i) {
2684 res = lp_build_rcp_refine(bld, a, res);
2685 }
2686
2687 return res;
2688 }
2689
2690 return LLVMBuildFDiv(builder, bld->one, a, "");
2691 }
2692
2693
2694 /**
2695 * Do one Newton-Raphson step to improve rsqrt precision:
2696 *
2697 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2698 *
2699 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2700 */
2701 static inline LLVMValueRef
2702 lp_build_rsqrt_refine(struct lp_build_context *bld,
2703 LLVMValueRef a,
2704 LLVMValueRef rsqrt_a)
2705 {
2706 LLVMBuilderRef builder = bld->gallivm->builder;
2707 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2708 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2709 LLVMValueRef res;
2710
2711 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2712 res = LLVMBuildFMul(builder, a, res, "");
2713 res = LLVMBuildFSub(builder, three, res, "");
2714 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2715 res = LLVMBuildFMul(builder, half, res, "");
2716
2717 return res;
2718 }
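
/*
 * Illustrative scalar equivalent of the rsqrt refinement step above (a
 * sketch only, not compiled). Given an initial estimate x of 1/sqrt(a):
 *
 *    static float rsqrt_refine_ref(float a, float x)
 *    {
 *       return 0.5f * x * (3.0f - a * x * x);
 *    }
 *
 * As with the rcp refinement, each step roughly doubles the number of
 * correct mantissa bits, but it turns 0.0 and +Inf inputs into NaN, which
 * is why the fast path in lp_build_rsqrt() patches those cases up with
 * selects.
 */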
2719
2720
2721 /**
2722 * Generate 1/sqrt(a).
2723 * Result is undefined for values < 0, infinity for +0.
2724 */
2725 LLVMValueRef
2726 lp_build_rsqrt(struct lp_build_context *bld,
2727 LLVMValueRef a)
2728 {
2729 const struct lp_type type = bld->type;
2730
2731 assert(lp_check_value(type, a));
2732
2733 assert(type.floating);
2734
2735 /*
2736 * This should be faster but all denormals will end up as infinity.
2737 */
2738 if (0 && lp_build_fast_rsqrt_available(type)) {
2739 const unsigned num_iterations = 1;
2740 LLVMValueRef res;
2741 unsigned i;
2742
2743 /* rsqrt(1.0) != 1.0 here */
2744 res = lp_build_fast_rsqrt(bld, a);
2745
2746 if (num_iterations) {
2747 /*
2748 * Newton-Raphson will result in NaN instead of infinity for zero,
2749 * and NaN instead of zero for infinity.
2750 * Also, need to ensure rsqrt(1.0) == 1.0.
2751 * All numbers smaller than FLT_MIN will result in +infinity
2752 * (rsqrtps treats all denormals as zero).
2753 */
2754 LLVMValueRef cmp;
2755 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2756 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2757
2758 for (i = 0; i < num_iterations; ++i) {
2759 res = lp_build_rsqrt_refine(bld, a, res);
2760 }
2761 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2762 res = lp_build_select(bld, cmp, inf, res);
2763 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2764 res = lp_build_select(bld, cmp, bld->zero, res);
2765 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2766 res = lp_build_select(bld, cmp, bld->one, res);
2767 }
2768
2769 return res;
2770 }
2771
2772 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2773 }
2774
2775 /**
2776 * Return whether a fast (but inaccurate) rsqrt instruction is available.
2777 * Callers may want to avoid rsqrt_fast when it is not available: e.g.
2778 * x^0.5 can be computed as rsqrt_fast(x) * x, but without the instruction
2779 * that would turn into sqrt/div/mul, so it is clearly better to just call
2780 * sqrt and skip both the div and the mul.
2781 */
2782 boolean
2783 lp_build_fast_rsqrt_available(struct lp_type type)
2784 {
2785 assert(type.floating);
2786
2787 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2788 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2789 return true;
2790 }
2791 return false;
2792 }
2793
2794
2795 /**
2796 * Generate 1/sqrt(a).
2797 * Result is undefined for values < 0, infinity for +0.
2798 * Precision is limited, only ~10 bits guaranteed
2799 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2800 */
2801 LLVMValueRef
2802 lp_build_fast_rsqrt(struct lp_build_context *bld,
2803 LLVMValueRef a)
2804 {
2805 LLVMBuilderRef builder = bld->gallivm->builder;
2806 const struct lp_type type = bld->type;
2807
2808 assert(lp_check_value(type, a));
2809
2810 if (lp_build_fast_rsqrt_available(type)) {
2811 const char *intrinsic = NULL;
2812
2813 if (type.length == 4) {
2814 intrinsic = "llvm.x86.sse.rsqrt.ps";
2815 }
2816 else {
2817 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2818 }
2819 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2820 }
2821 else {
2822 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2823 }
2824 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2825 }
2826
2827
2828 /**
2829 * Generate sin(a) or cos(a) using polynomial approximation.
2830 * TODO: it might be worth recognizing sin and cos computed from the same
2831 * source (i.e. the d3d10 sincos opcode). Doing both at the same time would
2832 * be much cheaper than calculating (nearly) everything twice.
2833 * It is not clear whether that is common enough to be worth bothering,
2834 * though the scs opcode could also benefit from calculating both at once.
2835 */
2836 static LLVMValueRef
2837 lp_build_sin_or_cos(struct lp_build_context *bld,
2838 LLVMValueRef a,
2839 boolean cos)
2840 {
2841 struct gallivm_state *gallivm = bld->gallivm;
2842 LLVMBuilderRef b = gallivm->builder;
2843 struct lp_type int_type = lp_int_type(bld->type);
2844
2845 /*
2846 * take the absolute value,
2847 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2848 */
2849
2850 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2851 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2852
2853 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2854 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2855
2856 /*
2857 * scale by 4/Pi
2858 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2859 */
2860
2861 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2862 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2863
2864 /*
2865 * store the integer part of y in mm0
2866 * emm2 = _mm_cvttps_epi32(y);
2867 */
2868
2869 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2870
2871 /*
2872 * j=(j+1) & (~1) (see the cephes sources)
2873 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2874 */
2875
2876 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2877 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2878 /*
2879 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2880 */
2881 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2882 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2883
2884 /*
2885 * y = _mm_cvtepi32_ps(emm2);
2886 */
2887 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2888
2889 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2890 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2891 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2892 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2893
2894 /*
2895 * Argument used for poly selection and sign bit determination
2896 * is different for sin vs. cos.
2897 */
2898 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2899 emm2_and;
2900
2901 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2902 LLVMBuildNot(b, emm2_2, ""), ""),
2903 const_29, "sign_bit") :
2904 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2905 LLVMBuildShl(b, emm2_add,
2906 const_29, ""), ""),
2907 sign_mask, "sign_bit");
2908
2909 /*
2910 * get the polynomial selection mask
2911 * there is one polynomial for 0 <= x <= Pi/4
2912 * and another one for Pi/4 < x <= Pi/2
2913 * Both branches will be computed.
2914 *
2915 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2916 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2917 */
2918
2919 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2920 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2921 int_type, PIPE_FUNC_EQUAL,
2922 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2923
2924 /*
2925 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2926 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2927 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2928 */
2929 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2930 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2931 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2932
2933 /*
2934 * The magic pass: "Extended precision modular arithmetic"
2935 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2936 */
2937 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2938 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2939 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2940
2941 /*
2942 * Evaluate the first polynomial (0 <= x <= Pi/4)
2943 *
2944 * z = _mm_mul_ps(x,x);
2945 */
2946 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2947
2948 /*
2949 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2950 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2951 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2952 */
2953 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2954 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2955 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2956
2957 /*
2958 * y = *(v4sf*)_ps_coscof_p0;
2959 * y = _mm_mul_ps(y, z);
2960 */
2961 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2962 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2963 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2964 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2965
2966
2967 /*
2968 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2969 * y = _mm_sub_ps(y, tmp);
2970 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2971 */
2972 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2973 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2974 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2975 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2976 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2977
2978 /*
2979 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2980 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2981 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2982 */
2983 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2984 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2985 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2986
2987 /*
2988 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2989 *
2990 * y2 = *(v4sf*)_ps_sincof_p0;
2991 * y2 = _mm_mul_ps(y2, z);
2992 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2993 * y2 = _mm_mul_ps(y2, z);
2994 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2995 * y2 = _mm_mul_ps(y2, z);
2996 * y2 = _mm_mul_ps(y2, x);
2997 * y2 = _mm_add_ps(y2, x);
2998 */
2999
3000 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3001 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3002 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3003 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3004
3005 /*
3006 * select the correct result from the two polynomials
3007 * xmm3 = poly_mask;
3008 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3009 * y = _mm_andnot_ps(xmm3, y);
3010 * y = _mm_or_ps(y,y2);
3011 */
3012 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3013 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3014 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3015 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3016 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3017 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3018
3019 /*
3020 * update the sign
3021 * y = _mm_xor_ps(y, sign_bit);
3022 */
3023 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3024 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3025
3026 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3027
3028 /* clamp output to be within [-1, 1] */
3029 y_result = lp_build_clamp(bld, y_result,
3030 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3031 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3032 /* If a is -inf, inf or NaN then return NaN */
3033 y_result = lp_build_select(bld, isfinite, y_result,
3034 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3035 return y_result;
3036 }
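
/*
 * Rough scalar reference of the Cephes-style scheme generated above (for
 * illustration only, not compiled; the final clamp to [-1, 1] and the NaN
 * result for non-finite inputs produced by the selects above are omitted):
 *
 *    static float sin_ref(float x)
 *    {
 *       float ax = fabsf(x);
 *       int j = ((int)(ax * 1.27323954473516f) + 1) & ~1;   // octant via 4/Pi scaling
 *       float y = (float)j;
 *       // extended precision modular arithmetic: reduce ax to [-Pi/4, Pi/4]
 *       float r = ((ax - y * 0.78515625f)
 *                      - y * 2.4187564849853515625e-4f)
 *                      - y * 3.77489497744594108e-8f;
 *       float r2 = r * r;
 *       float cosp = 1.0f - 0.5f * r2 + r2 * r2 * (4.166664568298827e-2f
 *                    + r2 * (-1.388731625493765e-3f + r2 * 2.443315711809948e-5f));
 *       float sinp = r + r * r2 * (-1.6666654611e-1f
 *                    + r2 * (8.3321608736e-3f + r2 * -1.9515295891e-4f));
 *       float res = (j & 2) ? cosp : sinp;                  // polynomial selection
 *       if (j & 4)
 *          res = -res;                                      // octant sign flip
 *       return x < 0.0f ? -res : res;                       // sign of the argument
 *    }
 */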
3037
3038
3039 /**
3040 * Generate sin(a)
3041 */
3042 LLVMValueRef
3043 lp_build_sin(struct lp_build_context *bld,
3044 LLVMValueRef a)
3045 {
3046 return lp_build_sin_or_cos(bld, a, FALSE);
3047 }
3048
3049
3050 /**
3051 * Generate cos(a)
3052 */
3053 LLVMValueRef
3054 lp_build_cos(struct lp_build_context *bld,
3055 LLVMValueRef a)
3056 {
3057 return lp_build_sin_or_cos(bld, a, TRUE);
3058 }
3059
3060
3061 /**
3062 * Generate pow(x, y)
3063 */
3064 LLVMValueRef
3065 lp_build_pow(struct lp_build_context *bld,
3066 LLVMValueRef x,
3067 LLVMValueRef y)
3068 {
3069 /* TODO: optimize the constant case */
3070 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3071 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3072 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3073 __FUNCTION__);
3074 }
3075
3076 LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
3077 LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3078
3079 res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
3080 return res;
3081 }
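
/*
 * The identity used above (illustrative note): x^y == 2^(y * log2(x)) for
 * x > 0; the select forces the result to 0.0 when x == 0.0, since the fast
 * log2 path does not handle zero.
 */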
3082
3083
3084 /**
3085 * Generate exp(x)
3086 */
3087 LLVMValueRef
3088 lp_build_exp(struct lp_build_context *bld,
3089 LLVMValueRef x)
3090 {
3091 /* log2(e) = 1/log(2) */
3092 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3093 1.4426950408889634);
3094
3095 assert(lp_check_value(bld->type, x));
3096
3097 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3098 }
3099
3100
3101 /**
3102 * Generate log(x)
3103 * Behavior is undefined with infs, zeros and NaNs
3104 */
3105 LLVMValueRef
3106 lp_build_log(struct lp_build_context *bld,
3107 LLVMValueRef x)
3108 {
3109 /* log(2) */
3110 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3111 0.69314718055994529);
3112
3113 assert(lp_check_value(bld->type, x));
3114
3115 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3116 }
3117
3118 /**
3119 * Generate log(x) that handles edge cases (infs, 0s and nans)
3120 */
3121 LLVMValueRef
3122 lp_build_log_safe(struct lp_build_context *bld,
3123 LLVMValueRef x)
3124 {
3125 /* log(2) */
3126 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3127 0.69314718055994529);
3128
3129 assert(lp_check_value(bld->type, x));
3130
3131 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3132 }
3133
3134
3135 /**
3136 * Generate polynomial.
3137 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3138 */
3139 LLVMValueRef
3140 lp_build_polynomial(struct lp_build_context *bld,
3141 LLVMValueRef x,
3142 const double *coeffs,
3143 unsigned num_coeffs)
3144 {
3145 const struct lp_type type = bld->type;
3146 LLVMValueRef even = NULL, odd = NULL;
3147 LLVMValueRef x2;
3148 unsigned i;
3149
3150 assert(lp_check_value(bld->type, x));
3151
3152 /* TODO: optimize the constant case */
3153 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3154 LLVMIsConstant(x)) {
3155 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3156 __FUNCTION__);
3157 }
3158
3159 /*
3160 * Calculate odd and even terms separately to decrease data dependency
3161 * Ex:
3162 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3163 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3164 */
3165 x2 = lp_build_mul(bld, x, x);
3166
3167 for (i = num_coeffs; i--; ) {
3168 LLVMValueRef coeff;
3169
3170 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3171
3172 if (i % 2 == 0) {
3173 if (even)
3174 even = lp_build_mad(bld, x2, even, coeff);
3175 else
3176 even = coeff;
3177 } else {
3178 if (odd)
3179 odd = lp_build_mad(bld, x2, odd, coeff);
3180 else
3181 odd = coeff;
3182 }
3183 }
3184
3185 if (odd)
3186 return lp_build_mad(bld, odd, x, even);
3187 else if (even)
3188 return even;
3189 else
3190 return bld->undef;
3191 }
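
/*
 * Illustrative expansion of the scheme above for five coefficients
 * c[0]..c[4] (with x2 = x*x):
 *
 *    even = c[0] + x2*(c[2] + x2*c[4])
 *    odd  = c[1] + x2*c[3]
 *    p(x) = even + x*odd
 *
 * i.e. a Horner evaluation in x2 of the even and odd terms, which halves
 * the length of the dependency chain compared to a single Horner chain in x.
 */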
3192
3193
3194 /**
3195 * Minimax polynomial fit of 2**x, in range [0, 1[
3196 */
3197 const double lp_build_exp2_polynomial[] = {
3198 #if EXP_POLY_DEGREE == 5
3199 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3200 0.693153073200168932794,
3201 0.240153617044375388211,
3202 0.0558263180532956664775,
3203 0.00898934009049466391101,
3204 0.00187757667519147912699
3205 #elif EXP_POLY_DEGREE == 4
3206 1.00000259337069434683,
3207 0.693003834469974940458,
3208 0.24144275689150793076,
3209 0.0520114606103070150235,
3210 0.0135341679161270268764
3211 #elif EXP_POLY_DEGREE == 3
3212 0.999925218562710312959,
3213 0.695833540494823811697,
3214 0.226067155427249155588,
3215 0.0780245226406372992967
3216 #elif EXP_POLY_DEGREE == 2
3217 1.00172476321474503578,
3218 0.657636275736077639316,
3219 0.33718943461968720704
3220 #else
3221 #error
3222 #endif
3223 };
3224
3225
3226 LLVMValueRef
3227 lp_build_exp2(struct lp_build_context *bld,
3228 LLVMValueRef x)
3229 {
3230 LLVMBuilderRef builder = bld->gallivm->builder;
3231 const struct lp_type type = bld->type;
3232 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3233 LLVMValueRef ipart = NULL;
3234 LLVMValueRef fpart = NULL;
3235 LLVMValueRef expipart = NULL;
3236 LLVMValueRef expfpart = NULL;
3237 LLVMValueRef res = NULL;
3238
3239 assert(lp_check_value(bld->type, x));
3240
3241 /* TODO: optimize the constant case */
3242 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3243 LLVMIsConstant(x)) {
3244 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3245 __FUNCTION__);
3246 }
3247
3248 assert(type.floating && type.width == 32);
3249
3250 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3251 * the result is INF and if it's smaller than -126.9 the result is 0 */
3252 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3253 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3254 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3255 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3256
3257 /* ipart = floor(x) */
3258 /* fpart = x - ipart */
3259 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3260
3261 /* expipart = (float) (1 << ipart) */
3262 expipart = LLVMBuildAdd(builder, ipart,
3263 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3264 expipart = LLVMBuildShl(builder, expipart,
3265 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3266 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3267
3268 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3269 ARRAY_SIZE(lp_build_exp2_polynomial));
3270
3271 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3272
3273 return res;
3274 }
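
/*
 * Illustrative scalar sketch of the decomposition above (not compiled;
 * poly() stands for the minimax polynomial lp_build_exp2_polynomial, and
 * the NaN-preserving behaviour of the vector min/max is glossed over):
 *
 *    static float exp2_ref(float x)
 *    {
 *       x = fminf(fmaxf(x, -126.99999f), 128.0f);
 *       int ipart = (int)floorf(x);
 *       float fpart = x - (float)ipart;            // fpart in [0, 1[
 *       union { int32_t i; float f; } u;
 *       u.i = (ipart + 127) << 23;                 // 2^ipart via the exponent field
 *       return u.f * poly(fpart);                  // poly(f) ~= 2^f on [0, 1[
 *    }
 */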
3275
3276
3277
3278 /**
3279 * Extract the exponent of a IEEE-754 floating point value.
3280 *
3281 * Optionally apply an integer bias.
3282 *
3283 * Result is an integer value with
3284 *
3285 * ifloor(log2(x)) + bias
3286 */
3287 LLVMValueRef
3288 lp_build_extract_exponent(struct lp_build_context *bld,
3289 LLVMValueRef x,
3290 int bias)
3291 {
3292 LLVMBuilderRef builder = bld->gallivm->builder;
3293 const struct lp_type type = bld->type;
3294 unsigned mantissa = lp_mantissa(type);
3295 LLVMValueRef res;
3296
3297 assert(type.floating);
3298
3299 assert(lp_check_value(bld->type, x));
3300
3301 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3302
3303 res = LLVMBuildLShr(builder, x,
3304 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3305 res = LLVMBuildAnd(builder, res,
3306 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3307 res = LLVMBuildSub(builder, res,
3308 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3309
3310 return res;
3311 }
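
/*
 * Scalar equivalent for float32 and bias == 0 (illustrative only, valid for
 * normalized inputs):
 *
 *    union { float f; uint32_t u; } v = { x };
 *    int e = (int)((v.u >> 23) & 0xff) - 127;      // == ifloor(log2(x))
 */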
3312
3313
3314 /**
3315 * Extract the mantissa of a floating point value.
3316 *
3317 * Result is a floating point value with
3318 *
3319 * x / 2**floor(log2(x))
3320 */
3321 LLVMValueRef
3322 lp_build_extract_mantissa(struct lp_build_context *bld,
3323 LLVMValueRef x)
3324 {
3325 LLVMBuilderRef builder = bld->gallivm->builder;
3326 const struct lp_type type = bld->type;
3327 unsigned mantissa = lp_mantissa(type);
3328 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3329 (1ULL << mantissa) - 1);
3330 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3331 LLVMValueRef res;
3332
3333 assert(lp_check_value(bld->type, x));
3334
3335 assert(type.floating);
3336
3337 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3338
3339 /* res = x / 2**ipart */
3340 res = LLVMBuildAnd(builder, x, mantmask, "");
3341 res = LLVMBuildOr(builder, res, one, "");
3342 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3343
3344 return res;
3345 }
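
/*
 * Scalar equivalent for float32 (illustrative only): keep the mantissa bits
 * and force the exponent field to that of 1.0, giving a value in [1, 2[:
 *
 *    union { float f; uint32_t u; } v = { x };
 *    v.u = (v.u & 0x007fffff) | 0x3f800000;        // x / 2**floor(log2(x))
 */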
3346
3347
3348
3349 /**
3350 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[.
3351 * These coefficients can be generated with
3352 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3353 */
3354 const double lp_build_log2_polynomial[] = {
3355 #if LOG_POLY_DEGREE == 5
3356 2.88539008148777786488L,
3357 0.961796878841293367824L,
3358 0.577058946784739859012L,
3359 0.412914355135828735411L,
3360 0.308591899232910175289L,
3361 0.352376952300281371868L,
3362 #elif LOG_POLY_DEGREE == 4
3363 2.88539009343309178325L,
3364 0.961791550404184197881L,
3365 0.577440339438736392009L,
3366 0.403343858251329912514L,
3367 0.406718052498846252698L,
3368 #elif LOG_POLY_DEGREE == 3
3369 2.88538959748872753838L,
3370 0.961932915889597772928L,
3371 0.571118517972136195241L,
3372 0.493997535084709500285L,
3373 #else
3374 #error
3375 #endif
3376 };
3377
3378 /**
3379 * See http://www.devmaster.net/forums/showthread.php?p=43580
3380 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3381 * http://www.nezumi.demon.co.uk/consult/logx.htm
3382 *
3383 * If handle_edge_cases is true the function will perform computations
3384 * to match the required D3D10+ behavior for each of the edge cases.
3385 * That means that if input is:
3386 * - less than zero (to and including -inf) then NaN will be returned
3387 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3388 * - +infinity, then +infinity will be returned
3389 * - NaN, then NaN will be returned
3390 *
3391 * Those checks are fairly expensive so if you don't need them make sure
3392 * handle_edge_cases is false.
3393 */
3394 void
3395 lp_build_log2_approx(struct lp_build_context *bld,
3396 LLVMValueRef x,
3397 LLVMValueRef *p_exp,
3398 LLVMValueRef *p_floor_log2,
3399 LLVMValueRef *p_log2,
3400 boolean handle_edge_cases)
3401 {
3402 LLVMBuilderRef builder = bld->gallivm->builder;
3403 const struct lp_type type = bld->type;
3404 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3405 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3406
3407 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3408 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3409 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3410
3411 LLVMValueRef i = NULL;
3412 LLVMValueRef y = NULL;
3413 LLVMValueRef z = NULL;
3414 LLVMValueRef exp = NULL;
3415 LLVMValueRef mant = NULL;
3416 LLVMValueRef logexp = NULL;
3417 LLVMValueRef p_z = NULL;
3418 LLVMValueRef res = NULL;
3419
3420 assert(lp_check_value(bld->type, x));
3421
3422 if(p_exp || p_floor_log2 || p_log2) {
3423 /* TODO: optimize the constant case */
3424 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3425 LLVMIsConstant(x)) {
3426 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3427 __FUNCTION__);
3428 }
3429
3430 assert(type.floating && type.width == 32);
3431
3432 /*
3433 * We don't explicitly handle denormalized numbers. They will yield a
3434 * result in the neighbourhood of -127, which appears to be adequate.
3436 */
3437
3438 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3439
3440 /* exp = (float) exponent(x) */
3441 exp = LLVMBuildAnd(builder, i, expmask, "");
3442 }
3443
3444 if(p_floor_log2 || p_log2) {
3445 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3446 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3447 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3448 }
3449
3450 if (p_log2) {
3451 /* mant = 1 + (float) mantissa(x) */
3452 mant = LLVMBuildAnd(builder, i, mantmask, "");
3453 mant = LLVMBuildOr(builder, mant, one, "");
3454 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3455
3456 /* y = (mant - 1) / (mant + 1) */
3457 y = lp_build_div(bld,
3458 lp_build_sub(bld, mant, bld->one),
3459 lp_build_add(bld, mant, bld->one)
3460 );
3461
3462 /* z = y^2 */
3463 z = lp_build_mul(bld, y, y);
3464
3465 /* compute P(z) */
3466 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3467 ARRAY_SIZE(lp_build_log2_polynomial));
3468
3469 /* y * P(z) + logexp */
3470 res = lp_build_mad(bld, y, p_z, logexp);
3471
3472 if (type.floating && handle_edge_cases) {
3473 LLVMValueRef negmask, infmask, zmask;
3474 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3475 lp_build_const_vec(bld->gallivm, type, 0.0f));
3476 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3477 lp_build_const_vec(bld->gallivm, type, 0.0f));
3478 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3479 lp_build_const_vec(bld->gallivm, type, INFINITY));
3480
3481 /* If x is equal to inf, make sure we return inf */
3482 res = lp_build_select(bld, infmask,
3483 lp_build_const_vec(bld->gallivm, type, INFINITY),
3484 res);
3485 /* If x is equal to 0, return -inf */
3486 res = lp_build_select(bld, zmask,
3487 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3488 res);
3489 /* If x is NaN or less than 0, return NaN */
3490 res = lp_build_select(bld, negmask,
3491 lp_build_const_vec(bld->gallivm, type, NAN),
3492 res);
3493 }
3494 }
3495
3496 if (p_exp) {
3497 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3498 *p_exp = exp;
3499 }
3500
3501 if (p_floor_log2)
3502 *p_floor_log2 = logexp;
3503
3504 if (p_log2)
3505 *p_log2 = res;
3506 }
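/*
 * Scalar reference of the sequence built above, for illustration only.
 * A sketch assuming IEEE-754 binary32 and the edge-case handling enabled;
 * the helper name is hypothetical and the block is kept under #if 0.
 */
#if 0
#include <math.h>    /* INFINITY, NAN */
#include <stdint.h>
#include <string.h>

static float
ref_log2_approx(float x)
{
   uint32_t i, mi;
   float mant, logexp, y, z, p, res;
   int n;

   memcpy(&i, &x, sizeof i);

   /* exp = (float) exponent(x) */
   logexp = (float)(int)((i >> 23) & 0xff) - 127.0f;

   /* mant = 1 + (float) mantissa(x), i.e. a value in [1, 2) */
   mi = (i & 0x007fffff) | 0x3f800000;
   memcpy(&mant, &mi, sizeof mant);

   /* y = (mant - 1) / (mant + 1), z = y^2, then y * P(z) + logexp */
   y = (mant - 1.0f) / (mant + 1.0f);
   z = y * y;
   p = 0.0f;
   for (n = ARRAY_SIZE(lp_build_log2_polynomial) - 1; n >= 0; n--)
      p = p * z + (float)lp_build_log2_polynomial[n];
   res = y * p + logexp;

   /* edge cases, mirroring handle_edge_cases == TRUE */
   if (x >= INFINITY)
      res = INFINITY;
   if (x == 0.0f)
      res = -INFINITY;
   if (x < 0.0f || x != x)
      res = NAN;

   return res;
}
#endif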
3507
3508
3509 /*
3510 * log2 implementation which doesn't have special code to
3511 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3512 * the results for those cases are undefined.
3513 */
3514 LLVMValueRef
3515 lp_build_log2(struct lp_build_context *bld,
3516 LLVMValueRef x)
3517 {
3518 LLVMValueRef res;
3519 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3520 return res;
3521 }
3522
3523 /*
3524 * Version of log2 which handles all edge cases.
3525 * Look at documentation of lp_build_log2_approx for
3526 * description of the behavior for each of the edge cases.
3527 */
3528 LLVMValueRef
3529 lp_build_log2_safe(struct lp_build_context *bld,
3530 LLVMValueRef x)
3531 {
3532 LLVMValueRef res;
3533 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3534 return res;
3535 }
3536
3537
3538 /**
3539 * Faster (and less accurate) log2.
3540 *
3541 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3542 *
3543 * Piece-wise linear approximation, with exact results when x is a
3544 * power of two.
3545 *
3546 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3547 */
3548 LLVMValueRef
3549 lp_build_fast_log2(struct lp_build_context *bld,
3550 LLVMValueRef x)
3551 {
3552 LLVMBuilderRef builder = bld->gallivm->builder;
3553 LLVMValueRef ipart;
3554 LLVMValueRef fpart;
3555
3556 assert(lp_check_value(bld->type, x));
3557
3558 assert(bld->type.floating);
3559
3560 /* ipart = floor(log2(x)) - 1 */
3561 ipart = lp_build_extract_exponent(bld, x, -1);
3562 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3563
3564 /* fpart = x / 2**ipart */
3565 fpart = lp_build_extract_mantissa(bld, x);
3566
3567 /* ipart + fpart */
3568 return LLVMBuildFAdd(builder, ipart, fpart, "");
3569 }
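/*
 * Scalar sketch of the piece-wise linear approximation above (hypothetical
 * helper, assuming IEEE-754 binary32; kept under #if 0, not compiled).
 */
#if 0
static float
ref_fast_log2(float x)
{
   uint32_t i, mi;
   float ipart, fpart;

   memcpy(&i, &x, sizeof i);

   /* ipart = floor(log2(x)) - 1 */
   ipart = (float)(int)((i >> 23) & 0xff) - 127.0f - 1.0f;

   /* fpart = x / 2**floor(log2(x)), i.e. the mantissa in [1, 2) */
   mi = (i & 0x007fffff) | 0x3f800000;
   memcpy(&fpart, &mi, sizeof fpart);

   /* exact whenever x is a power of two, since fpart is then exactly 1.0 */
   return ipart + fpart;
}
#endif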
3570
3571
3572 /**
3573 * Fast implementation of iround(log2(x)).
3574 *
3575 * Not an approximation -- it should give accurate results all the time.
3576 */
3577 LLVMValueRef
3578 lp_build_ilog2(struct lp_build_context *bld,
3579 LLVMValueRef x)
3580 {
3581 LLVMBuilderRef builder = bld->gallivm->builder;
3582 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3583 LLVMValueRef ipart;
3584
3585 assert(bld->type.floating);
3586
3587 assert(lp_check_value(bld->type, x));
3588
3589 /* x * 2^0.5, i.e., add 0.5 to log2(x) */
3590 x = LLVMBuildFMul(builder, x, sqrt2, "");
3591
3592 /* ipart = floor(log2(x) + 0.5) */
3593 ipart = lp_build_extract_exponent(bld, x, 0);
3594
3595 return ipart;
3596 }
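/*
 * Scalar sketch of the trick above: multiplying by sqrt(2) adds 0.5 to
 * log2(x), so truncating the exponent afterwards rounds log2(x) to the
 * nearest integer. Hypothetical helper, kept under #if 0.
 */
#if 0
static int
ref_ilog2(float x)
{
   uint32_t i;

   x = x * (float)M_SQRT2;
   memcpy(&i, &x, sizeof i);

   return (int)((i >> 23) & 0xff) - 127;   /* floor(log2(x) + 0.5) */
}
#endif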
3597
3598 LLVMValueRef
3599 lp_build_mod(struct lp_build_context *bld,
3600 LLVMValueRef x,
3601 LLVMValueRef y)
3602 {
3603 LLVMBuilderRef builder = bld->gallivm->builder;
3604 LLVMValueRef res;
3605 const struct lp_type type = bld->type;
3606
3607 assert(lp_check_value(type, x));
3608 assert(lp_check_value(type, y));
3609
3610 if (type.floating)
3611 res = LLVMBuildFRem(builder, x, y, "");
3612 else if (type.sign)
3613 res = LLVMBuildSRem(builder, x, y, "");
3614 else
3615 res = LLVMBuildURem(builder, x, y, "");
3616 return res;
3617 }
3618
3619
3620 /*
3621 * For floating inputs it creates and returns a mask
3622 * which is all 1's for channels which are NaN.
3623 * Channels inside x which are not NaN will be 0.
3624 */
3625 LLVMValueRef
3626 lp_build_isnan(struct lp_build_context *bld,
3627 LLVMValueRef x)
3628 {
3629 LLVMValueRef mask;
3630 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3631
3632 assert(bld->type.floating);
3633 assert(lp_check_value(bld->type, x));
3634
3635 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3636 "isnotnan");
3637 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3638 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3639 return mask;
3640 }
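/*
 * The mask relies on the IEEE rule that NaN compares unequal to everything,
 * including itself: the ordered x == x is false only for NaN lanes, and the
 * negated compare is sign-extended to a full-width mask. Scalar sketch
 * (hypothetical helper, kept under #if 0):
 */
#if 0
static int32_t
ref_isnan_mask(float x)
{
   return (x == x) ? 0 : ~0;   /* all 1's for NaN, all 0's otherwise */
}
#endif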
3641
3642 /* Returns all 1's for floating point numbers that are
3643  * finite, and all zeros for -inf,
3644  * +inf and NaN. */
3645 LLVMValueRef
3646 lp_build_isfinite(struct lp_build_context *bld,
3647 LLVMValueRef x)
3648 {
3649 LLVMBuilderRef builder = bld->gallivm->builder;
3650 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3651 struct lp_type int_type = lp_int_type(bld->type);
3652 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3653 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3654 0x7f800000);
3655
3656 if (!bld->type.floating) {
3657 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3658 }
3659 assert(bld->type.floating);
3660 assert(lp_check_value(bld->type, x));
3661 assert(bld->type.width == 32);
3662
3663 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3664 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3665 intx, infornan32);
3666 }
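/*
 * Scalar sketch of the exponent-mask test above: a binary32 value is
 * non-finite exactly when all its exponent bits are set, so comparing the
 * masked bits against 0x7f800000 distinguishes finite values. The same
 * mask drives lp_build_is_inf_or_nan() below. Hypothetical helper, kept
 * under #if 0.
 */
#if 0
static int32_t
ref_isfinite_mask(float x)
{
   uint32_t i;

   memcpy(&i, &x, sizeof i);

   return ((i & 0x7f800000) != 0x7f800000) ? ~0 : 0;
}
#endif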
3667
3668 /*
3669 * Returns true if the number is NaN or inf and false otherwise.
3670 * The input has to be a floating point vector.
3671 */
3672 LLVMValueRef
3673 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3674 const struct lp_type type,
3675 LLVMValueRef x)
3676 {
3677 LLVMBuilderRef builder = gallivm->builder;
3678 struct lp_type int_type = lp_int_type(type);
3679 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3680 0x7f800000);
3681 LLVMValueRef ret;
3682
3683 assert(type.floating);
3684
3685 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3686 ret = LLVMBuildAnd(builder, ret, const0, "");
3687 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3688 ret, const0);
3689
3690 return ret;
3691 }
3692
3693
3694 LLVMValueRef
3695 lp_build_fpstate_get(struct gallivm_state *gallivm)
3696 {
3697 if (util_cpu_caps.has_sse) {
3698 LLVMBuilderRef builder = gallivm->builder;
3699 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3700 gallivm,
3701 LLVMInt32TypeInContext(gallivm->context),
3702 "mxcsr_ptr");
3703 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3704 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3705 lp_build_intrinsic(builder,
3706 "llvm.x86.sse.stmxcsr",
3707 LLVMVoidTypeInContext(gallivm->context),
3708 &mxcsr_ptr8, 1, 0);
3709 return mxcsr_ptr;
3710 }
3711 return 0;
3712 }
3713
3714 void
3715 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3716 boolean zero)
3717 {
3718 if (util_cpu_caps.has_sse) {
3719 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3720 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3721
3722 LLVMBuilderRef builder = gallivm->builder;
3723 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3724 LLVMValueRef mxcsr =
3725 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3726
3727 if (util_cpu_caps.has_daz) {
3728 /* Enable denormals-are-zero mode */
3729 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3730 }
3731 if (zero) {
3732 mxcsr = LLVMBuildOr(builder, mxcsr,
3733 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3734 } else {
3735 mxcsr = LLVMBuildAnd(builder, mxcsr,
3736 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3737 }
3738
3739 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3740 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3741 }
3742 }
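/*
 * For reference, the host-side equivalent of the MXCSR manipulation that
 * the generated code performs, using the <xmmintrin.h> intrinsics. A sketch
 * only (hypothetical helper, kept under #if 0); the JIT path above emits
 * llvm.x86.sse.stmxcsr/ldmxcsr instead.
 */
#if 0
static void
ref_set_denorms_zero(boolean zero)
{
   unsigned mxcsr = _mm_getcsr();
   unsigned daz_ftz = _MM_FLUSH_ZERO_MASK;

   if (util_cpu_caps.has_daz)
      daz_ftz |= _MM_DENORMALS_ZERO_MASK;

   if (zero)
      mxcsr |= daz_ftz;
   else
      mxcsr &= ~daz_ftz;

   _mm_setcsr(mxcsr);
}
#endif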
3743
3744 void
3745 lp_build_fpstate_set(struct gallivm_state *gallivm,
3746 LLVMValueRef mxcsr_ptr)
3747 {
3748 if (util_cpu_caps.has_sse) {
3749 LLVMBuilderRef builder = gallivm->builder;
3750 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3751 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3752 lp_build_intrinsic(builder,
3753 "llvm.x86.sse.ldmxcsr",
3754 LLVMVoidTypeInContext(gallivm->context),
3755 &mxcsr_ptr, 1, 0);
3756 }
3757 }
3758