1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31  * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35  * notably min/max and saturated operations), and it is often necessary to
36  * resort to machine-specific intrinsics directly. The functions here hide all
37 *
38  * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85  * No checks for the special-case values of a or b (1 or 0) are done.
86  * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
212       /* We need to handle NaNs for floating point numbers. If one of the
213        * inputs is NaN the other should be returned (required by both D3D10+
214        * and OpenCL).
215        * The SSE intrinsics return the second operand when either input is
216        * NaN by default, so we need special code to handle those cases.
217        */
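      /* For example, with GALLIVM_NAN_RETURN_OTHER: the intrinsic's min(NaN, x)
       * already yields x, but min(x, NaN) yields NaN, so the isnan(b) select
       * below substitutes a in that case.  GALLIVM_NAN_RETURN_NAN is handled
       * symmetrically by checking isnan(a) instead.
       */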
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
277
278
279 LLVMValueRef
280 lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
289       /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
290        * not supported, and instead falls back to a C function.
291 */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
299
300
301 /**
302 * Generate max(a, b)
303  * No checks for the special-case values of a or b (1 or 0) are done.
304  * NaNs are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307 static LLVMValueRef
308 lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
359       if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
492 /**
493 * Generate 1 - a, or ~a depending on bld->type.
494 */
495 LLVMValueRef
496 lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
533 lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if (a == bld->zero)
545 return b;
546 if (b == bld->zero)
547 return a;
548 if (a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if (type.norm) {
552 const char *intrinsic = NULL;
553
554 if (!type.sign && (a == bld->one || b == bld->one))
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (type.width * type.length == 128) {
559 if (util_cpu_caps.has_sse2) {
560 if (type.width == 8)
561 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
562 if (type.width == 16)
563 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
564 } else if (util_cpu_caps.has_altivec) {
565 if (type.width == 8)
566 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
567 if (type.width == 16)
568 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
569 }
570 }
571 if (type.width * type.length == 256) {
572 if (util_cpu_caps.has_avx2) {
573 if (type.width == 8)
574 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
575 if (type.width == 16)
576 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
577 }
578 }
579 }
580
581 if (intrinsic)
582 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
583 }
584
585 if(type.norm && !type.floating && !type.fixed) {
586 if (type.sign) {
587 uint64_t sign = (uint64_t)1 << (type.width - 1);
588 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
589 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
590 /* a_clamp_max is the maximum a for positive b,
591 a_clamp_min is the minimum a for negative b. */
592 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
593 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
594 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
595 } else {
596 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
597 }
598 }
599
600 if(LLVMIsConstant(a) && LLVMIsConstant(b))
601 if (type.floating)
602 res = LLVMConstFAdd(a, b);
603 else
604 res = LLVMConstAdd(a, b);
605 else
606 if (type.floating)
607 res = LLVMBuildFAdd(builder, a, b, "");
608 else
609 res = LLVMBuildAdd(builder, a, b, "");
610
611 /* clamp to ceiling of 1.0 */
612 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
613 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
614
615 /* XXX clamp to floor of -1 or 0??? */
616
617 return res;
618 }
619
620
621 /** Return the scalar sum of the elements of a.
622  * This operation should be avoided whenever possible.
623 */
624 LLVMValueRef
625 lp_build_horizontal_add(struct lp_build_context *bld,
626 LLVMValueRef a)
627 {
628 LLVMBuilderRef builder = bld->gallivm->builder;
629 const struct lp_type type = bld->type;
630 LLVMValueRef index, res;
631 unsigned i, length;
632 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
633 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
634 LLVMValueRef vecres, elem2;
635
636 assert(lp_check_value(type, a));
637
638 if (type.length == 1) {
639 return a;
640 }
641
642 assert(!bld->type.norm);
643
644 /*
645     * For byte vectors we could do much better with psadbw.
646     * Using repeated shuffle/adds here. Note that with multiple vectors
647     * this can be done more efficiently, as outlined in the Intel
648     * optimization manual.
649 * Note: could cause data rearrangement if used with smaller element
650 * sizes.
651 */
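   /*
    * For example, for a 4-element vector <a0, a1, a2, a3> the loop below
    * computes <a0, a1> + <a2, a3> = <a0+a2, a1+a3>, and the final
    * extract/add step then yields the scalar (a0+a2) + (a1+a3).
    */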
652
653 vecres = a;
654 length = type.length / 2;
655 while (length > 1) {
656 LLVMValueRef vec1, vec2;
657 for (i = 0; i < length; i++) {
658 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
659 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
660 }
661 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
662 LLVMConstVector(shuffles1, length), "");
663 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
664 LLVMConstVector(shuffles2, length), "");
665 if (type.floating) {
666 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
667 }
668 else {
669 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
670 }
671 length = length >> 1;
672 }
673
674 /* always have vector of size 2 here */
675 assert(length == 1);
676
677 index = lp_build_const_int32(bld->gallivm, 0);
678 res = LLVMBuildExtractElement(builder, vecres, index, "");
679 index = lp_build_const_int32(bld->gallivm, 1);
680 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
681
682 if (type.floating)
683 res = LLVMBuildFAdd(builder, res, elem2, "");
684 else
685 res = LLVMBuildAdd(builder, res, elem2, "");
686
687 return res;
688 }
689
690 /**
691 * Return the horizontal sums of 4 float vectors as a float4 vector.
692  * This uses the technique outlined in the Intel Optimization Manual.
693 */
694 static LLVMValueRef
695 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
696 LLVMValueRef src[4])
697 {
698 struct gallivm_state *gallivm = bld->gallivm;
699 LLVMBuilderRef builder = gallivm->builder;
700 LLVMValueRef shuffles[4];
701 LLVMValueRef tmp[4];
702 LLVMValueRef sumtmp[2], shuftmp[2];
703
704 /* lower half of regs */
705 shuffles[0] = lp_build_const_int32(gallivm, 0);
706 shuffles[1] = lp_build_const_int32(gallivm, 1);
707 shuffles[2] = lp_build_const_int32(gallivm, 4);
708 shuffles[3] = lp_build_const_int32(gallivm, 5);
709 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
710 LLVMConstVector(shuffles, 4), "");
711 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
712 LLVMConstVector(shuffles, 4), "");
713
714 /* upper half of regs */
715 shuffles[0] = lp_build_const_int32(gallivm, 2);
716 shuffles[1] = lp_build_const_int32(gallivm, 3);
717 shuffles[2] = lp_build_const_int32(gallivm, 6);
718 shuffles[3] = lp_build_const_int32(gallivm, 7);
719 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
720 LLVMConstVector(shuffles, 4), "");
721 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
722 LLVMConstVector(shuffles, 4), "");
723
724 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
725 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
726
727 shuffles[0] = lp_build_const_int32(gallivm, 0);
728 shuffles[1] = lp_build_const_int32(gallivm, 2);
729 shuffles[2] = lp_build_const_int32(gallivm, 4);
730 shuffles[3] = lp_build_const_int32(gallivm, 6);
731 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
732 LLVMConstVector(shuffles, 4), "");
733
734 shuffles[0] = lp_build_const_int32(gallivm, 1);
735 shuffles[1] = lp_build_const_int32(gallivm, 3);
736 shuffles[2] = lp_build_const_int32(gallivm, 5);
737 shuffles[3] = lp_build_const_int32(gallivm, 7);
738 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
739 LLVMConstVector(shuffles, 4), "");
740
741 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
742 }
743
744
745 /*
746  * Partially horizontally add 2-4 float vectors with length nx4,
747  * i.e. only four adjacent values in each vector will be added,
748  * assuming the values are really grouped in fours, which also determines
749  * the output order.
750  *
751  * Return a vector of the same length as the initial vectors,
752  * with the excess elements (if any) being undefined.
753  * The element order is independent of the number of input vectors.
754  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
755  * the output order thus will be
756  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
757 */
758 LLVMValueRef
759 lp_build_hadd_partial4(struct lp_build_context *bld,
760 LLVMValueRef vectors[],
761 unsigned num_vecs)
762 {
763 struct gallivm_state *gallivm = bld->gallivm;
764 LLVMBuilderRef builder = gallivm->builder;
765 LLVMValueRef ret_vec;
766 LLVMValueRef tmp[4];
767 const char *intrinsic = NULL;
768
769 assert(num_vecs >= 2 && num_vecs <= 4);
770 assert(bld->type.floating);
771
772 /* only use this with at least 2 vectors, as it is sort of expensive
773 * (depending on cpu) and we always need two horizontal adds anyway,
774 * so a shuffle/add approach might be better.
775 */
776
777 tmp[0] = vectors[0];
778 tmp[1] = vectors[1];
779
780 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
781 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
782
783 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
784 bld->type.length == 4) {
785 intrinsic = "llvm.x86.sse3.hadd.ps";
786 }
787 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
788 bld->type.length == 8) {
789 intrinsic = "llvm.x86.avx.hadd.ps.256";
790 }
791 if (intrinsic) {
792 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
793 lp_build_vec_type(gallivm, bld->type),
794 tmp[0], tmp[1]);
795 if (num_vecs > 2) {
796 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
797 lp_build_vec_type(gallivm, bld->type),
798 tmp[2], tmp[3]);
799 }
800 else {
801 tmp[1] = tmp[0];
802 }
803 return lp_build_intrinsic_binary(builder, intrinsic,
804 lp_build_vec_type(gallivm, bld->type),
805 tmp[0], tmp[1]);
806 }
807
808 if (bld->type.length == 4) {
809 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
810 }
811 else {
812 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
813 unsigned j;
814 unsigned num_iter = bld->type.length / 4;
815 struct lp_type parttype = bld->type;
816 parttype.length = 4;
817 for (j = 0; j < num_iter; j++) {
818 LLVMValueRef partsrc[4];
819 unsigned i;
820 for (i = 0; i < 4; i++) {
821 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
822 }
823 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
824 }
825 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
826 }
827 return ret_vec;
828 }
829
830 /**
831 * Generate a - b
832 */
833 LLVMValueRef
834 lp_build_sub(struct lp_build_context *bld,
835 LLVMValueRef a,
836 LLVMValueRef b)
837 {
838 LLVMBuilderRef builder = bld->gallivm->builder;
839 const struct lp_type type = bld->type;
840 LLVMValueRef res;
841
842 assert(lp_check_value(type, a));
843 assert(lp_check_value(type, b));
844
845 if (b == bld->zero)
846 return a;
847 if (a == bld->undef || b == bld->undef)
848 return bld->undef;
849 if (a == b)
850 return bld->zero;
851
852 if (type.norm) {
853 const char *intrinsic = NULL;
854
855 if (!type.sign && b == bld->one)
856 return bld->zero;
857
858 if (!type.floating && !type.fixed) {
859 if (type.width * type.length == 128) {
860 if (util_cpu_caps.has_sse2) {
861 if (type.width == 8)
862 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
863 if (type.width == 16)
864 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
865 } else if (util_cpu_caps.has_altivec) {
866 if (type.width == 8)
867 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
868 if (type.width == 16)
869 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
870 }
871 }
872 if (type.width * type.length == 256) {
873 if (util_cpu_caps.has_avx2) {
874 if (type.width == 8)
875 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
876 if (type.width == 16)
877 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
878 }
879 }
880 }
881
882 if (intrinsic)
883 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
884 }
885
886 if(type.norm && !type.floating && !type.fixed) {
887 if (type.sign) {
888 uint64_t sign = (uint64_t)1 << (type.width - 1);
889 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
890 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
891 /* a_clamp_max is the maximum a for negative b,
892 a_clamp_min is the minimum a for positive b. */
893 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
894 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
895 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
896 } else {
897 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
898 }
899 }
900
901 if(LLVMIsConstant(a) && LLVMIsConstant(b))
902 if (type.floating)
903 res = LLVMConstFSub(a, b);
904 else
905 res = LLVMConstSub(a, b);
906 else
907 if (type.floating)
908 res = LLVMBuildFSub(builder, a, b, "");
909 else
910 res = LLVMBuildSub(builder, a, b, "");
911
912 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
913 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
914
915 return res;
916 }
917
918
919
920 /**
921 * Normalized multiplication.
922 *
923 * There are several approaches for (using 8-bit normalized multiplication as
924 * an example):
925 *
926 * - alpha plus one
927 *
928 * makes the following approximation to the division (Sree)
929 *
930  *     a*b/255 ~= (a*(b + 1)) >> 8
931 *
932 * which is the fastest method that satisfies the following OpenGL criteria of
933 *
934 * 0*0 = 0 and 255*255 = 255
935 *
936 * - geometric series
937 *
938 * takes the geometric series approximation to the division
939 *
940 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
941 *
942 * in this case just the first two terms to fit in 16bit arithmetic
943 *
944 * t/255 ~= (t + (t >> 8)) >> 8
945 *
946  * note that just by itself it doesn't satisfy the OpenGL criteria, as it
947  * gives 255*255 = 254, so the special case b = 255 must be accounted for,
948  * or rounding must be used.
949 *
950 * - geometric series plus rounding
951 *
952  * when using the geometric series division, instead of truncating the
953  * result, use rounding in the approximation (Jim Blinn)
954 *
955 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
956 *
957  * which achieves exact results.
958 *
959 *
960 *
961 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
962 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
963 * @sa Michael Herf, The "double blend trick", May 2000,
964 * http://www.stereopsis.com/doubleblend.html
965 */
966 LLVMValueRef
967 lp_build_mul_norm(struct gallivm_state *gallivm,
968 struct lp_type wide_type,
969 LLVMValueRef a, LLVMValueRef b)
970 {
971 LLVMBuilderRef builder = gallivm->builder;
972 struct lp_build_context bld;
973 unsigned n;
974 LLVMValueRef half;
975 LLVMValueRef ab;
976
977 assert(!wide_type.floating);
978 assert(lp_check_value(wide_type, a));
979 assert(lp_check_value(wide_type, b));
980
981 lp_build_context_init(&bld, gallivm, wide_type);
982
983 n = wide_type.width / 2;
984 if (wide_type.sign) {
985 --n;
986 }
987
988 /*
989 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
990 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
991 */
992
993 /*
994 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
995 */
996
997 ab = LLVMBuildMul(builder, a, b, "");
998 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
999
1000 /*
1001 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1002 */
1003
1004 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1005 if (wide_type.sign) {
1006 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1007 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1008 half = lp_build_select(&bld, sign, minus_half, half);
1009 }
1010 ab = LLVMBuildAdd(builder, ab, half, "");
1011
1012 /* Final division */
1013 ab = lp_build_shr_imm(&bld, ab, n);
1014
1015 return ab;
1016 }
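
/*
 * A minimal scalar sketch of the unsigned 8-bit case of the approximation
 * above, for reference only (the helper name and the fixed n == 8 are
 * illustrative, not part of the gallivm API):
 *
 *    static inline uint8_t mul_norm8_ref(uint8_t a, uint8_t b)
 *    {
 *       unsigned ab = (unsigned)a * b;        // full 16-bit product
 *       ab = ab + (ab >> 8);                  // t + (t >> 8)
 *       return (uint8_t)((ab + 0x80) >> 8);   // round, then divide by 256
 *    }
 *
 * E.g. a = b = 255: ab = 65025, 65025 + 254 = 65279, 65279 + 128 = 65407,
 * and 65407 >> 8 = 255, satisfying the 255*255 = 255 criterion exactly.
 */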
1017
1018 /**
1019 * Generate a * b
1020 */
1021 LLVMValueRef
1022 lp_build_mul(struct lp_build_context *bld,
1023 LLVMValueRef a,
1024 LLVMValueRef b)
1025 {
1026 LLVMBuilderRef builder = bld->gallivm->builder;
1027 const struct lp_type type = bld->type;
1028 LLVMValueRef shift;
1029 LLVMValueRef res;
1030
1031 assert(lp_check_value(type, a));
1032 assert(lp_check_value(type, b));
1033
1034 if(a == bld->zero)
1035 return bld->zero;
1036 if(a == bld->one)
1037 return b;
1038 if(b == bld->zero)
1039 return bld->zero;
1040 if(b == bld->one)
1041 return a;
1042 if(a == bld->undef || b == bld->undef)
1043 return bld->undef;
1044
1045 if (!type.floating && !type.fixed && type.norm) {
1046 struct lp_type wide_type = lp_wider_type(type);
1047 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1048
1049 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1050 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1051
1052 /* PMULLW, PSRLW, PADDW */
1053 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1054 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1055
1056 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1057
1058 return ab;
1059 }
1060
1061 if(type.fixed)
1062 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1063 else
1064 shift = NULL;
1065
1066 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1067 if (type.floating)
1068 res = LLVMConstFMul(a, b);
1069 else
1070 res = LLVMConstMul(a, b);
1071 if(shift) {
1072 if(type.sign)
1073 res = LLVMConstAShr(res, shift);
1074 else
1075 res = LLVMConstLShr(res, shift);
1076 }
1077 }
1078 else {
1079 if (type.floating)
1080 res = LLVMBuildFMul(builder, a, b, "");
1081 else
1082 res = LLVMBuildMul(builder, a, b, "");
1083 if(shift) {
1084 if(type.sign)
1085 res = LLVMBuildAShr(builder, res, shift, "");
1086 else
1087 res = LLVMBuildLShr(builder, res, shift, "");
1088 }
1089 }
1090
1091 return res;
1092 }
1093
1094 /*
1095 * Widening mul, valid for 32x32 bit -> 64bit only.
1096 * Result is low 32bits, high bits returned in res_hi.
1097 *
1098 * Emits code that is meant to be compiled for the host CPU.
1099 */
1100 LLVMValueRef
1101 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1102 LLVMValueRef a,
1103 LLVMValueRef b,
1104 LLVMValueRef *res_hi)
1105 {
1106 struct gallivm_state *gallivm = bld->gallivm;
1107 LLVMBuilderRef builder = gallivm->builder;
1108
1109 assert(bld->type.width == 32);
1110 assert(bld->type.floating == 0);
1111 assert(bld->type.fixed == 0);
1112 assert(bld->type.norm == 0);
1113
1114 /*
1115 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1116 * for x86 simd is atrocious (even if the high bits weren't required),
1117 * trying to handle real 64bit inputs (which of course can't happen due
1118 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1119 * apparently llvm does not recognize this widening mul). This includes 6
1120 * (instead of 2) pmuludq plus extra adds and shifts
1121 * The same story applies to signed mul, albeit fixing this requires sse41.
1122 * https://llvm.org/bugs/show_bug.cgi?id=30845
1123 * So, whip up our own code, albeit only for length 4 and 8 (which
1124 * should be good enough)...
1125 */
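   /*
    * Lane layout used below (length 4 shown): pmuludq/pmuldq multiply
    * dword lanes 0 and 2 of their inputs into two 64-bit products, so a/b
    * are used directly for the even lanes and shuffled down by one lane
    * for the odd lanes; the 32-bit halves of the 64-bit products are then
    * interleaved back into the low/high result vectors at the end.
    */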
1126 if ((bld->type.length == 4 || bld->type.length == 8) &&
1127 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1128 util_cpu_caps.has_sse4_1)) {
1129 const char *intrinsic = NULL;
1130 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1131 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1132 struct lp_type type_wide = lp_wider_type(bld->type);
1133 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1134 unsigned i;
1135 for (i = 0; i < bld->type.length; i += 2) {
1136 shuf[i] = lp_build_const_int32(gallivm, i+1);
1137 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1138 }
1139 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1140 aeven = a;
1141 beven = b;
1142 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1143 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1144
1145 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1146 if (bld->type.sign) {
1147 intrinsic = "llvm.x86.avx2.pmul.dq";
1148 } else {
1149 intrinsic = "llvm.x86.avx2.pmulu.dq";
1150 }
1151 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1152 wider_type, aeven, beven);
1153 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1154 wider_type, aodd, bodd);
1155 }
1156 else {
1157 /* for consistent naming look elsewhere... */
1158 if (bld->type.sign) {
1159 intrinsic = "llvm.x86.sse41.pmuldq";
1160 } else {
1161 intrinsic = "llvm.x86.sse2.pmulu.dq";
1162 }
1163 /*
1164 * XXX If we only have AVX but not AVX2 this is a pain.
1165 * lp_build_intrinsic_binary_anylength() can't handle it
1166 * (due to src and dst type not being identical).
1167 */
1168 if (bld->type.length == 8) {
1169 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1170 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1171 LLVMValueRef muleven2[2], mulodd2[2];
1172 struct lp_type type_wide_half = type_wide;
1173 LLVMTypeRef wtype_half;
1174 type_wide_half.length = 2;
1175 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1176 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1177 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1178 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1179 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1180 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1181 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1182 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1183 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1184 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1185 wtype_half, aevenlo, bevenlo);
1186 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1187 wtype_half, aoddlo, boddlo);
1188 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1189 wtype_half, aevenhi, bevenhi);
1190 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1191 wtype_half, aoddhi, boddhi);
1192 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1193 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1194
1195 }
1196 else {
1197 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1198 wider_type, aeven, beven);
1199 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1200 wider_type, aodd, bodd);
1201 }
1202 }
1203 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1204 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1205
1206 for (i = 0; i < bld->type.length; i += 2) {
1207 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1208 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1209 }
1210 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1211 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1212
1213 for (i = 0; i < bld->type.length; i += 2) {
1214 shuf[i] = lp_build_const_int32(gallivm, i);
1215 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1216 }
1217 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1218 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1219 }
1220 else {
1221 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1222 }
1223 }
1224
1225
1226 /*
1227 * Widening mul, valid for 32x32 bit -> 64bit only.
1228 * Result is low 32bits, high bits returned in res_hi.
1229 *
1230 * Emits generic code.
1231 */
1232 LLVMValueRef
1233 lp_build_mul_32_lohi(struct lp_build_context *bld,
1234 LLVMValueRef a,
1235 LLVMValueRef b,
1236 LLVMValueRef *res_hi)
1237 {
1238 struct gallivm_state *gallivm = bld->gallivm;
1239 LLVMBuilderRef builder = gallivm->builder;
1240 LLVMValueRef tmp, shift, res_lo;
1241 struct lp_type type_tmp;
1242 LLVMTypeRef wide_type, narrow_type;
1243
1244 type_tmp = bld->type;
1245 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1246 type_tmp.width *= 2;
1247 wide_type = lp_build_vec_type(gallivm, type_tmp);
1248 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1249
1250 if (bld->type.sign) {
1251 a = LLVMBuildSExt(builder, a, wide_type, "");
1252 b = LLVMBuildSExt(builder, b, wide_type, "");
1253 } else {
1254 a = LLVMBuildZExt(builder, a, wide_type, "");
1255 b = LLVMBuildZExt(builder, b, wide_type, "");
1256 }
1257 tmp = LLVMBuildMul(builder, a, b, "");
1258
1259 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1260
1261 /* Since we truncate anyway, LShr and AShr are equivalent. */
1262 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1263 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1264
1265 return res_lo;
1266 }
1267
1268
1269 /* a * b + c */
1270 LLVMValueRef
1271 lp_build_mad(struct lp_build_context *bld,
1272 LLVMValueRef a,
1273 LLVMValueRef b,
1274 LLVMValueRef c)
1275 {
1276 const struct lp_type type = bld->type;
1277 if (type.floating) {
1278 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1279 } else {
1280 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1281 }
1282 }
1283
1284
1285 /**
1286 * Small vector x scale multiplication optimization.
1287 */
1288 LLVMValueRef
1289 lp_build_mul_imm(struct lp_build_context *bld,
1290 LLVMValueRef a,
1291 int b)
1292 {
1293 LLVMBuilderRef builder = bld->gallivm->builder;
1294 LLVMValueRef factor;
1295
1296 assert(lp_check_value(bld->type, a));
1297
1298 if(b == 0)
1299 return bld->zero;
1300
1301 if(b == 1)
1302 return a;
1303
1304 if(b == -1)
1305 return lp_build_negate(bld, a);
1306
1307 if(b == 2 && bld->type.floating)
1308 return lp_build_add(bld, a, a);
1309
1310 if(util_is_power_of_two(b)) {
1311 unsigned shift = ffs(b) - 1;
1312
1313 if(bld->type.floating) {
1314 #if 0
1315 /*
1316 * Power of two multiplication by directly manipulating the exponent.
1317 *
1318 * XXX: This might not be always faster, it will introduce a small error
1319 * for multiplication by zero, and it will produce wrong results
1320 * for Inf and NaN.
1321 */
1322 unsigned mantissa = lp_mantissa(bld->type);
1323 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1324 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1325 a = LLVMBuildAdd(builder, a, factor, "");
1326 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1327 return a;
1328 #endif
1329 }
1330 else {
1331 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1332 return LLVMBuildShl(builder, a, factor, "");
1333 }
1334 }
1335
1336 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1337 return lp_build_mul(bld, a, factor);
1338 }
1339
1340
1341 /**
1342 * Generate a / b
1343 */
1344 LLVMValueRef
1345 lp_build_div(struct lp_build_context *bld,
1346 LLVMValueRef a,
1347 LLVMValueRef b)
1348 {
1349 LLVMBuilderRef builder = bld->gallivm->builder;
1350 const struct lp_type type = bld->type;
1351
1352 assert(lp_check_value(type, a));
1353 assert(lp_check_value(type, b));
1354
1355 if(a == bld->zero)
1356 return bld->zero;
1357 if(a == bld->one && type.floating)
1358 return lp_build_rcp(bld, b);
1359 if(b == bld->zero)
1360 return bld->undef;
1361 if(b == bld->one)
1362 return a;
1363 if(a == bld->undef || b == bld->undef)
1364 return bld->undef;
1365
1366 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1367 if (type.floating)
1368 return LLVMConstFDiv(a, b);
1369 else if (type.sign)
1370 return LLVMConstSDiv(a, b);
1371 else
1372 return LLVMConstUDiv(a, b);
1373 }
1374
1375    /* fast rcp is disabled (it just uses div), so it makes no sense to try that */
1376 if(FALSE &&
1377 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1378 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1379 type.floating)
1380 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1381
1382 if (type.floating)
1383 return LLVMBuildFDiv(builder, a, b, "");
1384 else if (type.sign)
1385 return LLVMBuildSDiv(builder, a, b, "");
1386 else
1387 return LLVMBuildUDiv(builder, a, b, "");
1388 }
1389
1390
1391 /**
1392 * Linear interpolation helper.
1393 *
1394  * @param flags  LP_BLD_LERP_x flags; LP_BLD_LERP_WIDE_NORMALIZED indicates we
1395  *        are interpolating normalized values, encoded in integers twice as wide.
1396 *
1397 * @sa http://www.stereopsis.com/doubleblend.html
1398 */
1399 static inline LLVMValueRef
1400 lp_build_lerp_simple(struct lp_build_context *bld,
1401 LLVMValueRef x,
1402 LLVMValueRef v0,
1403 LLVMValueRef v1,
1404 unsigned flags)
1405 {
1406 unsigned half_width = bld->type.width/2;
1407 LLVMBuilderRef builder = bld->gallivm->builder;
1408 LLVMValueRef delta;
1409 LLVMValueRef res;
1410
1411 assert(lp_check_value(bld->type, x));
1412 assert(lp_check_value(bld->type, v0));
1413 assert(lp_check_value(bld->type, v1));
1414
1415 delta = lp_build_sub(bld, v1, v0);
1416
1417 if (bld->type.floating) {
1418 assert(flags == 0);
1419 return lp_build_mad(bld, x, delta, v0);
1420 }
1421
1422 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1423 if (!bld->type.sign) {
1424 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1425 /*
1426 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1427           * most significant bit into the least significant bit, so that
1428 * later we can just divide by 2**n instead of 2**n - 1.
1429 */
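         /*
          * E.g. with 8-bit weights (half_width == 8): x == 255 becomes
          * 255 + (255 >> 7) == 256, so the (x * delta) >> 8 below yields
          * exactly delta for a weight of 1.0.
          */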
1430
1431 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1432 }
1433
1434 /* (x * delta) >> n */
1435 res = lp_build_mul(bld, x, delta);
1436 res = lp_build_shr_imm(bld, res, half_width);
1437 } else {
1438 /*
1439 * The rescaling trick above doesn't work for signed numbers, so
1440           * use the 2**n - 1 division approximation in lp_build_mul_norm
1441 * instead.
1442 */
1443 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1444 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1445 }
1446 } else {
1447 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1448 res = lp_build_mul(bld, x, delta);
1449 }
1450
1451 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1452 /*
1453 * At this point both res and v0 only use the lower half of the bits,
1454 * the rest is zero. Instead of add / mask, do add with half wide type.
1455 */
1456 struct lp_type narrow_type;
1457 struct lp_build_context narrow_bld;
1458
1459 memset(&narrow_type, 0, sizeof narrow_type);
1460 narrow_type.sign = bld->type.sign;
1461 narrow_type.width = bld->type.width/2;
1462 narrow_type.length = bld->type.length*2;
1463
1464 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1465 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1466 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1467 res = lp_build_add(&narrow_bld, v0, res);
1468 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1469 } else {
1470 res = lp_build_add(bld, v0, res);
1471
1472 if (bld->type.fixed) {
1473 /*
1474           * We need to mask out the high order bits when lerping 8-bit
1475           * normalized colors stored in 16 bits
1476           */
1477          /* XXX: This step is necessary for lerping 8-bit colors stored in
1478           * 16 bits, but it will be wrong for true fixed point use cases.
1479 * Basically we need a more powerful lp_type, capable of further
1480 * distinguishing the values interpretation from the value storage.
1481 */
1482 LLVMValueRef low_bits;
1483 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1484 res = LLVMBuildAnd(builder, res, low_bits, "");
1485 }
1486 }
1487
1488 return res;
1489 }
1490
1491
1492 /**
1493 * Linear interpolation.
1494 */
1495 LLVMValueRef
1496 lp_build_lerp(struct lp_build_context *bld,
1497 LLVMValueRef x,
1498 LLVMValueRef v0,
1499 LLVMValueRef v1,
1500 unsigned flags)
1501 {
1502 const struct lp_type type = bld->type;
1503 LLVMValueRef res;
1504
1505 assert(lp_check_value(type, x));
1506 assert(lp_check_value(type, v0));
1507 assert(lp_check_value(type, v1));
1508
1509 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1510
1511 if (type.norm) {
1512 struct lp_type wide_type;
1513 struct lp_build_context wide_bld;
1514 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1515
1516 assert(type.length >= 2);
1517
1518 /*
1519 * Create a wider integer type, enough to hold the
1520 * intermediate result of the multiplication.
1521 */
1522 memset(&wide_type, 0, sizeof wide_type);
1523 wide_type.sign = type.sign;
1524 wide_type.width = type.width*2;
1525 wide_type.length = type.length/2;
1526
1527 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1528
1529 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1530 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1531 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1532
1533 /*
1534 * Lerp both halves.
1535 */
1536
1537 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1538
1539 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1540 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1541
1542 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1543 } else {
1544 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1545 }
1546
1547 return res;
1548 }
1549
1550
1551 /**
1552 * Bilinear interpolation.
1553 *
1554  * Value indices are in v_{yx}.
1555 */
1556 LLVMValueRef
1557 lp_build_lerp_2d(struct lp_build_context *bld,
1558 LLVMValueRef x,
1559 LLVMValueRef y,
1560 LLVMValueRef v00,
1561 LLVMValueRef v01,
1562 LLVMValueRef v10,
1563 LLVMValueRef v11,
1564 unsigned flags)
1565 {
1566 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1567 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1568 return lp_build_lerp(bld, y, v0, v1, flags);
1569 }
1570
1571
1572 LLVMValueRef
1573 lp_build_lerp_3d(struct lp_build_context *bld,
1574 LLVMValueRef x,
1575 LLVMValueRef y,
1576 LLVMValueRef z,
1577 LLVMValueRef v000,
1578 LLVMValueRef v001,
1579 LLVMValueRef v010,
1580 LLVMValueRef v011,
1581 LLVMValueRef v100,
1582 LLVMValueRef v101,
1583 LLVMValueRef v110,
1584 LLVMValueRef v111,
1585 unsigned flags)
1586 {
1587 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1588 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1589 return lp_build_lerp(bld, z, v0, v1, flags);
1590 }
1591
1592
1593 /**
1594 * Generate min(a, b)
1595  * Do checks for special cases, but not for NaNs.
1596 */
1597 LLVMValueRef
1598 lp_build_min(struct lp_build_context *bld,
1599 LLVMValueRef a,
1600 LLVMValueRef b)
1601 {
1602 assert(lp_check_value(bld->type, a));
1603 assert(lp_check_value(bld->type, b));
1604
1605 if(a == bld->undef || b == bld->undef)
1606 return bld->undef;
1607
1608 if(a == b)
1609 return a;
1610
1611 if (bld->type.norm) {
1612 if (!bld->type.sign) {
1613 if (a == bld->zero || b == bld->zero) {
1614 return bld->zero;
1615 }
1616 }
1617 if(a == bld->one)
1618 return b;
1619 if(b == bld->one)
1620 return a;
1621 }
1622
1623 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1624 }
1625
1626
1627 /**
1628 * Generate min(a, b)
1629  * NaNs are handled according to the behavior specified by the
1630 * nan_behavior argument.
1631 */
1632 LLVMValueRef
1633 lp_build_min_ext(struct lp_build_context *bld,
1634 LLVMValueRef a,
1635 LLVMValueRef b,
1636 enum gallivm_nan_behavior nan_behavior)
1637 {
1638 assert(lp_check_value(bld->type, a));
1639 assert(lp_check_value(bld->type, b));
1640
1641 if(a == bld->undef || b == bld->undef)
1642 return bld->undef;
1643
1644 if(a == b)
1645 return a;
1646
1647 if (bld->type.norm) {
1648 if (!bld->type.sign) {
1649 if (a == bld->zero || b == bld->zero) {
1650 return bld->zero;
1651 }
1652 }
1653 if(a == bld->one)
1654 return b;
1655 if(b == bld->one)
1656 return a;
1657 }
1658
1659 return lp_build_min_simple(bld, a, b, nan_behavior);
1660 }
1661
1662 /**
1663 * Generate max(a, b)
1664 * Do checks for special cases, but NaN behavior is undefined.
1665 */
1666 LLVMValueRef
1667 lp_build_max(struct lp_build_context *bld,
1668 LLVMValueRef a,
1669 LLVMValueRef b)
1670 {
1671 assert(lp_check_value(bld->type, a));
1672 assert(lp_check_value(bld->type, b));
1673
1674 if(a == bld->undef || b == bld->undef)
1675 return bld->undef;
1676
1677 if(a == b)
1678 return a;
1679
1680 if(bld->type.norm) {
1681 if(a == bld->one || b == bld->one)
1682 return bld->one;
1683 if (!bld->type.sign) {
1684 if (a == bld->zero) {
1685 return b;
1686 }
1687 if (b == bld->zero) {
1688 return a;
1689 }
1690 }
1691 }
1692
1693 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1694 }
1695
1696
1697 /**
1698 * Generate max(a, b)
1699 * Checks for special cases.
1700  * NaNs are handled according to the behavior specified by the
1701 * nan_behavior argument.
1702 */
1703 LLVMValueRef
1704 lp_build_max_ext(struct lp_build_context *bld,
1705 LLVMValueRef a,
1706 LLVMValueRef b,
1707 enum gallivm_nan_behavior nan_behavior)
1708 {
1709 assert(lp_check_value(bld->type, a));
1710 assert(lp_check_value(bld->type, b));
1711
1712 if(a == bld->undef || b == bld->undef)
1713 return bld->undef;
1714
1715 if(a == b)
1716 return a;
1717
1718 if(bld->type.norm) {
1719 if(a == bld->one || b == bld->one)
1720 return bld->one;
1721 if (!bld->type.sign) {
1722 if (a == bld->zero) {
1723 return b;
1724 }
1725 if (b == bld->zero) {
1726 return a;
1727 }
1728 }
1729 }
1730
1731 return lp_build_max_simple(bld, a, b, nan_behavior);
1732 }
1733
1734 /**
1735 * Generate clamp(a, min, max)
1736 * NaN behavior (for any of a, min, max) is undefined.
1737 * Do checks for special cases.
1738 */
1739 LLVMValueRef
1740 lp_build_clamp(struct lp_build_context *bld,
1741 LLVMValueRef a,
1742 LLVMValueRef min,
1743 LLVMValueRef max)
1744 {
1745 assert(lp_check_value(bld->type, a));
1746 assert(lp_check_value(bld->type, min));
1747 assert(lp_check_value(bld->type, max));
1748
1749 a = lp_build_min(bld, a, max);
1750 a = lp_build_max(bld, a, min);
1751 return a;
1752 }
1753
1754
1755 /**
1756 * Generate clamp(a, 0, 1)
1757 * A NaN will get converted to zero.
1758 */
1759 LLVMValueRef
1760 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1761 LLVMValueRef a)
1762 {
1763 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1764 a = lp_build_min(bld, a, bld->one);
1765 return a;
1766 }
1767
1768
1769 /**
1770 * Generate abs(a)
1771 */
1772 LLVMValueRef
1773 lp_build_abs(struct lp_build_context *bld,
1774 LLVMValueRef a)
1775 {
1776 LLVMBuilderRef builder = bld->gallivm->builder;
1777 const struct lp_type type = bld->type;
1778 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1779
1780 assert(lp_check_value(type, a));
1781
1782 if(!type.sign)
1783 return a;
1784
1785 if(type.floating) {
1786 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1787 /* Workaround llvm.org/PR27332 */
1788 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1789 unsigned long long absMask = ~(1ULL << (type.width - 1));
1790 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1791 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1792 a = LLVMBuildAnd(builder, a, mask, "");
1793 a = LLVMBuildBitCast(builder, a, vec_type, "");
1794 return a;
1795 } else {
1796 char intrinsic[32];
1797 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1798 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1799 }
1800 }
1801
1802 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
1803 switch(type.width) {
1804 case 8:
1805 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1806 case 16:
1807 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1808 case 32:
1809 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1810 }
1811 }
1812 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
1813 switch(type.width) {
1814 case 8:
1815 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1816 case 16:
1817 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1818 case 32:
1819 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1820 }
1821 }
1822
1823 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1824 a, LLVMBuildNeg(builder, a, ""));
1825 }
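/*
 * Illustrative note (not part of the generated code): for 32-bit floats the
 * mask above is ~(1 << 31) = 0x7fffffff, so e.g. the bit pattern of -2.0f
 * (0xc0000000) ANDed with the mask yields 0x40000000, which is 2.0f.
 */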
1826
1827
1828 LLVMValueRef
1829 lp_build_negate(struct lp_build_context *bld,
1830 LLVMValueRef a)
1831 {
1832 LLVMBuilderRef builder = bld->gallivm->builder;
1833
1834 assert(lp_check_value(bld->type, a));
1835
1836 if (bld->type.floating)
1837 a = LLVMBuildFNeg(builder, a, "");
1838 else
1839 a = LLVMBuildNeg(builder, a, "");
1840
1841 return a;
1842 }
1843
1844
1845 /** Return -1, 0 or +1 depending on the sign of a */
1846 LLVMValueRef
1847 lp_build_sgn(struct lp_build_context *bld,
1848 LLVMValueRef a)
1849 {
1850 LLVMBuilderRef builder = bld->gallivm->builder;
1851 const struct lp_type type = bld->type;
1852 LLVMValueRef cond;
1853 LLVMValueRef res;
1854
1855 assert(lp_check_value(type, a));
1856
1857 /* Handle non-zero case */
1858 if(!type.sign) {
1859 /* if not zero then sign must be positive */
1860 res = bld->one;
1861 }
1862 else if(type.floating) {
1863 LLVMTypeRef vec_type;
1864 LLVMTypeRef int_type;
1865 LLVMValueRef mask;
1866 LLVMValueRef sign;
1867 LLVMValueRef one;
1868 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1869
1870 int_type = lp_build_int_vec_type(bld->gallivm, type);
1871 vec_type = lp_build_vec_type(bld->gallivm, type);
1872 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1873
1874 /* Take the sign bit and add it to 1 constant */
1875 sign = LLVMBuildBitCast(builder, a, int_type, "");
1876 sign = LLVMBuildAnd(builder, sign, mask, "");
1877 one = LLVMConstBitCast(bld->one, int_type);
1878 res = LLVMBuildOr(builder, sign, one, "");
1879 res = LLVMBuildBitCast(builder, res, vec_type, "");
1880 }
1881 else
1882 {
1883 /* signed int/norm/fixed point */
1884 /* could use psign with sse3 and appropriate vectors here */
1885 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1886 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1887 res = lp_build_select(bld, cond, bld->one, minus_one);
1888 }
1889
1890 /* Handle zero */
1891 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1892 res = lp_build_select(bld, cond, bld->zero, res);
1893
1894 return res;
1895 }
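/*
 * Worked example for the floating point path above (illustrative only):
 * for a = -3.0f the extracted sign bit is 0x80000000; OR'ing it into the
 * bit pattern of 1.0f (0x3f800000) gives 0xbf800000, i.e. -1.0f.  The
 * final equality select then maps a == 0.0 to 0.0.
 */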
1896
1897
1898 /**
1899 * Set the sign of float vector 'a' according to 'sign'.
1900 * If sign==0, return abs(a).
1901 * If sign==1, return -abs(a);
1902 * Other values for sign produce undefined results.
1903 */
1904 LLVMValueRef
1905 lp_build_set_sign(struct lp_build_context *bld,
1906 LLVMValueRef a, LLVMValueRef sign)
1907 {
1908 LLVMBuilderRef builder = bld->gallivm->builder;
1909 const struct lp_type type = bld->type;
1910 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1911 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1912 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1913 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1914 ~((unsigned long long) 1 << (type.width - 1)));
1915 LLVMValueRef val, res;
1916
1917 assert(type.floating);
1918 assert(lp_check_value(type, a));
1919
1920 /* val = reinterpret_cast<int>(a) */
1921 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1922 /* val = val & mask */
1923 val = LLVMBuildAnd(builder, val, mask, "");
1924 /* sign = sign << shift */
1925 sign = LLVMBuildShl(builder, sign, shift, "");
1926 /* res = val | sign */
1927 res = LLVMBuildOr(builder, val, sign, "");
1928 /* res = reinterpret_cast<float>(res) */
1929 res = LLVMBuildBitCast(builder, res, vec_type, "");
1930
1931 return res;
1932 }
1933
1934
1935 /**
1936 * Convert vector of (or scalar) int to vector of (or scalar) float.
1937 */
1938 LLVMValueRef
1939 lp_build_int_to_float(struct lp_build_context *bld,
1940 LLVMValueRef a)
1941 {
1942 LLVMBuilderRef builder = bld->gallivm->builder;
1943 const struct lp_type type = bld->type;
1944 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1945
1946 assert(type.floating);
1947
1948 return LLVMBuildSIToFP(builder, a, vec_type, "");
1949 }
1950
1951 static boolean
1952 arch_rounding_available(const struct lp_type type)
1953 {
1954 if ((util_cpu_caps.has_sse4_1 &&
1955 (type.length == 1 || type.width*type.length == 128)) ||
1956 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
1957 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
1958 return TRUE;
1959 else if ((util_cpu_caps.has_altivec &&
1960 (type.width == 32 && type.length == 4)))
1961 return TRUE;
1962
1963 return FALSE;
1964 }
1965
1966 enum lp_build_round_mode
1967 {
1968 LP_BUILD_ROUND_NEAREST = 0,
1969 LP_BUILD_ROUND_FLOOR = 1,
1970 LP_BUILD_ROUND_CEIL = 2,
1971 LP_BUILD_ROUND_TRUNCATE = 3
1972 };
1973
1974 static inline LLVMValueRef
1975 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1976 LLVMValueRef a)
1977 {
1978 LLVMBuilderRef builder = bld->gallivm->builder;
1979 const struct lp_type type = bld->type;
1980 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1981 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1982 const char *intrinsic;
1983 LLVMValueRef res;
1984
1985 assert(type.floating);
1986 /* using the double precision conversions is a bit more complicated */
1987 assert(type.width == 32);
1988
1989 assert(lp_check_value(type, a));
1990 assert(util_cpu_caps.has_sse2);
1991
1992 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1993 if (type.length == 1) {
1994 LLVMTypeRef vec_type;
1995 LLVMValueRef undef;
1996 LLVMValueRef arg;
1997 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1998
1999 vec_type = LLVMVectorType(bld->elem_type, 4);
2000
2001 intrinsic = "llvm.x86.sse.cvtss2si";
2002
2003 undef = LLVMGetUndef(vec_type);
2004
2005 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2006
2007 res = lp_build_intrinsic_unary(builder, intrinsic,
2008 ret_type, arg);
2009 }
2010 else {
2011 if (type.width* type.length == 128) {
2012 intrinsic = "llvm.x86.sse2.cvtps2dq";
2013 }
2014 else {
2015 assert(type.width*type.length == 256);
2016 assert(util_cpu_caps.has_avx);
2017
2018 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2019 }
2020 res = lp_build_intrinsic_unary(builder, intrinsic,
2021 ret_type, a);
2022 }
2023
2024 return res;
2025 }
2026
2027
2028 /* Round a float vector using one of the AltiVec vrfi* intrinsics. */
2030 static inline LLVMValueRef
2031 lp_build_round_altivec(struct lp_build_context *bld,
2032 LLVMValueRef a,
2033 enum lp_build_round_mode mode)
2034 {
2035 LLVMBuilderRef builder = bld->gallivm->builder;
2036 const struct lp_type type = bld->type;
2037 const char *intrinsic = NULL;
2038
2039 assert(type.floating);
2040
2041 assert(lp_check_value(type, a));
2042 assert(util_cpu_caps.has_altivec);
2043
2044 (void)type;
2045
2046 switch (mode) {
2047 case LP_BUILD_ROUND_NEAREST:
2048 intrinsic = "llvm.ppc.altivec.vrfin";
2049 break;
2050 case LP_BUILD_ROUND_FLOOR:
2051 intrinsic = "llvm.ppc.altivec.vrfim";
2052 break;
2053 case LP_BUILD_ROUND_CEIL:
2054 intrinsic = "llvm.ppc.altivec.vrfip";
2055 break;
2056 case LP_BUILD_ROUND_TRUNCATE:
2057 intrinsic = "llvm.ppc.altivec.vrfiz";
2058 break;
2059 }
2060
2061 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2062 }
2063
2064 static inline LLVMValueRef
2065 lp_build_round_arch(struct lp_build_context *bld,
2066 LLVMValueRef a,
2067 enum lp_build_round_mode mode)
2068 {
2069 if (util_cpu_caps.has_sse4_1) {
2070 LLVMBuilderRef builder = bld->gallivm->builder;
2071 const struct lp_type type = bld->type;
2072 const char *intrinsic_root;
2073 char intrinsic[32];
2074
2075 assert(type.floating);
2076 assert(lp_check_value(type, a));
2077 (void)type;
2078
2079 switch (mode) {
2080 case LP_BUILD_ROUND_NEAREST:
2081 intrinsic_root = "llvm.nearbyint";
2082 break;
2083 case LP_BUILD_ROUND_FLOOR:
2084 intrinsic_root = "llvm.floor";
2085 break;
2086 case LP_BUILD_ROUND_CEIL:
2087 intrinsic_root = "llvm.ceil";
2088 break;
2089 case LP_BUILD_ROUND_TRUNCATE:
2090 intrinsic_root = "llvm.trunc";
2091 break;
2092 }
2093
2094 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2095 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2096 }
2097 else /* (util_cpu_caps.has_altivec) */
2098 return lp_build_round_altivec(bld, a, mode);
2099 }
2100
2101 /**
2102 * Return the integer part of a float (vector) value (== round toward zero).
2103 * The returned value is a float (vector).
2104 * Ex: trunc(-1.5) = -1.0
2105 */
2106 LLVMValueRef
2107 lp_build_trunc(struct lp_build_context *bld,
2108 LLVMValueRef a)
2109 {
2110 LLVMBuilderRef builder = bld->gallivm->builder;
2111 const struct lp_type type = bld->type;
2112
2113 assert(type.floating);
2114 assert(lp_check_value(type, a));
2115
2116 if (arch_rounding_available(type)) {
2117 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2118 }
2119 else {
2120 const struct lp_type type = bld->type;
2121 struct lp_type inttype;
2122 struct lp_build_context intbld;
2123 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2124 LLVMValueRef trunc, res, anosign, mask;
2125 LLVMTypeRef int_vec_type = bld->int_vec_type;
2126 LLVMTypeRef vec_type = bld->vec_type;
2127
2128 assert(type.width == 32); /* might want to handle doubles at some point */
2129
2130 inttype = type;
2131 inttype.floating = 0;
2132 lp_build_context_init(&intbld, bld->gallivm, inttype);
2133
2134 /* round by truncation */
2135 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2136 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2137
2138 /* mask out sign bit */
2139 anosign = lp_build_abs(bld, a);
2140 /*
2141 * mask out all values if anosign > 2^24
2142 * This should work both for large ints (all rounding is no-op for them
2143 * because such floats are always exact) as well as special cases like
2144 * NaNs, Infs (taking advantage of the fact they use max exponent).
2145 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2146 */
2147 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2148 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2149 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2150 return lp_build_select(bld, mask, a, res);
2151 }
2152 }
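/*
 * Worked example for the emulated path above (illustrative only):
 * trunc(-1.5): fptosi gives -1, sitofp gives -1.0, |a| = 1.5 < 2^24 so the
 * truncated value is selected.  trunc(3.0e38): |a| > 2^24, so 'a' itself is
 * returned, which is correct since such large floats are already integral
 * (the same select also passes NaNs and Infs through unchanged).
 */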
2153
2154
2155 /**
2156 * Return float (vector) rounded to nearest integer (vector). The returned
2157 * value is a float (vector).
2158 * Ex: round(0.9) = 1.0
2159 * Ex: round(-1.5) = -2.0
2160 */
2161 LLVMValueRef
2162 lp_build_round(struct lp_build_context *bld,
2163 LLVMValueRef a)
2164 {
2165 LLVMBuilderRef builder = bld->gallivm->builder;
2166 const struct lp_type type = bld->type;
2167
2168 assert(type.floating);
2169 assert(lp_check_value(type, a));
2170
2171 if (arch_rounding_available(type)) {
2172 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2173 }
2174 else {
2175 const struct lp_type type = bld->type;
2176 struct lp_type inttype;
2177 struct lp_build_context intbld;
2178 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2179 LLVMValueRef res, anosign, mask;
2180 LLVMTypeRef int_vec_type = bld->int_vec_type;
2181 LLVMTypeRef vec_type = bld->vec_type;
2182
2183 assert(type.width == 32); /* might want to handle doubles at some point */
2184
2185 inttype = type;
2186 inttype.floating = 0;
2187 lp_build_context_init(&intbld, bld->gallivm, inttype);
2188
2189 res = lp_build_iround(bld, a);
2190 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2191
2192 /* mask out sign bit */
2193 anosign = lp_build_abs(bld, a);
2194 /*
2195 * mask out all values if anosign > 2^24
2196 * This should work both for large ints (all rounding is no-op for them
2197 * because such floats are always exact) as well as special cases like
2198 * NaNs, Infs (taking advantage of the fact they use max exponent).
2199 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2200 */
2201 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2202 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2203 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2204 return lp_build_select(bld, mask, a, res);
2205 }
2206 }
2207
2208
2209 /**
2210 * Return floor of float (vector), result is a float (vector)
2211 * Ex: floor(1.1) = 1.0
2212 * Ex: floor(-1.1) = -2.0
2213 */
2214 LLVMValueRef
2215 lp_build_floor(struct lp_build_context *bld,
2216 LLVMValueRef a)
2217 {
2218 LLVMBuilderRef builder = bld->gallivm->builder;
2219 const struct lp_type type = bld->type;
2220
2221 assert(type.floating);
2222 assert(lp_check_value(type, a));
2223
2224 if (arch_rounding_available(type)) {
2225 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2226 }
2227 else {
2228 const struct lp_type type = bld->type;
2229 struct lp_type inttype;
2230 struct lp_build_context intbld;
2231 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2232 LLVMValueRef trunc, res, anosign, mask;
2233 LLVMTypeRef int_vec_type = bld->int_vec_type;
2234 LLVMTypeRef vec_type = bld->vec_type;
2235
2236 if (type.width != 32) {
2237 char intrinsic[32];
2238 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2239 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2240 }
2241
2242 assert(type.width == 32); /* might want to handle doubles at some point */
2243
2244 inttype = type;
2245 inttype.floating = 0;
2246 lp_build_context_init(&intbld, bld->gallivm, inttype);
2247
2248 /* round by truncation */
2249 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2250 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2251
2252 if (type.sign) {
2253 LLVMValueRef tmp;
2254
2255 /*
2256 * fix values if rounding is wrong (for non-special cases)
2257 * - this is the case if trunc > a
2258 */
2259 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2260 /* tmp = trunc > a ? 1.0 : 0.0 */
2261 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2262 tmp = lp_build_and(&intbld, mask, tmp);
2263 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2264 res = lp_build_sub(bld, res, tmp);
2265 }
2266
2267 /* mask out sign bit */
2268 anosign = lp_build_abs(bld, a);
2269 /*
2270 * mask out all values if anosign > 2^24
2271 * This should work both for large ints (all rounding is no-op for them
2272 * because such floats are always exact) as well as special cases like
2273 * NaNs, Infs (taking advantage of the fact they use max exponent).
2274 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2275 */
2276 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2277 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2278 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2279 return lp_build_select(bld, mask, a, res);
2280 }
2281 }
2282
2283
2284 /**
2285 * Return ceiling of float (vector), returning float (vector).
2286 * Ex: ceil( 1.1) = 2.0
2287 * Ex: ceil(-1.1) = -1.0
2288 */
2289 LLVMValueRef
2290 lp_build_ceil(struct lp_build_context *bld,
2291 LLVMValueRef a)
2292 {
2293 LLVMBuilderRef builder = bld->gallivm->builder;
2294 const struct lp_type type = bld->type;
2295
2296 assert(type.floating);
2297 assert(lp_check_value(type, a));
2298
2299 if (arch_rounding_available(type)) {
2300 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2301 }
2302 else {
2303 const struct lp_type type = bld->type;
2304 struct lp_type inttype;
2305 struct lp_build_context intbld;
2306 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2307 LLVMValueRef trunc, res, anosign, mask, tmp;
2308 LLVMTypeRef int_vec_type = bld->int_vec_type;
2309 LLVMTypeRef vec_type = bld->vec_type;
2310
2311 if (type.width != 32) {
2312 char intrinsic[32];
2313 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2314 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2315 }
2316
2317 assert(type.width == 32); /* might want to handle doubles at some point */
2318
2319 inttype = type;
2320 inttype.floating = 0;
2321 lp_build_context_init(&intbld, bld->gallivm, inttype);
2322
2323 /* round by truncation */
2324 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2325 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2326
2327 /*
2328 * fix values if rounding is wrong (for non-special cases)
2329 * - this is the case if trunc < a
2330 */
2331 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2332 /* tmp = trunc < a ? 1.0 : 0.0 */
2333 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2334 tmp = lp_build_and(&intbld, mask, tmp);
2335 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2336 res = lp_build_add(bld, trunc, tmp);
2337
2338 /* mask out sign bit */
2339 anosign = lp_build_abs(bld, a);
2340 /*
2341 * mask out all values if anosign > 2^24
2342 * This should work both for large ints (all rounding is no-op for them
2343 * because such floats are always exact) as well as special cases like
2344 * NaNs, Infs (taking advantage of the fact they use max exponent).
2345 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2346 */
2347 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2348 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2349 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2350 return lp_build_select(bld, mask, a, res);
2351 }
2352 }
2353
2354
2355 /**
2356 * Return fractional part of 'a' computed as a - floor(a)
2357 * Typically used in texture coord arithmetic.
2358 */
2359 LLVMValueRef
2360 lp_build_fract(struct lp_build_context *bld,
2361 LLVMValueRef a)
2362 {
2363 assert(bld->type.floating);
2364 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2365 }
2366
2367
2368 /**
2369 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2370 * against 0.99999(9). (Will also return that value for NaNs.)
2371 */
2372 static inline LLVMValueRef
2373 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2374 {
2375 LLVMValueRef max;
2376
2377 /* this is the largest number smaller than 1.0 representable as float */
2378 max = lp_build_const_vec(bld->gallivm, bld->type,
2379 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2380 return lp_build_min_ext(bld, fract, max,
2381 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2382 }
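/*
 * For reference (illustrative only): with 32-bit floats lp_mantissa() is 23,
 * so the clamp value is 1.0 - 1.0/2^24 = 0.99999994, the largest float
 * strictly below 1.0.
 */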
2383
2384
2385 /**
2386 * Same as lp_build_fract, but guarantees that the result is always smaller
2387 * than one. Will also return the smaller-than-one value for infs, NaNs.
2388 */
2389 LLVMValueRef
2390 lp_build_fract_safe(struct lp_build_context *bld,
2391 LLVMValueRef a)
2392 {
2393 return clamp_fract(bld, lp_build_fract(bld, a));
2394 }
2395
2396
2397 /**
2398 * Return the integer part of a float (vector) value (== round toward zero).
2399 * The returned value is an integer (vector).
2400 * Ex: itrunc(-1.5) = -1
2401 */
2402 LLVMValueRef
2403 lp_build_itrunc(struct lp_build_context *bld,
2404 LLVMValueRef a)
2405 {
2406 LLVMBuilderRef builder = bld->gallivm->builder;
2407 const struct lp_type type = bld->type;
2408 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2409
2410 assert(type.floating);
2411 assert(lp_check_value(type, a));
2412
2413 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2414 }
2415
2416
2417 /**
2418 * Return float (vector) rounded to nearest integer (vector). The returned
2419 * value is an integer (vector).
2420 * Ex: iround(0.9) = 1
2421 * Ex: iround(-1.5) = -2
2422 */
2423 LLVMValueRef
2424 lp_build_iround(struct lp_build_context *bld,
2425 LLVMValueRef a)
2426 {
2427 LLVMBuilderRef builder = bld->gallivm->builder;
2428 const struct lp_type type = bld->type;
2429 LLVMTypeRef int_vec_type = bld->int_vec_type;
2430 LLVMValueRef res;
2431
2432 assert(type.floating);
2433
2434 assert(lp_check_value(type, a));
2435
2436 if ((util_cpu_caps.has_sse2 &&
2437 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2438 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2439 return lp_build_iround_nearest_sse2(bld, a);
2440 }
2441 if (arch_rounding_available(type)) {
2442 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2443 }
2444 else {
2445 LLVMValueRef half;
2446
2447 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2448
2449 if (type.sign) {
2450 LLVMTypeRef vec_type = bld->vec_type;
2451 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2452 (unsigned long long)1 << (type.width - 1));
2453 LLVMValueRef sign;
2454
2455 /* get sign bit */
2456 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2457 sign = LLVMBuildAnd(builder, sign, mask, "");
2458
2459 /* sign * 0.5 */
2460 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2461 half = LLVMBuildOr(builder, sign, half, "");
2462 half = LLVMBuildBitCast(builder, half, vec_type, "");
2463 }
2464
2465 res = LLVMBuildFAdd(builder, a, half, "");
2466 }
2467
2468 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2469
2470 return res;
2471 }
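/*
 * Worked example for the bias-and-truncate fallback above (illustrative
 * only): iround(0.9) computes 0.9 + 0.5 = 1.4 and truncates to 1;
 * iround(-1.5) ORs the sign into the bias giving -0.5, computes
 * -1.5 + -0.5 = -2.0 and truncates to -2.
 */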
2472
2473
2474 /**
2475 * Return floor of float (vector), result is an int (vector)
2476 * Ex: ifloor(1.1) = 1
2477 * Ex: ifloor(-1.1) = -2
2478 */
2479 LLVMValueRef
2480 lp_build_ifloor(struct lp_build_context *bld,
2481 LLVMValueRef a)
2482 {
2483 LLVMBuilderRef builder = bld->gallivm->builder;
2484 const struct lp_type type = bld->type;
2485 LLVMTypeRef int_vec_type = bld->int_vec_type;
2486 LLVMValueRef res;
2487
2488 assert(type.floating);
2489 assert(lp_check_value(type, a));
2490
2491 res = a;
2492 if (type.sign) {
2493 if (arch_rounding_available(type)) {
2494 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2495 }
2496 else {
2497 struct lp_type inttype;
2498 struct lp_build_context intbld;
2499 LLVMValueRef trunc, itrunc, mask;
2500
2501 assert(type.floating);
2502 assert(lp_check_value(type, a));
2503
2504 inttype = type;
2505 inttype.floating = 0;
2506 lp_build_context_init(&intbld, bld->gallivm, inttype);
2507
2508 /* round by truncation */
2509 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2510 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2511
2512 /*
2513 * fix values if rounding is wrong (for non-special cases)
2514 * - this is the case if trunc > a
2515 * The results of doing this with NaNs, very large values etc.
2516 * are undefined but this seems to be the case anyway.
2517 */
2518 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2519 /* cheapie minus one with mask since the mask is minus one / zero */
2520 return lp_build_add(&intbld, itrunc, mask);
2521 }
2522 }
2523
2524 /* round to nearest (toward zero) */
2525 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2526
2527 return res;
2528 }
2529
2530
2531 /**
2532 * Return ceiling of float (vector), returning int (vector).
2533 * Ex: iceil( 1.1) = 2
2534 * Ex: iceil(-1.1) = -1
2535 */
2536 LLVMValueRef
2537 lp_build_iceil(struct lp_build_context *bld,
2538 LLVMValueRef a)
2539 {
2540 LLVMBuilderRef builder = bld->gallivm->builder;
2541 const struct lp_type type = bld->type;
2542 LLVMTypeRef int_vec_type = bld->int_vec_type;
2543 LLVMValueRef res;
2544
2545 assert(type.floating);
2546 assert(lp_check_value(type, a));
2547
2548 if (arch_rounding_available(type)) {
2549 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2550 }
2551 else {
2552 struct lp_type inttype;
2553 struct lp_build_context intbld;
2554 LLVMValueRef trunc, itrunc, mask;
2555
2556 assert(type.floating);
2557 assert(lp_check_value(type, a));
2558
2559 inttype = type;
2560 inttype.floating = 0;
2561 lp_build_context_init(&intbld, bld->gallivm, inttype);
2562
2563 /* round by truncation */
2564 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2565 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2566
2567 /*
2568 * fix values if rounding is wrong (for non-special cases)
2569 * - this is the case if trunc < a
2570 * The results of doing this with NaNs, very large values etc.
2571 * are undefined but this seems to be the case anyway.
2572 */
2573 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2574 /* cheapie plus one with mask since the mask is minus one / zero */
2575 return lp_build_sub(&intbld, itrunc, mask);
2576 }
2577
2578 /* round to nearest (toward zero) */
2579 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2580
2581 return res;
2582 }
2583
2584
2585 /**
2586 * Combined ifloor() & fract().
2587 *
2588 * Preferred to calling the functions separately, as it will ensure that the
2589 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2590 */
2591 void
2592 lp_build_ifloor_fract(struct lp_build_context *bld,
2593 LLVMValueRef a,
2594 LLVMValueRef *out_ipart,
2595 LLVMValueRef *out_fpart)
2596 {
2597 LLVMBuilderRef builder = bld->gallivm->builder;
2598 const struct lp_type type = bld->type;
2599 LLVMValueRef ipart;
2600
2601 assert(type.floating);
2602 assert(lp_check_value(type, a));
2603
2604 if (arch_rounding_available(type)) {
2605 /*
2606 * floor() is easier.
2607 */
2608
2609 ipart = lp_build_floor(bld, a);
2610 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2611 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2612 }
2613 else {
2614 /*
2615 * ifloor() is easier.
2616 */
2617
2618 *out_ipart = lp_build_ifloor(bld, a);
2619 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2620 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2621 }
2622 }
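/*
 * Usage sketch (names are illustrative, not taken from this file): splitting
 * a texture coordinate into a texel index and an interpolation weight:
 *
 *    LLVMValueRef ipart, fpart;
 *    lp_build_ifloor_fract(&coord_bld, coord, &ipart, &fpart);
 *    // ipart: integer vector used for the texel fetch
 *    // fpart: float vector in [0, 1) used as the lerp weight
 */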
2623
2624
2625 /**
2626 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2627 * always smaller than one.
2628 */
2629 void
2630 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2631 LLVMValueRef a,
2632 LLVMValueRef *out_ipart,
2633 LLVMValueRef *out_fpart)
2634 {
2635 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2636 *out_fpart = clamp_fract(bld, *out_fpart);
2637 }
2638
2639
2640 LLVMValueRef
2641 lp_build_sqrt(struct lp_build_context *bld,
2642 LLVMValueRef a)
2643 {
2644 LLVMBuilderRef builder = bld->gallivm->builder;
2645 const struct lp_type type = bld->type;
2646 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2647 char intrinsic[32];
2648
2649 assert(lp_check_value(type, a));
2650
2651 assert(type.floating);
2652 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2653
2654 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2655 }
2656
2657
2658 /**
2659 * Do one Newton-Raphson step to improve reciprocal precision:
2660 *
2661 * x_{i+1} = x_i * (2 - a * x_i)
2662 *
2663 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2664 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2665 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2666 * halo. It would be necessary to clamp the argument to prevent this.
2667 *
2668 * See also:
2669 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2670 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2671 */
2672 static inline LLVMValueRef
2673 lp_build_rcp_refine(struct lp_build_context *bld,
2674 LLVMValueRef a,
2675 LLVMValueRef rcp_a)
2676 {
2677 LLVMBuilderRef builder = bld->gallivm->builder;
2678 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2679 LLVMValueRef res;
2680
2681 res = LLVMBuildFMul(builder, a, rcp_a, "");
2682 res = LLVMBuildFSub(builder, two, res, "");
2683 res = LLVMBuildFMul(builder, rcp_a, res, "");
2684
2685 return res;
2686 }
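/*
 * Worked example (illustrative only): for a = 3.0 and an initial estimate
 * rcp_a = 0.3, one step gives 0.3 * (2 - 3.0 * 0.3) = 0.33 and a second
 * step gives 0.33 * (2 - 0.99) = 0.3333, converging quadratically to 1/3.
 */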
2687
2688
2689 LLVMValueRef
2690 lp_build_rcp(struct lp_build_context *bld,
2691 LLVMValueRef a)
2692 {
2693 LLVMBuilderRef builder = bld->gallivm->builder;
2694 const struct lp_type type = bld->type;
2695
2696 assert(lp_check_value(type, a));
2697
2698 if(a == bld->zero)
2699 return bld->undef;
2700 if(a == bld->one)
2701 return bld->one;
2702 if(a == bld->undef)
2703 return bld->undef;
2704
2705 assert(type.floating);
2706
2707 if(LLVMIsConstant(a))
2708 return LLVMConstFDiv(bld->one, a);
2709
2710 /*
2711 * We don't use RCPPS because:
2712 * - it only has 10 bits of precision
2713 * - it doesn't even get the reciprocal of 1.0 exactly
2714 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2715 * - for recent processors the benefit over DIVPS is marginal, and case
2716 * dependent
2717 *
2718 * We could still use it on certain processors if benchmarks show that the
2719 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2720 * particular uses that require less workarounds.
2721 */
2722
2723 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2724 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2725 const unsigned num_iterations = 0;
2726 LLVMValueRef res;
2727 unsigned i;
2728 const char *intrinsic = NULL;
2729
2730 if (type.length == 4) {
2731 intrinsic = "llvm.x86.sse.rcp.ps";
2732 }
2733 else {
2734 intrinsic = "llvm.x86.avx.rcp.ps.256";
2735 }
2736
2737 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2738
2739 for (i = 0; i < num_iterations; ++i) {
2740 res = lp_build_rcp_refine(bld, a, res);
2741 }
2742
2743 return res;
2744 }
2745
2746 return LLVMBuildFDiv(builder, bld->one, a, "");
2747 }
2748
2749
2750 /**
2751 * Do one Newton-Raphson step to improve rsqrt precision:
2752 *
2753 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2754 *
2755 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2756 */
2757 static inline LLVMValueRef
2758 lp_build_rsqrt_refine(struct lp_build_context *bld,
2759 LLVMValueRef a,
2760 LLVMValueRef rsqrt_a)
2761 {
2762 LLVMBuilderRef builder = bld->gallivm->builder;
2763 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2764 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2765 LLVMValueRef res;
2766
2767 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2768 res = LLVMBuildFMul(builder, a, res, "");
2769 res = LLVMBuildFSub(builder, three, res, "");
2770 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2771 res = LLVMBuildFMul(builder, half, res, "");
2772
2773 return res;
2774 }
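/*
 * Worked example (illustrative only): for a = 4.0 and an initial estimate
 * rsqrt_a = 0.51, one step gives
 *   0.5 * 0.51 * (3.0 - 4.0 * 0.51 * 0.51) = 0.4997,
 * roughly doubling the number of correct bits towards 1/sqrt(4) = 0.5.
 */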
2775
2776
2777 /**
2778 * Generate 1/sqrt(a).
2779 * Result is undefined for values < 0, infinity for +0.
2780 */
2781 LLVMValueRef
2782 lp_build_rsqrt(struct lp_build_context *bld,
2783 LLVMValueRef a)
2784 {
2785 const struct lp_type type = bld->type;
2786
2787 assert(lp_check_value(type, a));
2788
2789 assert(type.floating);
2790
2791 /*
2792 * This should be faster but all denormals will end up as infinity.
2793 */
2794 if (0 && lp_build_fast_rsqrt_available(type)) {
2795 const unsigned num_iterations = 1;
2796 LLVMValueRef res;
2797 unsigned i;
2798
2799 /* rsqrt(1.0) != 1.0 here */
2800 res = lp_build_fast_rsqrt(bld, a);
2801
2802 if (num_iterations) {
2803 /*
2804 * Newton-Raphson will result in NaN instead of infinity for zero,
2805 * and NaN instead of zero for infinity.
2806 * Also, need to ensure rsqrt(1.0) == 1.0.
2807 * All numbers smaller than FLT_MIN will result in +infinity
2808 * (rsqrtps treats all denormals as zero).
2809 */
2810 LLVMValueRef cmp;
2811 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2812 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2813
2814 for (i = 0; i < num_iterations; ++i) {
2815 res = lp_build_rsqrt_refine(bld, a, res);
2816 }
2817 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2818 res = lp_build_select(bld, cmp, inf, res);
2819 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2820 res = lp_build_select(bld, cmp, bld->zero, res);
2821 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2822 res = lp_build_select(bld, cmp, bld->one, res);
2823 }
2824
2825 return res;
2826 }
2827
2828 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2829 }
2830
2831 /**
2832 * Returns true if there's a fast (inaccurate) rsqrt instruction available.
2833 * (A caller may want to avoid calling rsqrt_fast if it's not available:
2834 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that is
2835 * unavailable it would result in sqrt/div/mul, so it is obviously
2836 * much better to just call sqrt, skipping both div and mul.)
2837 */
2838 boolean
2839 lp_build_fast_rsqrt_available(struct lp_type type)
2840 {
2841 assert(type.floating);
2842
2843 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2844 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2845 return true;
2846 }
2847 return false;
2848 }
2849
2850
2851 /**
2852 * Generate 1/sqrt(a).
2853 * Result is undefined for values < 0, infinity for +0.
2854 * Precision is limited, only ~10 bits guaranteed
2855 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2856 */
2857 LLVMValueRef
2858 lp_build_fast_rsqrt(struct lp_build_context *bld,
2859 LLVMValueRef a)
2860 {
2861 LLVMBuilderRef builder = bld->gallivm->builder;
2862 const struct lp_type type = bld->type;
2863
2864 assert(lp_check_value(type, a));
2865
2866 if (lp_build_fast_rsqrt_available(type)) {
2867 const char *intrinsic = NULL;
2868
2869 if (type.length == 4) {
2870 intrinsic = "llvm.x86.sse.rsqrt.ps";
2871 }
2872 else {
2873 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2874 }
2875 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2876 }
2877 else {
2878 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2879 }
2880 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2881 }
2882
2883
2884 /**
2885 * Generate sin(a) or cos(a) using polynomial approximation.
2886 * TODO: it might be worth recognizing sin and cos using same source
2887 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2888 * would be way cheaper than calculating (nearly) everything twice...
2889 * Not sure it's common enough to be worth bothering, however; the scs
2890 * opcode could also benefit from calculating both.
2891 */
2892 static LLVMValueRef
2893 lp_build_sin_or_cos(struct lp_build_context *bld,
2894 LLVMValueRef a,
2895 boolean cos)
2896 {
2897 struct gallivm_state *gallivm = bld->gallivm;
2898 LLVMBuilderRef b = gallivm->builder;
2899 struct lp_type int_type = lp_int_type(bld->type);
2900
2901 /*
2902 * take the absolute value,
2903 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2904 */
2905
2906 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2907 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2908
2909 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2910 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2911
2912 /*
2913 * scale by 4/Pi
2914 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2915 */
2916
2917 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2918 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2919
2920 /*
2921 * store the integer part of y in mm0
2922 * emm2 = _mm_cvttps_epi32(y);
2923 */
2924
2925 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2926
2927 /*
2928 * j=(j+1) & (~1) (see the cephes sources)
2929 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2930 */
2931
2932 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2933 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2934 /*
2935 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2936 */
2937 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2938 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2939
2940 /*
2941 * y = _mm_cvtepi32_ps(emm2);
2942 */
2943 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2944
2945 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2946 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2947 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2948 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2949
2950 /*
2951 * Argument used for poly selection and sign bit determination
2952 * is different for sin vs. cos.
2953 */
2954 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2955 emm2_and;
2956
2957 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2958 LLVMBuildNot(b, emm2_2, ""), ""),
2959 const_29, "sign_bit") :
2960 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2961 LLVMBuildShl(b, emm2_add,
2962 const_29, ""), ""),
2963 sign_mask, "sign_bit");
2964
2965 /*
2966 * get the polynomial selection mask
2967 * there is one polynomial for 0 <= x <= Pi/4
2968 * and another one for Pi/4 < x <= Pi/2
2969 * Both branches will be computed.
2970 *
2971 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2972 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2973 */
2974
2975 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2976 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2977 int_type, PIPE_FUNC_EQUAL,
2978 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2979
2980 /*
2981 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2982 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2983 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2984 */
2985 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2986 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2987 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2988
2989 /*
2990 * The magic pass: "Extended precision modular arithmetic"
2991 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2992 */
2993 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2994 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2995 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2996
2997 /*
2998 * Evaluate the first polynomial (0 <= x <= Pi/4)
2999 *
3000 * z = _mm_mul_ps(x,x);
3001 */
3002 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3003
3004 /*
3005 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3006 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3007 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3008 */
3009 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3010 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3011 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3012
3013 /*
3014 * y = *(v4sf*)_ps_coscof_p0;
3015 * y = _mm_mul_ps(y, z);
3016 */
3017 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3018 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3019 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3020 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3021
3022
3023 /*
3024 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3025 * y = _mm_sub_ps(y, tmp);
3026 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3027 */
3028 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3029 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3030 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3031 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3032 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3033
3034 /*
3035 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3036 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3037 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3038 */
3039 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3040 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3041 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3042
3043 /*
3044 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
3045 *
3046 * y2 = *(v4sf*)_ps_sincof_p0;
3047 * y2 = _mm_mul_ps(y2, z);
3048 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3049 * y2 = _mm_mul_ps(y2, z);
3050 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3051 * y2 = _mm_mul_ps(y2, z);
3052 * y2 = _mm_mul_ps(y2, x);
3053 * y2 = _mm_add_ps(y2, x);
3054 */
3055
3056 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3057 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3058 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3059 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3060
3061 /*
3062 * select the correct result from the two polynomials
3063 * xmm3 = poly_mask;
3064 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3065 * y = _mm_andnot_ps(xmm3, y);
3066 * y = _mm_or_ps(y,y2);
3067 */
3068 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3069 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3070 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3071 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3072 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3073 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3074
3075 /*
3076 * update the sign
3077 * y = _mm_xor_ps(y, sign_bit);
3078 */
3079 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3080 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3081
3082 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3083
3084 /* clamp output to be within [-1, 1] */
3085 y_result = lp_build_clamp(bld, y_result,
3086 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3087 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3088 /* If a is -inf, inf or NaN then return NaN */
3089 y_result = lp_build_select(bld, isfinite, y_result,
3090 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3091 return y_result;
3092 }
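/*
 * Sanity check of the sine polynomial above (illustrative only): for a
 * reduced argument r = 0.5 (sin branch, z = r*r = 0.25) the evaluation
 *   r + r*z*(sincof_p2 + z*sincof_p1 + z*z*sincof_p0)
 * gives approximately 0.479426, matching sin(0.5) to single precision.
 */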
3093
3094
3095 /**
3096 * Generate sin(a)
3097 */
3098 LLVMValueRef
3099 lp_build_sin(struct lp_build_context *bld,
3100 LLVMValueRef a)
3101 {
3102 return lp_build_sin_or_cos(bld, a, FALSE);
3103 }
3104
3105
3106 /**
3107 * Generate cos(a)
3108 */
3109 LLVMValueRef
3110 lp_build_cos(struct lp_build_context *bld,
3111 LLVMValueRef a)
3112 {
3113 return lp_build_sin_or_cos(bld, a, TRUE);
3114 }
3115
3116
3117 /**
3118 * Generate pow(x, y)
3119 */
3120 LLVMValueRef
3121 lp_build_pow(struct lp_build_context *bld,
3122 LLVMValueRef x,
3123 LLVMValueRef y)
3124 {
3125 /* TODO: optimize the constant case */
3126 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3127 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3128 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3129 __FUNCTION__);
3130 }
3131
3132 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3133 }
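/*
 * Worked example of the identity used above (illustrative only):
 * pow(2.0, 10.0) = exp2(log2(2.0) * 10.0) = exp2(10.0) = 1024.0 (up to the
 * accuracy of the log2/exp2 approximations).
 */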
3134
3135
3136 /**
3137 * Generate exp(x)
3138 */
3139 LLVMValueRef
3140 lp_build_exp(struct lp_build_context *bld,
3141 LLVMValueRef x)
3142 {
3143 /* log2(e) = 1/log(2) */
3144 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3145 1.4426950408889634);
3146
3147 assert(lp_check_value(bld->type, x));
3148
3149 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3150 }
3151
3152
3153 /**
3154 * Generate log(x)
3155 * Behavior is undefined with infs, 0s and nans
3156 */
3157 LLVMValueRef
3158 lp_build_log(struct lp_build_context *bld,
3159 LLVMValueRef x)
3160 {
3161 /* log(2) */
3162 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3163 0.69314718055994529);
3164
3165 assert(lp_check_value(bld->type, x));
3166
3167 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3168 }
3169
3170 /**
3171 * Generate log(x) that handles edge cases (infs, 0s and nans)
3172 */
3173 LLVMValueRef
3174 lp_build_log_safe(struct lp_build_context *bld,
3175 LLVMValueRef x)
3176 {
3177 /* log(2) */
3178 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3179 0.69314718055994529);
3180
3181 assert(lp_check_value(bld->type, x));
3182
3183 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3184 }
3185
3186
3187 /**
3188 * Generate polynomial.
3189 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3190 */
3191 LLVMValueRef
3192 lp_build_polynomial(struct lp_build_context *bld,
3193 LLVMValueRef x,
3194 const double *coeffs,
3195 unsigned num_coeffs)
3196 {
3197 const struct lp_type type = bld->type;
3198 LLVMValueRef even = NULL, odd = NULL;
3199 LLVMValueRef x2;
3200 unsigned i;
3201
3202 assert(lp_check_value(bld->type, x));
3203
3204 /* TODO: optimize the constant case */
3205 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3206 LLVMIsConstant(x)) {
3207 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3208 __FUNCTION__);
3209 }
3210
3211 /*
3212 * Calculate odd and even terms separately to decrease data dependency
3213 * Ex:
3214 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3215 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3216 */
3217 x2 = lp_build_mul(bld, x, x);
3218
3219 for (i = num_coeffs; i--; ) {
3220 LLVMValueRef coeff;
3221
3222 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3223
3224 if (i % 2 == 0) {
3225 if (even)
3226 even = lp_build_mad(bld, x2, even, coeff);
3227 else
3228 even = coeff;
3229 } else {
3230 if (odd)
3231 odd = lp_build_mad(bld, x2, odd, coeff);
3232 else
3233 odd = coeff;
3234 }
3235 }
3236
3237 if (odd)
3238 return lp_build_mad(bld, odd, x, even);
3239 else if (even)
3240 return even;
3241 else
3242 return bld->undef;
3243 }
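/*
 * Worked equation for the even/odd split above (illustrative only): with
 * four coefficients c0..c3 and x2 = x*x the function computes
 *   even = c0 + x2*c2,  odd = c1 + x2*c3,
 *   result = even + x*odd = c0 + c1*x + c2*x^2 + c3*x^3,
 * which lets the two Horner chains run in parallel.
 */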
3244
3245
3246 /**
3247 * Minimax polynomial fit of 2**x, in range [0, 1[
3248 */
3249 const double lp_build_exp2_polynomial[] = {
3250 #if EXP_POLY_DEGREE == 5
3251 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3252 0.693153073200168932794,
3253 0.240153617044375388211,
3254 0.0558263180532956664775,
3255 0.00898934009049466391101,
3256 0.00187757667519147912699
3257 #elif EXP_POLY_DEGREE == 4
3258 1.00000259337069434683,
3259 0.693003834469974940458,
3260 0.24144275689150793076,
3261 0.0520114606103070150235,
3262 0.0135341679161270268764
3263 #elif EXP_POLY_DEGREE == 3
3264 0.999925218562710312959,
3265 0.695833540494823811697,
3266 0.226067155427249155588,
3267 0.0780245226406372992967
3268 #elif EXP_POLY_DEGREE == 2
3269 1.00172476321474503578,
3270 0.657636275736077639316,
3271 0.33718943461968720704
3272 #else
3273 #error
3274 #endif
3275 };
3276
3277
3278 LLVMValueRef
3279 lp_build_exp2(struct lp_build_context *bld,
3280 LLVMValueRef x)
3281 {
3282 LLVMBuilderRef builder = bld->gallivm->builder;
3283 const struct lp_type type = bld->type;
3284 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3285 LLVMValueRef ipart = NULL;
3286 LLVMValueRef fpart = NULL;
3287 LLVMValueRef expipart = NULL;
3288 LLVMValueRef expfpart = NULL;
3289 LLVMValueRef res = NULL;
3290
3291 assert(lp_check_value(bld->type, x));
3292
3293 /* TODO: optimize the constant case */
3294 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3295 LLVMIsConstant(x)) {
3296 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3297 __FUNCTION__);
3298 }
3299
3300 assert(type.floating && type.width == 32);
3301
3302 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3303 * the result is INF and if it's smaller than -126.9 the result is 0 */
3304 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3305 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3306 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3307 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3308
3309 /* ipart = floor(x) */
3310 /* fpart = x - ipart */
3311 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3312
3313 /* expipart = (float) (1 << ipart) */
3314 expipart = LLVMBuildAdd(builder, ipart,
3315 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3316 expipart = LLVMBuildShl(builder, expipart,
3317 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3318 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3319
3320 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3321 ARRAY_SIZE(lp_build_exp2_polynomial));
3322
3323 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3324
3325 return res;
3326 }
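/*
 * Worked example of the decomposition above (illustrative only): for
 * x = 3.5, ifloor/fract give ipart = 3 and fpart = 0.5; expipart is built
 * by placing 3 + 127 = 130 in the exponent field, i.e. 8.0, and the
 * polynomial gives expfpart ~= 2^0.5 ~= 1.41421, so the result is
 * ~= 11.3137 = 2^3.5.
 */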
3327
3328
3329
3330 /**
3331 * Extract the exponent of an IEEE-754 floating point value.
3332 *
3333 * Optionally apply an integer bias.
3334 *
3335 * Result is an integer value with
3336 *
3337 * ifloor(log2(x)) + bias
3338 */
3339 LLVMValueRef
3340 lp_build_extract_exponent(struct lp_build_context *bld,
3341 LLVMValueRef x,
3342 int bias)
3343 {
3344 LLVMBuilderRef builder = bld->gallivm->builder;
3345 const struct lp_type type = bld->type;
3346 unsigned mantissa = lp_mantissa(type);
3347 LLVMValueRef res;
3348
3349 assert(type.floating);
3350
3351 assert(lp_check_value(bld->type, x));
3352
3353 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3354
3355 res = LLVMBuildLShr(builder, x,
3356 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3357 res = LLVMBuildAnd(builder, res,
3358 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3359 res = LLVMBuildSub(builder, res,
3360 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3361
3362 return res;
3363 }
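/*
 * Worked example (illustrative only): for x = 12.0f the biased exponent
 * field is 130, so with bias = 0 the result is 130 - 127 = 3, which is
 * ifloor(log2(12.0)) as documented above.
 */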
3364
3365
3366 /**
3367 * Extract the mantissa of a floating point value.
3368 *
3369 * Result is a floating point value with
3370 *
3371 * x / 2^floor(log2(x))
3372 */
3373 LLVMValueRef
3374 lp_build_extract_mantissa(struct lp_build_context *bld,
3375 LLVMValueRef x)
3376 {
3377 LLVMBuilderRef builder = bld->gallivm->builder;
3378 const struct lp_type type = bld->type;
3379 unsigned mantissa = lp_mantissa(type);
3380 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3381 (1ULL << mantissa) - 1);
3382 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3383 LLVMValueRef res;
3384
3385 assert(lp_check_value(bld->type, x));
3386
3387 assert(type.floating);
3388
3389 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3390
3391 /* res = x / 2**ipart */
3392 res = LLVMBuildAnd(builder, x, mantmask, "");
3393 res = LLVMBuildOr(builder, res, one, "");
3394 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3395
3396 return res;
3397 }
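/*
 * Worked example (illustrative only): for x = 12.0f the stored mantissa
 * bits represent 1.5; OR'ing them into the bit pattern of 1.0 rebuilds
 * 1.5 = 12.0 / 2^3, i.e. x scaled into [1, 2).
 */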
3398
3399
3400
3401 /**
3402 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range of [0, 1/9[
3403 * These coefficients can be generated with
3404 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3405 */
3406 const double lp_build_log2_polynomial[] = {
3407 #if LOG_POLY_DEGREE == 5
3408 2.88539008148777786488L,
3409 0.961796878841293367824L,
3410 0.577058946784739859012L,
3411 0.412914355135828735411L,
3412 0.308591899232910175289L,
3413 0.352376952300281371868L,
3414 #elif LOG_POLY_DEGREE == 4
3415 2.88539009343309178325L,
3416 0.961791550404184197881L,
3417 0.577440339438736392009L,
3418 0.403343858251329912514L,
3419 0.406718052498846252698L,
3420 #elif LOG_POLY_DEGREE == 3
3421 2.88538959748872753838L,
3422 0.961932915889597772928L,
3423 0.571118517972136195241L,
3424 0.493997535084709500285L,
3425 #else
3426 #error
3427 #endif
3428 };
3429
3430 /**
3431 * See http://www.devmaster.net/forums/showthread.php?p=43580
3432 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3433 * http://www.nezumi.demon.co.uk/consult/logx.htm
3434 *
3435 * If handle_edge_cases is true the function will perform computations
3436 * to match the required D3D10+ behavior for each of the edge cases.
3437 * That means that if input is:
3438 * - less than zero (down to and including -inf) then NaN will be returned
3439 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3440 * - +infinity, then +infinity will be returned
3441 * - NaN, then NaN will be returned
3442 *
3443 * Those checks are fairly expensive so if you don't need them make sure
3444 * handle_edge_cases is false.
3445 */
3446 void
3447 lp_build_log2_approx(struct lp_build_context *bld,
3448 LLVMValueRef x,
3449 LLVMValueRef *p_exp,
3450 LLVMValueRef *p_floor_log2,
3451 LLVMValueRef *p_log2,
3452 boolean handle_edge_cases)
3453 {
3454 LLVMBuilderRef builder = bld->gallivm->builder;
3455 const struct lp_type type = bld->type;
3456 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3457 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3458
3459 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3460 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3461 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3462
3463 LLVMValueRef i = NULL;
3464 LLVMValueRef y = NULL;
3465 LLVMValueRef z = NULL;
3466 LLVMValueRef exp = NULL;
3467 LLVMValueRef mant = NULL;
3468 LLVMValueRef logexp = NULL;
3469 LLVMValueRef p_z = NULL;
3470 LLVMValueRef res = NULL;
3471
3472 assert(lp_check_value(bld->type, x));
3473
3474 if(p_exp || p_floor_log2 || p_log2) {
3475 /* TODO: optimize the constant case */
3476 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3477 LLVMIsConstant(x)) {
3478 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3479 __FUNCTION__);
3480 }
3481
3482 assert(type.floating && type.width == 32);
3483
3484 /*
3485 * We don't explicitly handle denormalized numbers. They will yield a
3486 * result in the neighbourhood of -127, which appears to be
3487 * adequate.
3488 */
3489
3490 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3491
3492 /* exp = (float) exponent(x) */
3493 exp = LLVMBuildAnd(builder, i, expmask, "");
3494 }
3495
3496 if(p_floor_log2 || p_log2) {
3497 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3498 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3499 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3500 }
3501
3502 if (p_log2) {
3503 /* mant = 1 + (float) mantissa(x) */
3504 mant = LLVMBuildAnd(builder, i, mantmask, "");
3505 mant = LLVMBuildOr(builder, mant, one, "");
3506 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3507
3508 /* y = (mant - 1) / (mant + 1) */
3509 y = lp_build_div(bld,
3510 lp_build_sub(bld, mant, bld->one),
3511 lp_build_add(bld, mant, bld->one)
3512 );
3513
3514 /* z = y^2 */
3515 z = lp_build_mul(bld, y, y);
3516
3517 /* compute P(z) */
3518 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3519 ARRAY_SIZE(lp_build_log2_polynomial));
3520
3521 /* y * P(z) + logexp */
3522 res = lp_build_mad(bld, y, p_z, logexp);
3523
3524 if (type.floating && handle_edge_cases) {
3525 LLVMValueRef negmask, infmask, zmask;
3526 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3527 lp_build_const_vec(bld->gallivm, type, 0.0f));
3528 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3529 lp_build_const_vec(bld->gallivm, type, 0.0f));
3530 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3531 lp_build_const_vec(bld->gallivm, type, INFINITY));
3532
3533 /* If x is equal to inf make sure we return inf */
3534 res = lp_build_select(bld, infmask,
3535 lp_build_const_vec(bld->gallivm, type, INFINITY),
3536 res);
3537 /* If x is equal to 0, return -inf */
3538 res = lp_build_select(bld, zmask,
3539 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3540 res);
3541 /* If x is NaN or less than 0, return NaN */
3542 res = lp_build_select(bld, negmask,
3543 lp_build_const_vec(bld->gallivm, type, NAN),
3544 res);
3545 }
3546 }
3547
3548 if (p_exp) {
3549 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3550 *p_exp = exp;
3551 }
3552
3553 if (p_floor_log2)
3554 *p_floor_log2 = logexp;
3555
3556 if (p_log2)
3557 *p_log2 = res;
3558 }
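
/*
 * A scalar reference of the approximation above, for illustration only
 * (hypothetical helper, binary32 only, no edge case handling; assumes the
 * polynomial table lists the constant term first, matching the order in
 * which lp_build_polynomial() consumes it):
 *
 *    static inline float
 *    example_log2_approx(float x)
 *    {
 *       union { float f; uint32_t u; } v;
 *       v.f = x;
 *       float logexp = (float)(int)((v.u >> 23) & 0xff) - 127.0f;
 *       v.u = (v.u & 0x007fffffu) | 0x3f800000u;      // mant in [1, 2)
 *       float y = (v.f - 1.0f) / (v.f + 1.0f);
 *       float z = y * y;
 *       float p = 0.0f;
 *       for (int i = ARRAY_SIZE(lp_build_log2_polynomial); i--; )
 *          p = p * z + lp_build_log2_polynomial[i];   // Horner evaluation
 *       return y * p + logexp;
 *    }
 */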
3559
3560
3561 /*
3562 * log2 implementation which doesn't have special code to
3563 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3564 * the results for those cases are undefined.
3565 */
3566 LLVMValueRef
3567 lp_build_log2(struct lp_build_context *bld,
3568 LLVMValueRef x)
3569 {
3570 LLVMValueRef res;
3571 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3572 return res;
3573 }
3574
3575 /*
3576 * Version of log2 which handles all edge cases.
3577 * Look at documentation of lp_build_log2_approx for
3578 * description of the behavior for each of the edge cases.
3579 */
3580 LLVMValueRef
3581 lp_build_log2_safe(struct lp_build_context *bld,
3582 LLVMValueRef x)
3583 {
3584 LLVMValueRef res;
3585 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3586 return res;
3587 }
3588
3589
3590 /**
3591 * Faster (and less accurate) log2.
3592 *
3593 * log2(x) ~= floor(log2(x)) - 1 + x / 2**floor(log2(x))
3594 *
3595 * Piece-wise linear approximation, with exact results when x is a
3596 * power of two.
3597 *
3598 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3599 */
3600 LLVMValueRef
3601 lp_build_fast_log2(struct lp_build_context *bld,
3602 LLVMValueRef x)
3603 {
3604 LLVMBuilderRef builder = bld->gallivm->builder;
3605 LLVMValueRef ipart;
3606 LLVMValueRef fpart;
3607
3608 assert(lp_check_value(bld->type, x));
3609
3610 assert(bld->type.floating);
3611
3612 /* ipart = floor(log2(x)) - 1 */
3613 ipart = lp_build_extract_exponent(bld, x, -1);
3614 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3615
3616 /* fpart = x / 2**floor(log2(x)), i.e. in [1, 2) */
3617 fpart = lp_build_extract_mantissa(bld, x);
3618
3619 /* ipart + fpart */
3620 return LLVMBuildFAdd(builder, ipart, fpart, "");
3621 }
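
/*
 * Worked example (illustrative): for x = 6.0 = 2^2 * 1.5, ipart =
 * floor(log2(6)) - 1 = 1 and fpart = 6 / 2^2 = 1.5, giving 1 + 1.5 = 2.5
 * versus the exact log2(6) = 2.585; for a power of two such as x = 8 the
 * result 2 + 1 = 3 is exact.
 */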
3622
3623
3624 /**
3625 * Fast implementation of iround(log2(x)).
3626 *
3627 * Not an approximation -- it should give accurate results all the time.
3628 */
3629 LLVMValueRef
3630 lp_build_ilog2(struct lp_build_context *bld,
3631 LLVMValueRef x)
3632 {
3633 LLVMBuilderRef builder = bld->gallivm->builder;
3634 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3635 LLVMValueRef ipart;
3636
3637 assert(bld->type.floating);
3638
3639 assert(lp_check_value(bld->type, x));
3640
3641 /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3642 x = LLVMBuildFMul(builder, x, sqrt2, "");
3643
3644 /* ipart = floor(log2(x) + 0.5) */
3645 ipart = lp_build_extract_exponent(bld, x, 0);
3646
3647 return ipart;
3648 }
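
/*
 * Why the sqrt(2) multiply above works (illustrative):
 * iround(log2(x)) == floor(log2(x) + 0.5) == floor(log2(x * 2^0.5)), so
 * scaling the input by sqrt(2) lets the plain exponent extraction do the
 * rounding.  E.g. for x = 3.0 (log2 = 1.585): 3 * 1.414 = 4.24, whose
 * exponent is 2, matching iround(1.585) = 2.
 */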
3649
3650 LLVMValueRef
3651 lp_build_mod(struct lp_build_context *bld,
3652 LLVMValueRef x,
3653 LLVMValueRef y)
3654 {
3655 LLVMBuilderRef builder = bld->gallivm->builder;
3656 LLVMValueRef res;
3657 const struct lp_type type = bld->type;
3658
3659 assert(lp_check_value(type, x));
3660 assert(lp_check_value(type, y));
3661
3662 if (type.floating)
3663 res = LLVMBuildFRem(builder, x, y, "");
3664 else if (type.sign)
3665 res = LLVMBuildSRem(builder, x, y, "");
3666 else
3667 res = LLVMBuildURem(builder, x, y, "");
3668 return res;
3669 }
3670
3671
3672 /*
3673 * For floating point inputs, this creates and returns a mask
3674 * which is all 1's for channels which are NaN,
3675 * and all 0's for channels which are not.
3676 */
3677 LLVMValueRef
3678 lp_build_isnan(struct lp_build_context *bld,
3679 LLVMValueRef x)
3680 {
3681 LLVMValueRef mask;
3682 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3683
3684 assert(bld->type.floating);
3685 assert(lp_check_value(bld->type, x));
3686
3687 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3688 "isnotnan");
3689 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3690 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3691 return mask;
3692 }
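
/*
 * This relies on NaN being the only value that compares unequal to itself:
 * the ordered-equal compare of x with x is false exactly for NaN channels,
 * and the Not/SExt turn that into an all-ones integer mask.  Scalar
 * equivalent (illustrative only):
 *
 *    bool is_nan = !(x == x);
 */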
3693
3694 /* Returns all 1's for floating point numbers that are
3695 * finite, and all 0's for -inf,
3696 * +inf and NaN. */
3697 LLVMValueRef
3698 lp_build_isfinite(struct lp_build_context *bld,
3699 LLVMValueRef x)
3700 {
3701 LLVMBuilderRef builder = bld->gallivm->builder;
3702 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3703 struct lp_type int_type = lp_int_type(bld->type);
3704 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3705 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3706 0x7f800000);
3707
3708 if (!bld->type.floating) {
3709 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3710 }
3711 assert(bld->type.floating);
3712 assert(lp_check_value(bld->type, x));
3713 assert(bld->type.width == 32);
3714
3715 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3716 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3717 intx, infornan32);
3718 }
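
/*
 * Scalar sketch of the test above (illustrative only, binary32): a float
 * is finite iff its exponent field is not all ones.
 *
 *    union { float f; uint32_t u; } v;
 *    v.f = x;
 *    bool finite = (v.u & 0x7f800000u) != 0x7f800000u;
 */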
3719
3720 /*
3721 * Returns true if the number is NaN or +/-infinity and false otherwise.
3722 * The input has to be a floating point vector.
3723 */
3724 LLVMValueRef
3725 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3726 const struct lp_type type,
3727 LLVMValueRef x)
3728 {
3729 LLVMBuilderRef builder = gallivm->builder;
3730 struct lp_type int_type = lp_int_type(type);
3731 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3732 0x7f800000);
3733 LLVMValueRef ret;
3734
3735 assert(type.floating);
3736
3737 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3738 ret = LLVMBuildAnd(builder, ret, const0, "");
3739 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3740 ret, const0);
3741
3742 return ret;
3743 }
3744
3745
3746 LLVMValueRef
3747 lp_build_fpstate_get(struct gallivm_state *gallivm)
3748 {
3749 if (util_cpu_caps.has_sse) {
3750 LLVMBuilderRef builder = gallivm->builder;
3751 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3752 gallivm,
3753 LLVMInt32TypeInContext(gallivm->context),
3754 "mxcsr_ptr");
3755 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3756 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3757 lp_build_intrinsic(builder,
3758 "llvm.x86.sse.stmxcsr",
3759 LLVMVoidTypeInContext(gallivm->context),
3760 &mxcsr_ptr8, 1, 0);
3761 return mxcsr_ptr;
3762 }
3763 return 0;
3764 }
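
/*
 * The generated IR stores the MXCSR control/status register to a stack
 * slot via llvm.x86.sse.stmxcsr; the equivalent in host-side SSE
 * intrinsics (illustrative only) would simply be:
 *
 *    unsigned mxcsr = _mm_getcsr();
 */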
3765
3766 void
3767 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3768 boolean zero)
3769 {
3770 if (util_cpu_caps.has_sse) {
3771 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3772 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3773
3774 LLVMBuilderRef builder = gallivm->builder;
3775 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3776 LLVMValueRef mxcsr =
3777 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3778
3779 if (util_cpu_caps.has_daz) {
3780 /* Enable denormals-are-zero mode */
3781 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3782 }
3783 if (zero) {
3784 mxcsr = LLVMBuildOr(builder, mxcsr,
3785 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3786 } else {
3787 mxcsr = LLVMBuildAnd(builder, mxcsr,
3788 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3789 }
3790
3791 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3792 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3793 }
3794 }
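
/*
 * Host-side equivalent of the above (illustrative only): the same DAZ/FTZ
 * bits can be toggled on the CPU running this code with
 *
 *    unsigned csr = _mm_getcsr();
 *    _mm_setcsr(zero ? (csr | daz_ftz) : (csr & ~daz_ftz));
 */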
3795
3796 void
3797 lp_build_fpstate_set(struct gallivm_state *gallivm,
3798 LLVMValueRef mxcsr_ptr)
3799 {
3800 if (util_cpu_caps.has_sse) {
3801 LLVMBuilderRef builder = gallivm->builder;
3802 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3803 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3804 lp_build_intrinsic(builder,
3805 "llvm.x86.sse.ldmxcsr",
3806 LLVMVoidTypeInContext(gallivm->context),
3807 &mxcsr_ptr, 1, 0);
3808 }
3809 }
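
/*
 * Typical usage pattern (a sketch; actual call sites live elsewhere in the
 * driver): save the FP state, force denormals to zero around code that is
 * sensitive to denormal inputs, then restore the original state.
 *
 *    LLVMValueRef fpstate = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit denormal-sensitive code ...
 *    lp_build_fpstate_set(gallivm, fpstate);
 */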
3810