1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all the necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in the [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
/**
 * Generate min(a, b)
 * No checks for special case values of a or b (e.g. 1 or 0) are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
89 static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
      /* We need to handle NaNs for floating point numbers. If one of the
       * inputs is NaN the other should be returned (required by both D3D10+
       * and OpenCL).
       * The SSE intrinsics return the second operand in case of NaN by
       * default, so we need special code to handle those cases.
       */
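      /*
       * Scalar sketch of the fixup below (illustration only, not the emitted
       * IR), writing min_sse(a, b) for the raw intrinsic, which returns its
       * second operand whenever either input is a NaN:
       *
       *   GALLIVM_NAN_RETURN_OTHER:  isnan(b) ? a : min_sse(a, b)
       *   GALLIVM_NAN_RETURN_NAN:    isnan(a) ? a : min_sse(a, b)
       */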
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
277
278
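/**
 * Generate a*b + c.
 * Uses the llvm.fmuladd intrinsic, which lets LLVM emit a fused multiply-add
 * when the target supports it, and falls back to an explicit mul + add on old
 * LLVM versions where the intrinsic lowering is problematic (see below).
 */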
279 LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
      /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA
       * is not supported, and instead falls back to a C function.
       */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
299
300
/**
 * Generate max(a, b)
 * No checks for special case values of a or b (e.g. 1 or 0) are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
307 static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
      if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
/**
 * Generate 1 - a, or ~a depending on bld->type.
 * (For unsigned normalized types all-ones represents 1.0, so ~a is an
 * exact 1 - a.)
 */
495 LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if(a == bld->zero)
545 return b;
546 if(b == bld->zero)
547 return a;
548 if(a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if(bld->type.norm) {
552 const char *intrinsic = NULL;
553
554 if(a == bld->one || b == bld->one)
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (type.width * type.length == 128) {
559 if(util_cpu_caps.has_sse2) {
560 if(type.width == 8)
561 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
562 if(type.width == 16)
563 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
564 } else if (util_cpu_caps.has_altivec) {
565 if(type.width == 8)
566 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
567 if(type.width == 16)
568 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
569 }
570 }
571 if (type.width * type.length == 256) {
572 if(util_cpu_caps.has_avx2) {
573 if(type.width == 8)
574 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
575 if(type.width == 16)
576 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
577 }
578 }
579 }
580
581 if (intrinsic)
582 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
583 }
584
585 if(type.norm && !type.floating && !type.fixed) {
586 if (type.sign) {
587 uint64_t sign = (uint64_t)1 << (type.width - 1);
588 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
589 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
590 /* a_clamp_max is the maximum a for positive b,
591 a_clamp_min is the minimum a for negative b. */
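         /*
          * Worked example (illustration only): for 8 bit signed values
          * max_val = 127 and min_val = -128.  With b > 0, a is clamped to
          * at most 127 - b, so a + b cannot exceed 127; with b < 0, a is
          * clamped to at least -128 - b, so a + b cannot drop below -128.
          */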
592 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
593 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
594 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
595 } else {
596 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
597 }
598 }
599
600 if(LLVMIsConstant(a) && LLVMIsConstant(b))
601 if (type.floating)
602 res = LLVMConstFAdd(a, b);
603 else
604 res = LLVMConstAdd(a, b);
605 else
606 if (type.floating)
607 res = LLVMBuildFAdd(builder, a, b, "");
608 else
609 res = LLVMBuildAdd(builder, a, b, "");
610
611 /* clamp to ceiling of 1.0 */
612 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
613 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
614
615 /* XXX clamp to floor of -1 or 0??? */
616
617 return res;
618 }
619
620
/** Return the scalar sum of the elements of a.
 * This operation should be avoided whenever possible.
 */
624 LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
626 LLVMValueRef a)
627 {
628 LLVMBuilderRef builder = bld->gallivm->builder;
629 const struct lp_type type = bld->type;
630 LLVMValueRef index, res;
631 unsigned i, length;
632 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
633 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
634 LLVMValueRef vecres, elem2;
635
636 assert(lp_check_value(type, a));
637
638 if (type.length == 1) {
639 return a;
640 }
641
642 assert(!bld->type.norm);
643
   /*
    * For byte vectors we could do much better with psadbw.
    * Using repeated shuffles/adds here. Note that with multiple vectors
    * this can be done more efficiently as outlined in the Intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */
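   /*
    * Illustration (not emitted code) of the reduction below for a 4-wide
    * vector <a0 a1 a2 a3>:
    *
    *   <a0 a1 a2 a3> -> <a0+a2, a1+a3> -> (a0+a2) + (a1+a3)
    */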
652
653 vecres = a;
654 length = type.length / 2;
655 while (length > 1) {
656 LLVMValueRef vec1, vec2;
657 for (i = 0; i < length; i++) {
658 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
659 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
660 }
661 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
662 LLVMConstVector(shuffles1, length), "");
663 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
664 LLVMConstVector(shuffles2, length), "");
665 if (type.floating) {
666 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
667 }
668 else {
669 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
670 }
671 length = length >> 1;
672 }
673
674 /* always have vector of size 2 here */
675 assert(length == 1);
676
677 index = lp_build_const_int32(bld->gallivm, 0);
678 res = LLVMBuildExtractElement(builder, vecres, index, "");
679 index = lp_build_const_int32(bld->gallivm, 1);
680 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
681
682 if (type.floating)
683 res = LLVMBuildFAdd(builder, res, elem2, "");
684 else
685 res = LLVMBuildAdd(builder, res, elem2, "");
686
687 return res;
688 }
689
690 /**
691 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique outlined in the Intel Optimization Manual.
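 *
 * Data flow sketch (illustration only), writing src[i] = <ai bi ci di>:
 *
 *   tmp[0] = <a0 b0 a1 b1>   tmp[1] = <c0 d0 c1 d1>
 *   tmp[2] = <a2 b2 a3 b3>   tmp[3] = <c2 d2 c3 d3>
 *   sumtmp[0] = <a0+c0 b0+d0 a1+c1 b1+d1>
 *   sumtmp[1] = <a2+c2 b2+d2 a3+c3 b3+d3>
 *   result    = <a0+b0+c0+d0, a1+b1+c1+d1, a2+b2+c2+d2, a3+b3+c3+d3>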
693 */
694 static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
696 LLVMValueRef src[4])
697 {
698 struct gallivm_state *gallivm = bld->gallivm;
699 LLVMBuilderRef builder = gallivm->builder;
700 LLVMValueRef shuffles[4];
701 LLVMValueRef tmp[4];
702 LLVMValueRef sumtmp[2], shuftmp[2];
703
704 /* lower half of regs */
705 shuffles[0] = lp_build_const_int32(gallivm, 0);
706 shuffles[1] = lp_build_const_int32(gallivm, 1);
707 shuffles[2] = lp_build_const_int32(gallivm, 4);
708 shuffles[3] = lp_build_const_int32(gallivm, 5);
709 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
710 LLVMConstVector(shuffles, 4), "");
711 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
712 LLVMConstVector(shuffles, 4), "");
713
714 /* upper half of regs */
715 shuffles[0] = lp_build_const_int32(gallivm, 2);
716 shuffles[1] = lp_build_const_int32(gallivm, 3);
717 shuffles[2] = lp_build_const_int32(gallivm, 6);
718 shuffles[3] = lp_build_const_int32(gallivm, 7);
719 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
720 LLVMConstVector(shuffles, 4), "");
721 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
722 LLVMConstVector(shuffles, 4), "");
723
724 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
725 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
726
727 shuffles[0] = lp_build_const_int32(gallivm, 0);
728 shuffles[1] = lp_build_const_int32(gallivm, 2);
729 shuffles[2] = lp_build_const_int32(gallivm, 4);
730 shuffles[3] = lp_build_const_int32(gallivm, 6);
731 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
732 LLVMConstVector(shuffles, 4), "");
733
734 shuffles[0] = lp_build_const_int32(gallivm, 1);
735 shuffles[1] = lp_build_const_int32(gallivm, 3);
736 shuffles[2] = lp_build_const_int32(gallivm, 5);
737 shuffles[3] = lp_build_const_int32(gallivm, 7);
738 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
739 LLVMConstVector(shuffles, 4), "");
740
741 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
742 }
743
744
/*
 * Partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming the values are really grouped in 4 (which also determines
 * the output order).
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of the number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
758 LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
760 LLVMValueRef vectors[],
761 unsigned num_vecs)
762 {
763 struct gallivm_state *gallivm = bld->gallivm;
764 LLVMBuilderRef builder = gallivm->builder;
765 LLVMValueRef ret_vec;
766 LLVMValueRef tmp[4];
767 const char *intrinsic = NULL;
768
769 assert(num_vecs >= 2 && num_vecs <= 4);
770 assert(bld->type.floating);
771
772 /* only use this with at least 2 vectors, as it is sort of expensive
773 * (depending on cpu) and we always need two horizontal adds anyway,
774 * so a shuffle/add approach might be better.
775 */
776
777 tmp[0] = vectors[0];
778 tmp[1] = vectors[1];
779
780 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
781 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
782
783 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
784 bld->type.length == 4) {
785 intrinsic = "llvm.x86.sse3.hadd.ps";
786 }
787 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
788 bld->type.length == 8) {
789 intrinsic = "llvm.x86.avx.hadd.ps.256";
790 }
791 if (intrinsic) {
792 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
793 lp_build_vec_type(gallivm, bld->type),
794 tmp[0], tmp[1]);
795 if (num_vecs > 2) {
796 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
797 lp_build_vec_type(gallivm, bld->type),
798 tmp[2], tmp[3]);
799 }
800 else {
801 tmp[1] = tmp[0];
802 }
803 return lp_build_intrinsic_binary(builder, intrinsic,
804 lp_build_vec_type(gallivm, bld->type),
805 tmp[0], tmp[1]);
806 }
807
808 if (bld->type.length == 4) {
809 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
810 }
811 else {
812 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
813 unsigned j;
814 unsigned num_iter = bld->type.length / 4;
815 struct lp_type parttype = bld->type;
816 parttype.length = 4;
817 for (j = 0; j < num_iter; j++) {
818 LLVMValueRef partsrc[4];
819 unsigned i;
820 for (i = 0; i < 4; i++) {
821 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
822 }
823 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
824 }
825 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
826 }
827 return ret_vec;
828 }
829
830 /**
831 * Generate a - b
832 */
833 LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
835 LLVMValueRef a,
836 LLVMValueRef b)
837 {
838 LLVMBuilderRef builder = bld->gallivm->builder;
839 const struct lp_type type = bld->type;
840 LLVMValueRef res;
841
842 assert(lp_check_value(type, a));
843 assert(lp_check_value(type, b));
844
845 if(b == bld->zero)
846 return a;
847 if(a == bld->undef || b == bld->undef)
848 return bld->undef;
849 if(a == b)
850 return bld->zero;
851
852 if(bld->type.norm) {
853 const char *intrinsic = NULL;
854
855 if(b == bld->one)
856 return bld->zero;
857
858 if (!type.floating && !type.fixed) {
859 if (type.width * type.length == 128) {
860 if (util_cpu_caps.has_sse2) {
861 if(type.width == 8)
862 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
863 if(type.width == 16)
864 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
865 } else if (util_cpu_caps.has_altivec) {
866 if(type.width == 8)
867 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
868 if(type.width == 16)
869 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
870 }
871 }
872 if (type.width * type.length == 256) {
873 if (util_cpu_caps.has_avx2) {
874 if(type.width == 8)
875 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
876 if(type.width == 16)
877 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
878 }
879 }
880 }
881
882 if (intrinsic)
883 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
884 }
885
886 if(type.norm && !type.floating && !type.fixed) {
887 if (type.sign) {
888 uint64_t sign = (uint64_t)1 << (type.width - 1);
889 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
890 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
891 /* a_clamp_max is the maximum a for negative b,
892 a_clamp_min is the minimum a for positive b. */
893 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
894 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
895 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
896 } else {
897 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
898 }
899 }
900
901 if(LLVMIsConstant(a) && LLVMIsConstant(b))
902 if (type.floating)
903 res = LLVMConstFSub(a, b);
904 else
905 res = LLVMConstSub(a, b);
906 else
907 if (type.floating)
908 res = LLVMBuildFSub(builder, a, b, "");
909 else
910 res = LLVMBuildSub(builder, a, b, "");
911
912 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
913 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
914
915 return res;
916 }
917
918
919
920 /**
921 * Normalized multiplication.
922 *
923 * There are several approaches for (using 8-bit normalized multiplication as
924 * an example):
925 *
926 * - alpha plus one
927 *
928 * makes the following approximation to the division (Sree)
929 *
 *     a*b/255 ~= (a*(b + 1)) >> 8
931 *
932 * which is the fastest method that satisfies the following OpenGL criteria of
933 *
934 * 0*0 = 0 and 255*255 = 255
935 *
936 * - geometric series
937 *
938 * takes the geometric series approximation to the division
939 *
940 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
941 *
942 * in this case just the first two terms to fit in 16bit arithmetic
943 *
944 * t/255 ~= (t + (t >> 8)) >> 8
945 *
 * note that just by itself it doesn't satisfy the OpenGL criteria, as
 * 255*255 = 254, so the special case b = 255 must be accounted for, or
 * rounding must be used.
949 *
 * - geometric series plus rounding
 *
 *   when using a geometric series division, instead of truncating the result
 *   use rounding in the approximation (Jim Blinn)
 *
 *     t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *   achieving exact results.
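 *
 *   A quick scalar sanity check of the rounding variant (illustration only):
 *
 *     t = 255*255 = 65025:  (65025 + 254 + 0x80) >> 8 = 65407 >> 8 = 255
 *     t =   1*255 =   255:  (  255 +   0 + 0x80) >> 8 =   383 >> 8 =   1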
958 *
959 *
960 *
961 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
962 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
963 * @sa Michael Herf, The "double blend trick", May 2000,
964 * http://www.stereopsis.com/doubleblend.html
965 */
966 static LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
968 struct lp_type wide_type,
969 LLVMValueRef a, LLVMValueRef b)
970 {
971 LLVMBuilderRef builder = gallivm->builder;
972 struct lp_build_context bld;
973 unsigned n;
974 LLVMValueRef half;
975 LLVMValueRef ab;
976
977 assert(!wide_type.floating);
978 assert(lp_check_value(wide_type, a));
979 assert(lp_check_value(wide_type, b));
980
981 lp_build_context_init(&bld, gallivm, wide_type);
982
983 n = wide_type.width / 2;
984 if (wide_type.sign) {
985 --n;
986 }
987
988 /*
989 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
990 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
991 */
992
993 /*
994 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
995 */
996
997 ab = LLVMBuildMul(builder, a, b, "");
998 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
999
1000 /*
1001 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1002 */
1003
1004 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1005 if (wide_type.sign) {
1006 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1007 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1008 half = lp_build_select(&bld, sign, minus_half, half);
1009 }
1010 ab = LLVMBuildAdd(builder, ab, half, "");
1011
1012 /* Final division */
1013 ab = lp_build_shr_imm(&bld, ab, n);
1014
1015 return ab;
1016 }
1017
1018 /**
1019 * Generate a * b
1020 */
1021 LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
1023 LLVMValueRef a,
1024 LLVMValueRef b)
1025 {
1026 LLVMBuilderRef builder = bld->gallivm->builder;
1027 const struct lp_type type = bld->type;
1028 LLVMValueRef shift;
1029 LLVMValueRef res;
1030
1031 assert(lp_check_value(type, a));
1032 assert(lp_check_value(type, b));
1033
1034 if(a == bld->zero)
1035 return bld->zero;
1036 if(a == bld->one)
1037 return b;
1038 if(b == bld->zero)
1039 return bld->zero;
1040 if(b == bld->one)
1041 return a;
1042 if(a == bld->undef || b == bld->undef)
1043 return bld->undef;
1044
1045 if (!type.floating && !type.fixed && type.norm) {
1046 struct lp_type wide_type = lp_wider_type(type);
1047 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1048
1049 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1050 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1051
1052 /* PMULLW, PSRLW, PADDW */
1053 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1054 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1055
1056 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1057
1058 return ab;
1059 }
1060
1061 if(type.fixed)
1062 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1063 else
1064 shift = NULL;
1065
1066 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1067 if (type.floating)
1068 res = LLVMConstFMul(a, b);
1069 else
1070 res = LLVMConstMul(a, b);
1071 if(shift) {
1072 if(type.sign)
1073 res = LLVMConstAShr(res, shift);
1074 else
1075 res = LLVMConstLShr(res, shift);
1076 }
1077 }
1078 else {
1079 if (type.floating)
1080 res = LLVMBuildFMul(builder, a, b, "");
1081 else
1082 res = LLVMBuildMul(builder, a, b, "");
1083 if(shift) {
1084 if(type.sign)
1085 res = LLVMBuildAShr(builder, res, shift, "");
1086 else
1087 res = LLVMBuildLShr(builder, res, shift, "");
1088 }
1089 }
1090
1091 return res;
1092 }
1093
1094 /*
1095 * Widening mul, valid for 32x32 bit -> 64bit only.
1096 * Result is low 32bits, high bits returned in res_hi.
1097 *
1098 * Emits code that is meant to be compiled for the host CPU.
1099 */
1100 LLVMValueRef
lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1102 LLVMValueRef a,
1103 LLVMValueRef b,
1104 LLVMValueRef *res_hi)
1105 {
1106 struct gallivm_state *gallivm = bld->gallivm;
1107 LLVMBuilderRef builder = gallivm->builder;
1108
1109 assert(bld->type.width == 32);
1110 assert(bld->type.floating == 0);
1111 assert(bld->type.fixed == 0);
1112 assert(bld->type.norm == 0);
1113
   /*
    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
    * for x86 simd is atrocious (even if the high bits weren't required):
    * it tries to handle real 64bit inputs (which of course can't happen when
    * using a 64bit umul on 32bit numbers zero-extended to 64bit, but
    * apparently llvm does not recognize this widening mul). This includes 6
    * (instead of 2) pmuludq plus extra adds and shifts.
    * The same story applies to signed mul, albeit fixing this requires sse41.
    * https://llvm.org/bugs/show_bug.cgi?id=30845
    * So, whip up our own code, albeit only for length 4 and 8 (which
    * should be good enough)...
    */
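   /*
    * Sketch of the even/odd approach below (illustration only), for a
    * 4-wide vector and viewing the pmuludq/pmuldq results as 32bit lanes:
    *
    *   aodd    = <a1 x a3 x>  (odd lanes shuffled into even positions)
    *   muleven = <lo(a0*b0) hi(a0*b0) lo(a2*b2) hi(a2*b2)>
    *   mulodd  = <lo(a1*b1) hi(a1*b1) lo(a3*b3) hi(a3*b3)>
    *
    * and the final shuffles gather the lo lanes into the result and the
    * hi lanes into *res_hi, both in a0..a3 order.
    */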
1126 if ((bld->type.length == 4 || bld->type.length == 8) &&
1127 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1128 util_cpu_caps.has_sse4_1)) {
1129 const char *intrinsic = NULL;
1130 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1131 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1132 struct lp_type type_wide = lp_wider_type(bld->type);
1133 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1134 unsigned i;
1135 for (i = 0; i < bld->type.length; i += 2) {
1136 shuf[i] = lp_build_const_int32(gallivm, i+1);
1137 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1138 }
1139 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1140 aeven = a;
1141 beven = b;
1142 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1143 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1144
1145 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1146 if (bld->type.sign) {
1147 intrinsic = "llvm.x86.avx2.pmul.dq";
1148 } else {
1149 intrinsic = "llvm.x86.avx2.pmulu.dq";
1150 }
1151 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1152 wider_type, aeven, beven);
1153 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1154 wider_type, aodd, bodd);
1155 }
1156 else {
1157 /* for consistent naming look elsewhere... */
1158 if (bld->type.sign) {
1159 intrinsic = "llvm.x86.sse41.pmuldq";
1160 } else {
1161 intrinsic = "llvm.x86.sse2.pmulu.dq";
1162 }
1163 /*
1164 * XXX If we only have AVX but not AVX2 this is a pain.
1165 * lp_build_intrinsic_binary_anylength() can't handle it
1166 * (due to src and dst type not being identical).
1167 */
1168 if (bld->type.length == 8) {
1169 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1170 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1171 LLVMValueRef muleven2[2], mulodd2[2];
1172 struct lp_type type_wide_half = type_wide;
1173 LLVMTypeRef wtype_half;
1174 type_wide_half.length = 2;
1175 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1176 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1177 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1178 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1179 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1180 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1181 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1182 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1183 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1184 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1185 wtype_half, aevenlo, bevenlo);
1186 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1187 wtype_half, aoddlo, boddlo);
1188 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1189 wtype_half, aevenhi, bevenhi);
1190 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1191 wtype_half, aoddhi, boddhi);
1192 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1193 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1194
1195 }
1196 else {
1197 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1198 wider_type, aeven, beven);
1199 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1200 wider_type, aodd, bodd);
1201 }
1202 }
1203 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1204 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1205
1206 for (i = 0; i < bld->type.length; i += 2) {
1207 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1208 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1209 }
1210 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1211 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1212
1213 for (i = 0; i < bld->type.length; i += 2) {
1214 shuf[i] = lp_build_const_int32(gallivm, i);
1215 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1216 }
1217 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1218 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1219 }
1220 else {
1221 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1222 }
1223 }
1224
1225
1226 /*
1227 * Widening mul, valid for 32x32 bit -> 64bit only.
1228 * Result is low 32bits, high bits returned in res_hi.
1229 *
1230 * Emits generic code.
1231 */
1232 LLVMValueRef
lp_build_mul_32_lohi(struct lp_build_context *bld,
1234 LLVMValueRef a,
1235 LLVMValueRef b,
1236 LLVMValueRef *res_hi)
1237 {
1238 struct gallivm_state *gallivm = bld->gallivm;
1239 LLVMBuilderRef builder = gallivm->builder;
1240 LLVMValueRef tmp, shift, res_lo;
1241 struct lp_type type_tmp;
1242 LLVMTypeRef wide_type, narrow_type;
1243
1244 type_tmp = bld->type;
1245 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1246 type_tmp.width *= 2;
1247 wide_type = lp_build_vec_type(gallivm, type_tmp);
1248 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1249
1250 if (bld->type.sign) {
1251 a = LLVMBuildSExt(builder, a, wide_type, "");
1252 b = LLVMBuildSExt(builder, b, wide_type, "");
1253 } else {
1254 a = LLVMBuildZExt(builder, a, wide_type, "");
1255 b = LLVMBuildZExt(builder, b, wide_type, "");
1256 }
1257 tmp = LLVMBuildMul(builder, a, b, "");
1258
1259 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1260
1261 /* Since we truncate anyway, LShr and AShr are equivalent. */
1262 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1263 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1264
1265 return res_lo;
1266 }
1267
1268
1269 /* a * b + c */
1270 LLVMValueRef
lp_build_mad(struct lp_build_context *bld,
1272 LLVMValueRef a,
1273 LLVMValueRef b,
1274 LLVMValueRef c)
1275 {
1276 const struct lp_type type = bld->type;
1277 if (type.floating) {
1278 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1279 } else {
1280 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1281 }
1282 }
1283
1284
1285 /**
1286 * Small vector x scale multiplication optimization.
1287 */
1288 LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
1290 LLVMValueRef a,
1291 int b)
1292 {
1293 LLVMBuilderRef builder = bld->gallivm->builder;
1294 LLVMValueRef factor;
1295
1296 assert(lp_check_value(bld->type, a));
1297
1298 if(b == 0)
1299 return bld->zero;
1300
1301 if(b == 1)
1302 return a;
1303
1304 if(b == -1)
1305 return lp_build_negate(bld, a);
1306
1307 if(b == 2 && bld->type.floating)
1308 return lp_build_add(bld, a, a);
1309
1310 if(util_is_power_of_two(b)) {
1311 unsigned shift = ffs(b) - 1;
1312
1313 if(bld->type.floating) {
1314 #if 0
1315 /*
1316 * Power of two multiplication by directly manipulating the exponent.
1317 *
1318 * XXX: This might not be always faster, it will introduce a small error
1319 * for multiplication by zero, and it will produce wrong results
1320 * for Inf and NaN.
1321 */
1322 unsigned mantissa = lp_mantissa(bld->type);
1323 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1324 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1325 a = LLVMBuildAdd(builder, a, factor, "");
1326 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1327 return a;
1328 #endif
1329 }
1330 else {
1331 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1332 return LLVMBuildShl(builder, a, factor, "");
1333 }
1334 }
1335
1336 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1337 return lp_build_mul(bld, a, factor);
1338 }
1339
1340
1341 /**
1342 * Generate a / b
1343 */
1344 LLVMValueRef
lp_build_div(struct lp_build_context *bld,
1346 LLVMValueRef a,
1347 LLVMValueRef b)
1348 {
1349 LLVMBuilderRef builder = bld->gallivm->builder;
1350 const struct lp_type type = bld->type;
1351
1352 assert(lp_check_value(type, a));
1353 assert(lp_check_value(type, b));
1354
1355 if(a == bld->zero)
1356 return bld->zero;
1357 if(a == bld->one && type.floating)
1358 return lp_build_rcp(bld, b);
1359 if(b == bld->zero)
1360 return bld->undef;
1361 if(b == bld->one)
1362 return a;
1363 if(a == bld->undef || b == bld->undef)
1364 return bld->undef;
1365
1366 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1367 if (type.floating)
1368 return LLVMConstFDiv(a, b);
1369 else if (type.sign)
1370 return LLVMConstSDiv(a, b);
1371 else
1372 return LLVMConstUDiv(a, b);
1373 }
1374
1375 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1376 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1377 type.floating)
1378 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1379
1380 if (type.floating)
1381 return LLVMBuildFDiv(builder, a, b, "");
1382 else if (type.sign)
1383 return LLVMBuildSDiv(builder, a, b, "");
1384 else
1385 return LLVMBuildUDiv(builder, a, b, "");
1386 }
1387
1388
1389 /**
1390 * Linear interpolation helper.
1391 *
 * @param flags  LP_BLD_LERP_x flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
 *        interpolating normalized values, encoded in integers twice as wide.
1394 *
1395 * @sa http://www.stereopsis.com/doubleblend.html
1396 */
1397 static inline LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
1399 LLVMValueRef x,
1400 LLVMValueRef v0,
1401 LLVMValueRef v1,
1402 unsigned flags)
1403 {
1404 unsigned half_width = bld->type.width/2;
1405 LLVMBuilderRef builder = bld->gallivm->builder;
1406 LLVMValueRef delta;
1407 LLVMValueRef res;
1408
1409 assert(lp_check_value(bld->type, x));
1410 assert(lp_check_value(bld->type, v0));
1411 assert(lp_check_value(bld->type, v1));
1412
1413 delta = lp_build_sub(bld, v1, v0);
1414
1415 if (bld->type.floating) {
1416 assert(flags == 0);
1417 return lp_build_mad(bld, x, delta, v0);
1418 }
1419
1420 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1421 if (!bld->type.sign) {
1422 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most significant bit of x to its least significant bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             */
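            /*
             * E.g. with n == 8 (illustration only):
             *   x = 0xff -> 0xff + (0xff >> 7) = 0x100
             *   x = 0x80 -> 0x80 + (0x80 >> 7) = 0x81
             */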
1428
1429 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1430 }
1431
1432 /* (x * delta) >> n */
1433 res = lp_build_mul(bld, x, delta);
1434 res = lp_build_shr_imm(bld, res, half_width);
1435 } else {
1436 /*
1437 * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
1439 * instead.
1440 */
1441 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1442 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1443 }
1444 } else {
1445 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1446 res = lp_build_mul(bld, x, delta);
1447 }
1448
1449 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1450 /*
1451 * At this point both res and v0 only use the lower half of the bits,
1452 * the rest is zero. Instead of add / mask, do add with half wide type.
1453 */
1454 struct lp_type narrow_type;
1455 struct lp_build_context narrow_bld;
1456
1457 memset(&narrow_type, 0, sizeof narrow_type);
1458 narrow_type.sign = bld->type.sign;
1459 narrow_type.width = bld->type.width/2;
1460 narrow_type.length = bld->type.length*2;
1461
1462 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1463 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1464 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1465 res = lp_build_add(&narrow_bld, v0, res);
1466 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1467 } else {
1468 res = lp_build_add(bld, v0, res);
1469
1470 if (bld->type.fixed) {
         /*
          * We need to mask out the high order bits when lerping 8bit
          * normalized colors stored in 16 bits.
          */
         /* XXX: This step is necessary for lerping 8bit colors stored in
          * 16 bits, but it will be wrong for true fixed point use cases.
          * Basically we need a more powerful lp_type, capable of further
          * distinguishing the value's interpretation from the value's storage.
          */
1480 LLVMValueRef low_bits;
1481 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1482 res = LLVMBuildAnd(builder, res, low_bits, "");
1483 }
1484 }
1485
1486 return res;
1487 }
1488
1489
1490 /**
1491 * Linear interpolation.
1492 */
1493 LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
1495 LLVMValueRef x,
1496 LLVMValueRef v0,
1497 LLVMValueRef v1,
1498 unsigned flags)
1499 {
1500 const struct lp_type type = bld->type;
1501 LLVMValueRef res;
1502
1503 assert(lp_check_value(type, x));
1504 assert(lp_check_value(type, v0));
1505 assert(lp_check_value(type, v1));
1506
1507 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1508
1509 if (type.norm) {
1510 struct lp_type wide_type;
1511 struct lp_build_context wide_bld;
1512 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1513
1514 assert(type.length >= 2);
1515
1516 /*
1517 * Create a wider integer type, enough to hold the
1518 * intermediate result of the multiplication.
1519 */
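      /*
       * E.g. (illustration only) for a 16 x uint8 input type the wide type
       * is 8 x uint16; each half is lerped at the wider precision and the
       * two results are packed back into 16 x uint8.
       */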
1520 memset(&wide_type, 0, sizeof wide_type);
1521 wide_type.sign = type.sign;
1522 wide_type.width = type.width*2;
1523 wide_type.length = type.length/2;
1524
1525 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1526
1527 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1528 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1529 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1530
1531 /*
1532 * Lerp both halves.
1533 */
1534
1535 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1536
1537 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1538 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1539
1540 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1541 } else {
1542 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1543 }
1544
1545 return res;
1546 }
1547
1548
1549 /**
1550 * Bilinear interpolation.
1551 *
 * Value indices are in v_{yx}.
1553 */
1554 LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
1556 LLVMValueRef x,
1557 LLVMValueRef y,
1558 LLVMValueRef v00,
1559 LLVMValueRef v01,
1560 LLVMValueRef v10,
1561 LLVMValueRef v11,
1562 unsigned flags)
1563 {
1564 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1565 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1566 return lp_build_lerp(bld, y, v0, v1, flags);
1567 }
1568
1569
1570 LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
1572 LLVMValueRef x,
1573 LLVMValueRef y,
1574 LLVMValueRef z,
1575 LLVMValueRef v000,
1576 LLVMValueRef v001,
1577 LLVMValueRef v010,
1578 LLVMValueRef v011,
1579 LLVMValueRef v100,
1580 LLVMValueRef v101,
1581 LLVMValueRef v110,
1582 LLVMValueRef v111,
1583 unsigned flags)
1584 {
1585 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1586 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1587 return lp_build_lerp(bld, z, v0, v1, flags);
1588 }
1589
1590
1591 /**
1592 * Generate min(a, b)
 * Do checks for special cases, but NaN behavior is undefined.
1594 */
1595 LLVMValueRef
lp_build_min(struct lp_build_context *bld,
1597 LLVMValueRef a,
1598 LLVMValueRef b)
1599 {
1600 assert(lp_check_value(bld->type, a));
1601 assert(lp_check_value(bld->type, b));
1602
1603 if(a == bld->undef || b == bld->undef)
1604 return bld->undef;
1605
1606 if(a == b)
1607 return a;
1608
1609 if (bld->type.norm) {
1610 if (!bld->type.sign) {
1611 if (a == bld->zero || b == bld->zero) {
1612 return bld->zero;
1613 }
1614 }
1615 if(a == bld->one)
1616 return b;
1617 if(b == bld->one)
1618 return a;
1619 }
1620
1621 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1622 }
1623
1624
1625 /**
1626 * Generate min(a, b)
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
1629 */
1630 LLVMValueRef
lp_build_min_ext(struct lp_build_context *bld,
1632 LLVMValueRef a,
1633 LLVMValueRef b,
1634 enum gallivm_nan_behavior nan_behavior)
1635 {
1636 assert(lp_check_value(bld->type, a));
1637 assert(lp_check_value(bld->type, b));
1638
1639 if(a == bld->undef || b == bld->undef)
1640 return bld->undef;
1641
1642 if(a == b)
1643 return a;
1644
1645 if (bld->type.norm) {
1646 if (!bld->type.sign) {
1647 if (a == bld->zero || b == bld->zero) {
1648 return bld->zero;
1649 }
1650 }
1651 if(a == bld->one)
1652 return b;
1653 if(b == bld->one)
1654 return a;
1655 }
1656
1657 return lp_build_min_simple(bld, a, b, nan_behavior);
1658 }
1659
1660 /**
1661 * Generate max(a, b)
1662 * Do checks for special cases, but NaN behavior is undefined.
1663 */
1664 LLVMValueRef
lp_build_max(struct lp_build_context *bld,
1666 LLVMValueRef a,
1667 LLVMValueRef b)
1668 {
1669 assert(lp_check_value(bld->type, a));
1670 assert(lp_check_value(bld->type, b));
1671
1672 if(a == bld->undef || b == bld->undef)
1673 return bld->undef;
1674
1675 if(a == b)
1676 return a;
1677
1678 if(bld->type.norm) {
1679 if(a == bld->one || b == bld->one)
1680 return bld->one;
1681 if (!bld->type.sign) {
1682 if (a == bld->zero) {
1683 return b;
1684 }
1685 if (b == bld->zero) {
1686 return a;
1687 }
1688 }
1689 }
1690
1691 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1692 }
1693
1694
1695 /**
1696 * Generate max(a, b)
1697 * Checks for special cases.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
1700 */
1701 LLVMValueRef
lp_build_max_ext(struct lp_build_context *bld,
1703 LLVMValueRef a,
1704 LLVMValueRef b,
1705 enum gallivm_nan_behavior nan_behavior)
1706 {
1707 assert(lp_check_value(bld->type, a));
1708 assert(lp_check_value(bld->type, b));
1709
1710 if(a == bld->undef || b == bld->undef)
1711 return bld->undef;
1712
1713 if(a == b)
1714 return a;
1715
1716 if(bld->type.norm) {
1717 if(a == bld->one || b == bld->one)
1718 return bld->one;
1719 if (!bld->type.sign) {
1720 if (a == bld->zero) {
1721 return b;
1722 }
1723 if (b == bld->zero) {
1724 return a;
1725 }
1726 }
1727 }
1728
1729 return lp_build_max_simple(bld, a, b, nan_behavior);
1730 }
1731
1732 /**
1733 * Generate clamp(a, min, max)
1734 * NaN behavior (for any of a, min, max) is undefined.
1735 * Do checks for special cases.
1736 */
1737 LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
1739 LLVMValueRef a,
1740 LLVMValueRef min,
1741 LLVMValueRef max)
1742 {
1743 assert(lp_check_value(bld->type, a));
1744 assert(lp_check_value(bld->type, min));
1745 assert(lp_check_value(bld->type, max));
1746
1747 a = lp_build_min(bld, a, max);
1748 a = lp_build_max(bld, a, min);
1749 return a;
1750 }
1751
1752
1753 /**
1754 * Generate clamp(a, 0, 1)
1755 * A NaN will get converted to zero.
1756 */
1757 LLVMValueRef
lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1759 LLVMValueRef a)
1760 {
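   /*
    * With GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN the max picks its second
    * operand (zero) when a is NaN; the following min then leaves that zero
    * unchanged, so a NaN input ends up as zero.
    */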
1761 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1762 a = lp_build_min(bld, a, bld->one);
1763 return a;
1764 }
1765
1766
1767 /**
1768 * Generate abs(a)
1769 */
1770 LLVMValueRef
1771 lp_build_abs(struct lp_build_context *bld,
1772 LLVMValueRef a)
1773 {
1774 LLVMBuilderRef builder = bld->gallivm->builder;
1775 const struct lp_type type = bld->type;
1776 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1777
1778 assert(lp_check_value(type, a));
1779
1780 if(!type.sign)
1781 return a;
1782
1783 if(type.floating) {
1784 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1785 /* Workaround llvm.org/PR27332 */
1786 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1787 unsigned long long absMask = ~(1ULL << (type.width - 1));
1788 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1789 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1790 a = LLVMBuildAnd(builder, a, mask, "");
1791 a = LLVMBuildBitCast(builder, a, vec_type, "");
1792 return a;
1793 } else {
1794 char intrinsic[32];
1795 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1796 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1797 }
1798 }
1799
1800 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1801 switch(type.width) {
1802 case 8:
1803 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1804 case 16:
1805 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1806 case 32:
1807 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1808 }
1809 }
1810 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) {
1811 switch(type.width) {
1812 case 8:
1813 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1814 case 16:
1815 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1816 case 32:
1817 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1818 }
1819 }
1820 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1821 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1822 (type.width == 8 || type.width == 16 || type.width == 32)) {
1823 debug_printf("%s: inefficient code, should split vectors manually\n",
1824 __FUNCTION__);
1825 }
1826
1827 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1828 }
1829
1830
1831 LLVMValueRef
1832 lp_build_negate(struct lp_build_context *bld,
1833 LLVMValueRef a)
1834 {
1835 LLVMBuilderRef builder = bld->gallivm->builder;
1836
1837 assert(lp_check_value(bld->type, a));
1838
1839 if (bld->type.floating)
1840 a = LLVMBuildFNeg(builder, a, "");
1841 else
1842 a = LLVMBuildNeg(builder, a, "");
1843
1844 return a;
1845 }
1846
1847
1848 /** Return -1, 0 or +1 depending on the sign of a */
1849 LLVMValueRef
1850 lp_build_sgn(struct lp_build_context *bld,
1851 LLVMValueRef a)
1852 {
1853 LLVMBuilderRef builder = bld->gallivm->builder;
1854 const struct lp_type type = bld->type;
1855 LLVMValueRef cond;
1856 LLVMValueRef res;
1857
1858 assert(lp_check_value(type, a));
1859
1860 /* Handle non-zero case */
1861 if(!type.sign) {
1862 /* if not zero then sign must be positive */
1863 res = bld->one;
1864 }
1865 else if(type.floating) {
1866 LLVMTypeRef vec_type;
1867 LLVMTypeRef int_type;
1868 LLVMValueRef mask;
1869 LLVMValueRef sign;
1870 LLVMValueRef one;
1871 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1872
1873 int_type = lp_build_int_vec_type(bld->gallivm, type);
1874 vec_type = lp_build_vec_type(bld->gallivm, type);
1875 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1876
1877 /* Take the sign bit and OR it into the constant 1.0 */
1878 sign = LLVMBuildBitCast(builder, a, int_type, "");
1879 sign = LLVMBuildAnd(builder, sign, mask, "");
1880 one = LLVMConstBitCast(bld->one, int_type);
1881 res = LLVMBuildOr(builder, sign, one, "");
1882 res = LLVMBuildBitCast(builder, res, vec_type, "");
1883 }
1884 else
1885 {
1886 /* signed int/norm/fixed point */
1887 /* could use psign with ssse3 and appropriate vectors here */
1888 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1889 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1890 res = lp_build_select(bld, cond, bld->one, minus_one);
1891 }
1892
1893 /* Handle zero */
1894 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1895 res = lp_build_select(bld, cond, bld->zero, res);
1896
1897 return res;
1898 }
1899
1900
1901 /**
1902 * Set the sign of float vector 'a' according to 'sign'.
1903 * If sign==0, return abs(a).
1904 * If sign==1, return -abs(a).
1905 * Other values for sign produce undefined results.
1906 */
1907 LLVMValueRef
1908 lp_build_set_sign(struct lp_build_context *bld,
1909 LLVMValueRef a, LLVMValueRef sign)
1910 {
1911 LLVMBuilderRef builder = bld->gallivm->builder;
1912 const struct lp_type type = bld->type;
1913 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1914 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1915 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1916 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1917 ~((unsigned long long) 1 << (type.width - 1)));
1918 LLVMValueRef val, res;
1919
1920 assert(type.floating);
1921 assert(lp_check_value(type, a));
1922
1923 /* val = reinterpret_cast<int>(a) */
1924 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1925 /* val = val & mask */
1926 val = LLVMBuildAnd(builder, val, mask, "");
1927 /* sign = sign << shift */
1928 sign = LLVMBuildShl(builder, sign, shift, "");
1929 /* res = val | sign */
1930 res = LLVMBuildOr(builder, val, sign, "");
1931 /* res = reinterpret_cast<float>(res) */
1932 res = LLVMBuildBitCast(builder, res, vec_type, "");
1933
1934 return res;
1935 }
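
/*
 * Scalar reference sketch of the bit manipulation above, for 32-bit floats
 * (illustrative only; the helper name is hypothetical and nothing here is
 * part of the generated code):
 *
 *    float set_sign(float a, unsigned sign)   // sign is 0 or 1
 *    {
 *       union { float f; uint32_t i; } u;
 *       u.f = a;
 *       u.i &= 0x7fffffffu;                   // clear the sign bit -> abs(a)
 *       u.i |= (uint32_t)sign << 31;          // install the requested sign
 *       return u.f;
 *    }
 */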
1936
1937
1938 /**
1939 * Convert vector of (or scalar) int to vector of (or scalar) float.
1940 */
1941 LLVMValueRef
1942 lp_build_int_to_float(struct lp_build_context *bld,
1943 LLVMValueRef a)
1944 {
1945 LLVMBuilderRef builder = bld->gallivm->builder;
1946 const struct lp_type type = bld->type;
1947 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1948
1949 assert(type.floating);
1950
1951 return LLVMBuildSIToFP(builder, a, vec_type, "");
1952 }
1953
1954 static boolean
1955 arch_rounding_available(const struct lp_type type)
1956 {
1957 if ((util_cpu_caps.has_sse4_1 &&
1958 (type.length == 1 || type.width*type.length == 128)) ||
1959 (util_cpu_caps.has_avx && type.width*type.length == 256))
1960 return TRUE;
1961 else if ((util_cpu_caps.has_altivec &&
1962 (type.width == 32 && type.length == 4)))
1963 return TRUE;
1964
1965 return FALSE;
1966 }
1967
1968 enum lp_build_round_mode
1969 {
1970 LP_BUILD_ROUND_NEAREST = 0,
1971 LP_BUILD_ROUND_FLOOR = 1,
1972 LP_BUILD_ROUND_CEIL = 2,
1973 LP_BUILD_ROUND_TRUNCATE = 3
1974 };
1975
1976 static inline LLVMValueRef
1977 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1978 LLVMValueRef a)
1979 {
1980 LLVMBuilderRef builder = bld->gallivm->builder;
1981 const struct lp_type type = bld->type;
1982 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1983 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1984 const char *intrinsic;
1985 LLVMValueRef res;
1986
1987 assert(type.floating);
1988 /* using the double precision conversions is a bit more complicated */
1989 assert(type.width == 32);
1990
1991 assert(lp_check_value(type, a));
1992 assert(util_cpu_caps.has_sse2);
1993
1994 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1995 if (type.length == 1) {
1996 LLVMTypeRef vec_type;
1997 LLVMValueRef undef;
1998 LLVMValueRef arg;
1999 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2000
2001 vec_type = LLVMVectorType(bld->elem_type, 4);
2002
2003 intrinsic = "llvm.x86.sse.cvtss2si";
2004
2005 undef = LLVMGetUndef(vec_type);
2006
2007 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2008
2009 res = lp_build_intrinsic_unary(builder, intrinsic,
2010 ret_type, arg);
2011 }
2012 else {
2013 if (type.width * type.length == 128) {
2014 intrinsic = "llvm.x86.sse2.cvtps2dq";
2015 }
2016 else {
2017 assert(type.width*type.length == 256);
2018 assert(util_cpu_caps.has_avx);
2019
2020 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2021 }
2022 res = lp_build_intrinsic_unary(builder, intrinsic,
2023 ret_type, a);
2024 }
2025
2026 return res;
2027 }
2028
2029
2032 static inline LLVMValueRef
2033 lp_build_round_altivec(struct lp_build_context *bld,
2034 LLVMValueRef a,
2035 enum lp_build_round_mode mode)
2036 {
2037 LLVMBuilderRef builder = bld->gallivm->builder;
2038 const struct lp_type type = bld->type;
2039 const char *intrinsic = NULL;
2040
2041 assert(type.floating);
2042
2043 assert(lp_check_value(type, a));
2044 assert(util_cpu_caps.has_altivec);
2045
2046 (void)type;
2047
2048 switch (mode) {
2049 case LP_BUILD_ROUND_NEAREST:
2050 intrinsic = "llvm.ppc.altivec.vrfin";
2051 break;
2052 case LP_BUILD_ROUND_FLOOR:
2053 intrinsic = "llvm.ppc.altivec.vrfim";
2054 break;
2055 case LP_BUILD_ROUND_CEIL:
2056 intrinsic = "llvm.ppc.altivec.vrfip";
2057 break;
2058 case LP_BUILD_ROUND_TRUNCATE:
2059 intrinsic = "llvm.ppc.altivec.vrfiz";
2060 break;
2061 }
2062
2063 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2064 }
2065
2066 static inline LLVMValueRef
2067 lp_build_round_arch(struct lp_build_context *bld,
2068 LLVMValueRef a,
2069 enum lp_build_round_mode mode)
2070 {
2071 if (util_cpu_caps.has_sse4_1) {
2072 LLVMBuilderRef builder = bld->gallivm->builder;
2073 const struct lp_type type = bld->type;
2074 const char *intrinsic_root;
2075 char intrinsic[32];
2076
2077 assert(type.floating);
2078 assert(lp_check_value(type, a));
2079 (void)type;
2080
2081 switch (mode) {
2082 case LP_BUILD_ROUND_NEAREST:
2083 intrinsic_root = "llvm.nearbyint";
2084 break;
2085 case LP_BUILD_ROUND_FLOOR:
2086 intrinsic_root = "llvm.floor";
2087 break;
2088 case LP_BUILD_ROUND_CEIL:
2089 intrinsic_root = "llvm.ceil";
2090 break;
2091 case LP_BUILD_ROUND_TRUNCATE:
2092 intrinsic_root = "llvm.trunc";
2093 break;
2094 }
2095
2096 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2097 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2098 }
2099 else /* (util_cpu_caps.has_altivec) */
2100 return lp_build_round_altivec(bld, a, mode);
2101 }
2102
2103 /**
2104 * Return the integer part of a float (vector) value (== round toward zero).
2105 * The returned value is a float (vector).
2106 * Ex: trunc(-1.5) = -1.0
2107 */
2108 LLVMValueRef
2109 lp_build_trunc(struct lp_build_context *bld,
2110 LLVMValueRef a)
2111 {
2112 LLVMBuilderRef builder = bld->gallivm->builder;
2113 const struct lp_type type = bld->type;
2114
2115 assert(type.floating);
2116 assert(lp_check_value(type, a));
2117
2118 if (arch_rounding_available(type)) {
2119 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2120 }
2121 else {
2122 const struct lp_type type = bld->type;
2123 struct lp_type inttype;
2124 struct lp_build_context intbld;
2125 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2126 LLVMValueRef trunc, res, anosign, mask;
2127 LLVMTypeRef int_vec_type = bld->int_vec_type;
2128 LLVMTypeRef vec_type = bld->vec_type;
2129
2130 assert(type.width == 32); /* might want to handle doubles at some point */
2131
2132 inttype = type;
2133 inttype.floating = 0;
2134 lp_build_context_init(&intbld, bld->gallivm, inttype);
2135
2136 /* round by truncation */
2137 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2138 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2139
2140 /* mask out sign bit */
2141 anosign = lp_build_abs(bld, a);
2142 /*
2143 * mask out all values if anosign > 2^24
2144 * This should work both for large ints (all rounding is no-op for them
2145 * because such floats are always exact) as well as special cases like
2146 * NaNs, Infs (taking advantage of the fact they use max exponent).
2147 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2148 */
2149 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2150 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2151 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2152 return lp_build_select(bld, mask, a, res);
2153 }
2154 }
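
/*
 * Scalar sketch of the fallback above for finite 32-bit floats (illustrative
 * only). The vector code compares the bit patterns as integers, so NaN/Inf
 * (max exponent) also take the "keep the input" path:
 *
 *    float trunc_sw(float a)
 *    {
 *       float t = (float)(int)a;               // truncating conversion
 *       // |a| > 2^24: already integral, keep the input unchanged
 *       return fabsf(a) > 16777216.0f ? a : t;
 *    }
 */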
2155
2156
2157 /**
2158 * Return float (vector) rounded to nearest integer (vector). The returned
2159 * value is a float (vector).
2160 * Ex: round(0.9) = 1.0
2161 * Ex: round(-1.5) = -2.0
2162 */
2163 LLVMValueRef
2164 lp_build_round(struct lp_build_context *bld,
2165 LLVMValueRef a)
2166 {
2167 LLVMBuilderRef builder = bld->gallivm->builder;
2168 const struct lp_type type = bld->type;
2169
2170 assert(type.floating);
2171 assert(lp_check_value(type, a));
2172
2173 if (arch_rounding_available(type)) {
2174 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2175 }
2176 else {
2177 const struct lp_type type = bld->type;
2178 struct lp_type inttype;
2179 struct lp_build_context intbld;
2180 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2181 LLVMValueRef res, anosign, mask;
2182 LLVMTypeRef int_vec_type = bld->int_vec_type;
2183 LLVMTypeRef vec_type = bld->vec_type;
2184
2185 assert(type.width == 32); /* might want to handle doubles at some point */
2186
2187 inttype = type;
2188 inttype.floating = 0;
2189 lp_build_context_init(&intbld, bld->gallivm, inttype);
2190
2191 res = lp_build_iround(bld, a);
2192 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2193
2194 /* mask out sign bit */
2195 anosign = lp_build_abs(bld, a);
2196 /*
2197 * mask out all values if anosign > 2^24
2198 * This should work both for large ints (all rounding is no-op for them
2199 * because such floats are always exact) as well as special cases like
2200 * NaNs, Infs (taking advantage of the fact they use max exponent).
2201 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2202 */
2203 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2204 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2205 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2206 return lp_build_select(bld, mask, a, res);
2207 }
2208 }
2209
2210
2211 /**
2212 * Return floor of float (vector), result is a float (vector)
2213 * Ex: floor(1.1) = 1.0
2214 * Ex: floor(-1.1) = -2.0
2215 */
2216 LLVMValueRef
2217 lp_build_floor(struct lp_build_context *bld,
2218 LLVMValueRef a)
2219 {
2220 LLVMBuilderRef builder = bld->gallivm->builder;
2221 const struct lp_type type = bld->type;
2222
2223 assert(type.floating);
2224 assert(lp_check_value(type, a));
2225
2226 if (arch_rounding_available(type)) {
2227 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2228 }
2229 else {
2230 const struct lp_type type = bld->type;
2231 struct lp_type inttype;
2232 struct lp_build_context intbld;
2233 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2234 LLVMValueRef trunc, res, anosign, mask;
2235 LLVMTypeRef int_vec_type = bld->int_vec_type;
2236 LLVMTypeRef vec_type = bld->vec_type;
2237
2238 if (type.width != 32) {
2239 char intrinsic[32];
2240 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2241 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2242 }
2243
2244 assert(type.width == 32); /* might want to handle doubles at some point */
2245
2246 inttype = type;
2247 inttype.floating = 0;
2248 lp_build_context_init(&intbld, bld->gallivm, inttype);
2249
2250 /* round by truncation */
2251 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2252 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2253
2254 if (type.sign) {
2255 LLVMValueRef tmp;
2256
2257 /*
2258 * fix values if rounding is wrong (for non-special cases)
2259 * - this is the case if trunc > a
2260 */
2261 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2262 /* tmp = trunc > a ? 1.0 : 0.0 */
2263 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2264 tmp = lp_build_and(&intbld, mask, tmp);
2265 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2266 res = lp_build_sub(bld, res, tmp);
2267 }
2268
2269 /* mask out sign bit */
2270 anosign = lp_build_abs(bld, a);
2271 /*
2272 * mask out all values if anosign > 2^24
2273 * This should work both for large ints (all rounding is no-op for them
2274 * because such floats are always exact) as well as special cases like
2275 * NaNs, Infs (taking advantage of the fact they use max exponent).
2276 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2277 */
2278 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2279 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2280 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2281 return lp_build_select(bld, mask, a, res);
2282 }
2283 }
2284
2285
2286 /**
2287 * Return ceiling of float (vector), returning float (vector).
2288 * Ex: ceil( 1.1) = 2.0
2289 * Ex: ceil(-1.1) = -1.0
2290 */
2291 LLVMValueRef
2292 lp_build_ceil(struct lp_build_context *bld,
2293 LLVMValueRef a)
2294 {
2295 LLVMBuilderRef builder = bld->gallivm->builder;
2296 const struct lp_type type = bld->type;
2297
2298 assert(type.floating);
2299 assert(lp_check_value(type, a));
2300
2301 if (arch_rounding_available(type)) {
2302 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2303 }
2304 else {
2305 const struct lp_type type = bld->type;
2306 struct lp_type inttype;
2307 struct lp_build_context intbld;
2308 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2309 LLVMValueRef trunc, res, anosign, mask, tmp;
2310 LLVMTypeRef int_vec_type = bld->int_vec_type;
2311 LLVMTypeRef vec_type = bld->vec_type;
2312
2313 if (type.width != 32) {
2314 char intrinsic[32];
2315 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2316 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2317 }
2318
2319 assert(type.width == 32); /* might want to handle doubles at some point */
2320
2321 inttype = type;
2322 inttype.floating = 0;
2323 lp_build_context_init(&intbld, bld->gallivm, inttype);
2324
2325 /* round by truncation */
2326 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2327 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2328
2329 /*
2330 * fix values if rounding is wrong (for non-special cases)
2331 * - this is the case if trunc < a
2332 */
2333 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2334 /* tmp = trunc < a ? 1.0 : 0.0 */
2335 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2336 tmp = lp_build_and(&intbld, mask, tmp);
2337 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2338 res = lp_build_add(bld, trunc, tmp);
2339
2340 /* mask out sign bit */
2341 anosign = lp_build_abs(bld, a);
2342 /*
2343 * mask out all values if anosign > 2^24
2344 * This should work both for large ints (all rounding is no-op for them
2345 * because such floats are always exact) as well as special cases like
2346 * NaNs, Infs (taking advantage of the fact they use max exponent).
2347 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2348 */
2349 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2350 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2351 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2352 return lp_build_select(bld, mask, a, res);
2353 }
2354 }
2355
2356
2357 /**
2358 * Return fractional part of 'a' computed as a - floor(a)
2359 * Typically used in texture coord arithmetic.
2360 */
2361 LLVMValueRef
2362 lp_build_fract(struct lp_build_context *bld,
2363 LLVMValueRef a)
2364 {
2365 assert(bld->type.floating);
2366 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2367 }
2368
2369
2370 /**
2371 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2372 * against 0.99999(9). (Will also return that value for NaNs.)
2373 */
2374 static inline LLVMValueRef
2375 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2376 {
2377 LLVMValueRef max;
2378
2379 /* this is the largest number smaller than 1.0 representable as float */
2380 max = lp_build_const_vec(bld->gallivm, bld->type,
2381 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2382 return lp_build_min_ext(bld, fract, max,
2383 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2384 }
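
/*
 * Scalar equivalent for 32-bit floats (illustrative only): the clamp value
 * is the largest float strictly below 1.0, i.e. 0x3f7fffff:
 *
 *    fract = MIN2(fract, 1.0f - 1.0f/(1 << 24));   // == nextafterf(1.0f, 0.0f)
 *
 * With GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN a NaN input also yields this
 * clamp value rather than NaN.
 */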
2385
2386
2387 /**
2388 * Same as lp_build_fract, but guarantees that the result is always smaller
2389 * than one. Will also return the smaller-than-one value for infs, NaNs.
2390 */
2391 LLVMValueRef
2392 lp_build_fract_safe(struct lp_build_context *bld,
2393 LLVMValueRef a)
2394 {
2395 return clamp_fract(bld, lp_build_fract(bld, a));
2396 }
2397
2398
2399 /**
2400 * Return the integer part of a float (vector) value (== round toward zero).
2401 * The returned value is an integer (vector).
2402 * Ex: itrunc(-1.5) = -1
2403 */
2404 LLVMValueRef
2405 lp_build_itrunc(struct lp_build_context *bld,
2406 LLVMValueRef a)
2407 {
2408 LLVMBuilderRef builder = bld->gallivm->builder;
2409 const struct lp_type type = bld->type;
2410 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2411
2412 assert(type.floating);
2413 assert(lp_check_value(type, a));
2414
2415 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2416 }
2417
2418
2419 /**
2420 * Return float (vector) rounded to nearest integer (vector). The returned
2421 * value is an integer (vector).
2422 * Ex: iround(0.9) = 1
2423 * Ex: iround(-1.5) = -2
2424 */
2425 LLVMValueRef
2426 lp_build_iround(struct lp_build_context *bld,
2427 LLVMValueRef a)
2428 {
2429 LLVMBuilderRef builder = bld->gallivm->builder;
2430 const struct lp_type type = bld->type;
2431 LLVMTypeRef int_vec_type = bld->int_vec_type;
2432 LLVMValueRef res;
2433
2434 assert(type.floating);
2435
2436 assert(lp_check_value(type, a));
2437
2438 if ((util_cpu_caps.has_sse2 &&
2439 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2440 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2441 return lp_build_iround_nearest_sse2(bld, a);
2442 }
2443 if (arch_rounding_available(type)) {
2444 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2445 }
2446 else {
2447 LLVMValueRef half;
2448
2449 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2450
2451 if (type.sign) {
2452 LLVMTypeRef vec_type = bld->vec_type;
2453 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2454 (unsigned long long)1 << (type.width - 1));
2455 LLVMValueRef sign;
2456
2457 /* get sign bit */
2458 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2459 sign = LLVMBuildAnd(builder, sign, mask, "");
2460
2461 /* sign * 0.5 */
2462 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2463 half = LLVMBuildOr(builder, sign, half, "");
2464 half = LLVMBuildBitCast(builder, half, vec_type, "");
2465 }
2466
2467 res = LLVMBuildFAdd(builder, a, half, "");
2468 }
2469
2470 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2471
2472 return res;
2473 }
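
/*
 * The non-arch fallback above is "add 0.5 with the sign of a, then truncate".
 * Scalar sketch for 32-bit floats (illustrative only; the helper name is
 * hypothetical):
 *
 *    int iround_sw(float a)
 *    {
 *       union { float f; uint32_t i; } h = { 0.5f };
 *       union { float f; uint32_t i; } u = { a };
 *       h.i |= u.i & 0x80000000u;              // copy the sign of a onto 0.5
 *       return (int)(a + h.f);                 // truncating conversion
 *    }
 *
 * Note this rounds halfway cases away from zero, whereas the SSE2/SSE4.1/
 * AltiVec paths use the hardware rounding mode, which defaults to
 * round-to-nearest-even.
 */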
2474
2475
2476 /**
2477 * Return floor of float (vector), result is an int (vector)
2478 * Ex: ifloor(1.1) = 1
2479 * Ex: ifloor(-1.1) = -2
2480 */
2481 LLVMValueRef
2482 lp_build_ifloor(struct lp_build_context *bld,
2483 LLVMValueRef a)
2484 {
2485 LLVMBuilderRef builder = bld->gallivm->builder;
2486 const struct lp_type type = bld->type;
2487 LLVMTypeRef int_vec_type = bld->int_vec_type;
2488 LLVMValueRef res;
2489
2490 assert(type.floating);
2491 assert(lp_check_value(type, a));
2492
2493 res = a;
2494 if (type.sign) {
2495 if (arch_rounding_available(type)) {
2496 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2497 }
2498 else {
2499 struct lp_type inttype;
2500 struct lp_build_context intbld;
2501 LLVMValueRef trunc, itrunc, mask;
2502
2503 assert(type.floating);
2504 assert(lp_check_value(type, a));
2505
2506 inttype = type;
2507 inttype.floating = 0;
2508 lp_build_context_init(&intbld, bld->gallivm, inttype);
2509
2510 /* round by truncation */
2511 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2512 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2513
2514 /*
2515 * fix values if rounding is wrong (for non-special cases)
2516 * - this is the case if trunc > a
2517 * The results of doing this with NaNs, very large values etc.
2518 * are undefined but this seems to be the case anyway.
2519 */
2520 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2521 /* cheapie minus one with mask since the mask is minus one / zero */
2522 return lp_build_add(&intbld, itrunc, mask);
2523 }
2524 }
2525
2526 /* convert to int (rounds toward zero) */
2527 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2528
2529 return res;
2530 }
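
/*
 * The "cheapie minus one" trick: lp_build_cmp yields all ones (-1 as a
 * signed integer) where the comparison holds and 0 elsewhere, so adding the
 * mask decrements exactly the lanes where trunc > a. Scalar sketch
 * (illustrative only):
 *
 *    int ifloor_sw(float a)
 *    {
 *       int itrunc = (int)a;                   // rounds toward zero
 *       return itrunc - ((float)itrunc > a);   // subtract 1 where we overshot
 *    }
 */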
2531
2532
2533 /**
2534 * Return ceiling of float (vector), returning int (vector).
2535 * Ex: iceil( 1.1) = 2
2536 * Ex: iceil(-1.1) = -1
2537 */
2538 LLVMValueRef
2539 lp_build_iceil(struct lp_build_context *bld,
2540 LLVMValueRef a)
2541 {
2542 LLVMBuilderRef builder = bld->gallivm->builder;
2543 const struct lp_type type = bld->type;
2544 LLVMTypeRef int_vec_type = bld->int_vec_type;
2545 LLVMValueRef res;
2546
2547 assert(type.floating);
2548 assert(lp_check_value(type, a));
2549
2550 if (arch_rounding_available(type)) {
2551 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2552 }
2553 else {
2554 struct lp_type inttype;
2555 struct lp_build_context intbld;
2556 LLVMValueRef trunc, itrunc, mask;
2557
2558 assert(type.floating);
2559 assert(lp_check_value(type, a));
2560
2561 inttype = type;
2562 inttype.floating = 0;
2563 lp_build_context_init(&intbld, bld->gallivm, inttype);
2564
2565 /* round by truncation */
2566 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2567 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2568
2569 /*
2570 * fix values if rounding is wrong (for non-special cases)
2571 * - this is the case if trunc < a
2572 * The results of doing this with NaNs, very large values etc.
2573 * are undefined but this seems to be the case anyway.
2574 */
2575 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2576 /* cheapie plus one with mask since the mask is minus one / zero */
2577 return lp_build_sub(&intbld, itrunc, mask);
2578 }
2579
2580 /* convert to int (rounds toward zero) */
2581 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2582
2583 return res;
2584 }
2585
2586
2587 /**
2588 * Combined ifloor() & fract().
2589 *
2590 * Preferred to calling the functions separately, as it will ensure that the
2591 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2592 */
2593 void
2594 lp_build_ifloor_fract(struct lp_build_context *bld,
2595 LLVMValueRef a,
2596 LLVMValueRef *out_ipart,
2597 LLVMValueRef *out_fpart)
2598 {
2599 LLVMBuilderRef builder = bld->gallivm->builder;
2600 const struct lp_type type = bld->type;
2601 LLVMValueRef ipart;
2602
2603 assert(type.floating);
2604 assert(lp_check_value(type, a));
2605
2606 if (arch_rounding_available(type)) {
2607 /*
2608 * floor() is easier.
2609 */
2610
2611 ipart = lp_build_floor(bld, a);
2612 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2613 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2614 }
2615 else {
2616 /*
2617 * ifloor() is easier.
2618 */
2619
2620 *out_ipart = lp_build_ifloor(bld, a);
2621 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2622 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2623 }
2624 }
2625
2626
2627 /**
2628 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2629 * always smaller than one.
2630 */
2631 void
2632 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2633 LLVMValueRef a,
2634 LLVMValueRef *out_ipart,
2635 LLVMValueRef *out_fpart)
2636 {
2637 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2638 *out_fpart = clamp_fract(bld, *out_fpart);
2639 }
2640
2641
2642 LLVMValueRef
2643 lp_build_sqrt(struct lp_build_context *bld,
2644 LLVMValueRef a)
2645 {
2646 LLVMBuilderRef builder = bld->gallivm->builder;
2647 const struct lp_type type = bld->type;
2648 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2649 char intrinsic[32];
2650
2651 assert(lp_check_value(type, a));
2652
2653 assert(type.floating);
2654 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2655
2656 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2657 }
2658
2659
2660 /**
2661 * Do one Newton-Raphson step to improve reciprocal precision:
2662 *
2663 * x_{i+1} = x_i * (2 - a * x_i)
2664 *
2665 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2666 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2667 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2668 * halo. It would be necessary to clamp the argument to prevent this.
2669 *
2670 * See also:
2671 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2672 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2673 */
2674 static inline LLVMValueRef
2675 lp_build_rcp_refine(struct lp_build_context *bld,
2676 LLVMValueRef a,
2677 LLVMValueRef rcp_a)
2678 {
2679 LLVMBuilderRef builder = bld->gallivm->builder;
2680 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2681 LLVMValueRef res;
2682
2683 res = LLVMBuildFMul(builder, a, rcp_a, "");
2684 res = LLVMBuildFSub(builder, two, res, "");
2685 res = LLVMBuildFMul(builder, rcp_a, res, "");
2686
2687 return res;
2688 }
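
/*
 * Scalar sketch of the step above (illustrative only):
 *
 *    x1 = x0 * (2.0f - a * x0);
 *
 * Starting from a low-precision estimate (e.g. RCPPS), each step roughly
 * doubles the number of correct bits on well-behaved inputs.
 */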
2689
2690
2691 LLVMValueRef
2692 lp_build_rcp(struct lp_build_context *bld,
2693 LLVMValueRef a)
2694 {
2695 LLVMBuilderRef builder = bld->gallivm->builder;
2696 const struct lp_type type = bld->type;
2697
2698 assert(lp_check_value(type, a));
2699
2700 if(a == bld->zero)
2701 return bld->undef;
2702 if(a == bld->one)
2703 return bld->one;
2704 if(a == bld->undef)
2705 return bld->undef;
2706
2707 assert(type.floating);
2708
2709 if(LLVMIsConstant(a))
2710 return LLVMConstFDiv(bld->one, a);
2711
2712 /*
2713 * We don't use RCPPS because:
2714 * - it only has 10 bits of precision
2715 * - it doesn't even get the reciprocal of 1.0 exactly
2716 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2717 * - for recent processors the benefit over DIVPS is marginal, and case
2718 * dependent
2719 *
2720 * We could still use it on certain processors if benchmarks show that the
2721 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2722 * particular uses that require less workarounds.
2723 */
2724
2725 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2726 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2727 const unsigned num_iterations = 0;
2728 LLVMValueRef res;
2729 unsigned i;
2730 const char *intrinsic = NULL;
2731
2732 if (type.length == 4) {
2733 intrinsic = "llvm.x86.sse.rcp.ps";
2734 }
2735 else {
2736 intrinsic = "llvm.x86.avx.rcp.ps.256";
2737 }
2738
2739 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2740
2741 for (i = 0; i < num_iterations; ++i) {
2742 res = lp_build_rcp_refine(bld, a, res);
2743 }
2744
2745 return res;
2746 }
2747
2748 return LLVMBuildFDiv(builder, bld->one, a, "");
2749 }
2750
2751
2752 /**
2753 * Do one Newton-Raphson step to improve rsqrt precision:
2754 *
2755 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2756 *
2757 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2758 */
2759 static inline LLVMValueRef
2760 lp_build_rsqrt_refine(struct lp_build_context *bld,
2761 LLVMValueRef a,
2762 LLVMValueRef rsqrt_a)
2763 {
2764 LLVMBuilderRef builder = bld->gallivm->builder;
2765 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2766 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2767 LLVMValueRef res;
2768
2769 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2770 res = LLVMBuildFMul(builder, a, res, "");
2771 res = LLVMBuildFSub(builder, three, res, "");
2772 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2773 res = LLVMBuildFMul(builder, half, res, "");
2774
2775 return res;
2776 }
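
/*
 * Scalar sketch of the step above (illustrative only):
 *
 *    x1 = 0.5f * x0 * (3.0f - a * x0 * x0);
 *
 * As with the reciprocal refinement, each step roughly doubles the number of
 * correct bits of the initial estimate.
 */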
2777
2778
2779 /**
2780 * Generate 1/sqrt(a).
2781 * Result is undefined for values < 0, infinity for +0.
2782 */
2783 LLVMValueRef
2784 lp_build_rsqrt(struct lp_build_context *bld,
2785 LLVMValueRef a)
2786 {
2787 const struct lp_type type = bld->type;
2788
2789 assert(lp_check_value(type, a));
2790
2791 assert(type.floating);
2792
2793 /*
2794 * This should be faster but all denormals will end up as infinity.
2795 */
2796 if (0 && lp_build_fast_rsqrt_available(type)) {
2797 const unsigned num_iterations = 1;
2798 LLVMValueRef res;
2799 unsigned i;
2800
2801 /* rsqrt(1.0) != 1.0 here */
2802 res = lp_build_fast_rsqrt(bld, a);
2803
2804 if (num_iterations) {
2805 /*
2806 * Newton-Raphson will result in NaN instead of infinity for zero,
2807 * and NaN instead of zero for infinity.
2808 * Also, need to ensure rsqrt(1.0) == 1.0.
2809 * All numbers smaller than FLT_MIN will result in +infinity
2810 * (rsqrtps treats all denormals as zero).
2811 */
2812 LLVMValueRef cmp;
2813 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2814 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2815
2816 for (i = 0; i < num_iterations; ++i) {
2817 res = lp_build_rsqrt_refine(bld, a, res);
2818 }
2819 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2820 res = lp_build_select(bld, cmp, inf, res);
2821 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2822 res = lp_build_select(bld, cmp, bld->zero, res);
2823 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2824 res = lp_build_select(bld, cmp, bld->one, res);
2825 }
2826
2827 return res;
2828 }
2829
2830 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2831 }
2832
2833 /**
2834 * Report whether a fast (inaccurate) rsqrt instruction is available.
2835 * (The caller may want to avoid calling rsqrt_fast if it's not available,
2836 * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if it is
2837 * unavailable that would result in sqrt/div/mul, so it's obviously
2838 * much better to just call sqrt, skipping both the div and the mul.)
2839 */
2840 boolean
2841 lp_build_fast_rsqrt_available(struct lp_type type)
2842 {
2843 assert(type.floating);
2844
2845 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2846 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2847 return true;
2848 }
2849 return false;
2850 }
2851
2852
2853 /**
2854 * Generate 1/sqrt(a).
2855 * Result is undefined for values < 0, infinity for +0.
2856 * Precision is limited, only ~10 bits guaranteed
2857 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2858 */
2859 LLVMValueRef
2860 lp_build_fast_rsqrt(struct lp_build_context *bld,
2861 LLVMValueRef a)
2862 {
2863 LLVMBuilderRef builder = bld->gallivm->builder;
2864 const struct lp_type type = bld->type;
2865
2866 assert(lp_check_value(type, a));
2867
2868 if (lp_build_fast_rsqrt_available(type)) {
2869 const char *intrinsic = NULL;
2870
2871 if (type.length == 4) {
2872 intrinsic = "llvm.x86.sse.rsqrt.ps";
2873 }
2874 else {
2875 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2876 }
2877 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2878 }
2879 else {
2880 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2881 }
2882 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2883 }
2884
2885
2886 /**
2887 * Generate sin(a) or cos(a) using polynomial approximation.
2888 * TODO: it might be worth recognizing sin and cos of the same source
2889 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2890 * would be way cheaper than calculating (nearly) everything twice...
2891 * Not sure it's common enough to be worth bothering with; however, the scs
2892 * opcode could also benefit from calculating both.
2893 */
2894 static LLVMValueRef
2895 lp_build_sin_or_cos(struct lp_build_context *bld,
2896 LLVMValueRef a,
2897 boolean cos)
2898 {
2899 struct gallivm_state *gallivm = bld->gallivm;
2900 LLVMBuilderRef b = gallivm->builder;
2901 struct lp_type int_type = lp_int_type(bld->type);
2902
2903 /*
2904 * take the absolute value,
2905 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2906 */
2907
2908 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2909 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2910
2911 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2912 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2913
2914 /*
2915 * scale by 4/Pi
2916 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2917 */
2918
2919 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2920 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2921
2922 /*
2923 * store the integer part of y in mm0
2924 * emm2 = _mm_cvttps_epi32(y);
2925 */
2926
2927 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2928
2929 /*
2930 * j=(j+1) & (~1) (see the cephes sources)
2931 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2932 */
2933
2934 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2935 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2936 /*
2937 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2938 */
2939 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2940 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2941
2942 /*
2943 * y = _mm_cvtepi32_ps(emm2);
2944 */
2945 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2946
2947 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2948 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2949 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2950 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2951
2952 /*
2953 * Argument used for poly selection and sign bit determination
2954 * is different for sin vs. cos.
2955 */
2956 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2957 emm2_and;
2958
2959 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2960 LLVMBuildNot(b, emm2_2, ""), ""),
2961 const_29, "sign_bit") :
2962 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2963 LLVMBuildShl(b, emm2_add,
2964 const_29, ""), ""),
2965 sign_mask, "sign_bit");
2966
2967 /*
2968 * get the polynomial selection mask
2969 * there is one polynomial for 0 <= x <= Pi/4
2970 * and another one for Pi/4 < x <= Pi/2
2971 * Both branches will be computed.
2972 *
2973 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2974 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2975 */
2976
2977 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2978 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2979 int_type, PIPE_FUNC_EQUAL,
2980 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2981
2982 /*
2983 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2984 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2985 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2986 */
2987 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2988 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2989 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2990
2991 /*
2992 * The magic pass: "Extended precision modular arithmetic"
2993 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2994 */
2995 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2996 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2997 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2998
2999 /*
3000 * Evaluate the first polynomial (0 <= x <= Pi/4)
3001 *
3002 * z = _mm_mul_ps(x,x);
3003 */
3004 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3005
3006 /*
3007 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3008 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3009 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3010 */
3011 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3012 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3013 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3014
3015 /*
3016 * y = *(v4sf*)_ps_coscof_p0;
3017 * y = _mm_mul_ps(y, z);
3018 */
3019 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3020 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3021 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3022 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3023
3024
3025 /*
3026 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3027 * y = _mm_sub_ps(y, tmp);
3028 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3029 */
3030 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3031 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3032 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3033 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3034 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3035
3036 /*
3037 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3038 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3039 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3040 */
3041 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3042 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3043 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3044
3045 /*
3046 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
3047 *
3048 * y2 = *(v4sf*)_ps_sincof_p0;
3049 * y2 = _mm_mul_ps(y2, z);
3050 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3051 * y2 = _mm_mul_ps(y2, z);
3052 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3053 * y2 = _mm_mul_ps(y2, z);
3054 * y2 = _mm_mul_ps(y2, x);
3055 * y2 = _mm_add_ps(y2, x);
3056 */
3057
3058 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3059 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3060 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3061 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3062
3063 /*
3064 * select the correct result from the two polynomials
3065 * xmm3 = poly_mask;
3066 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3067 * y = _mm_andnot_ps(xmm3, y);
3068 * y = _mm_or_ps(y,y2);
3069 */
3070 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3071 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3072 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3073 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3074 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3075 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3076
3077 /*
3078 * update the sign
3079 * y = _mm_xor_ps(y, sign_bit);
3080 */
3081 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3082 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3083
3084 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3085
3086 /* clamp output to be within [-1, 1] */
3087 y_result = lp_build_clamp(bld, y_result,
3088 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3089 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3090 /* If a is -inf, inf or NaN then return NaN */
3091 y_result = lp_build_select(bld, isfinite, y_result,
3092 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3093 return y_result;
3094 }
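
/*
 * Compact scalar summary of the reduction above (illustrative only, sin
 * case; for cos the quadrant index is shifted by 2 and a different sign
 * rule applies, and non-finite inputs are turned into NaN at the end):
 *
 *    x = fabsf(a);
 *    j = ((int)(x * (4.0 / M_PI)) + 1) & ~1;        // even quadrant index
 *    y = (float)j;
 *    x = ((x - y*DP1) - y*DP2) - y*DP3;             // x - j*Pi/4, extended precision
 *    z = x * x;
 *    r = (j & 2) ? 1.0f - 0.5f*z + z*z*(c2 + z*(c1 + z*c0))  // cosine-flavoured poly
 *                : x + x*z*(s2 + z*(s1 + z*s0));             // sine-flavoured poly
 *    // the final sign comes from the quadrant (and from a's sign for sin),
 *    // followed by the [-1, 1] clamp.
 */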
3095
3096
3097 /**
3098 * Generate sin(a)
3099 */
3100 LLVMValueRef
3101 lp_build_sin(struct lp_build_context *bld,
3102 LLVMValueRef a)
3103 {
3104 return lp_build_sin_or_cos(bld, a, FALSE);
3105 }
3106
3107
3108 /**
3109 * Generate cos(a)
3110 */
3111 LLVMValueRef
3112 lp_build_cos(struct lp_build_context *bld,
3113 LLVMValueRef a)
3114 {
3115 return lp_build_sin_or_cos(bld, a, TRUE);
3116 }
3117
3118
3119 /**
3120 * Generate pow(x, y)
3121 */
3122 LLVMValueRef
3123 lp_build_pow(struct lp_build_context *bld,
3124 LLVMValueRef x,
3125 LLVMValueRef y)
3126 {
3127 /* TODO: optimize the constant case */
3128 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3129 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3130 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3131 __FUNCTION__);
3132 }
3133
3134 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3135 }
3136
3137
3138 /**
3139 * Generate exp(x)
3140 */
3141 LLVMValueRef
3142 lp_build_exp(struct lp_build_context *bld,
3143 LLVMValueRef x)
3144 {
3145 /* log2(e) = 1/log(2) */
3146 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3147 1.4426950408889634);
3148
3149 assert(lp_check_value(bld->type, x));
3150
3151 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3152 }
3153
3154
3155 /**
3156 * Generate log(x)
3157 * Behavior is undefined with infs, 0s and nans
3158 */
3159 LLVMValueRef
3160 lp_build_log(struct lp_build_context *bld,
3161 LLVMValueRef x)
3162 {
3163 /* log(2) */
3164 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3165 0.69314718055994529);
3166
3167 assert(lp_check_value(bld->type, x));
3168
3169 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3170 }
3171
3172 /**
3173 * Generate log(x) that handles edge cases (infs, 0s and nans)
3174 */
3175 LLVMValueRef
3176 lp_build_log_safe(struct lp_build_context *bld,
3177 LLVMValueRef x)
3178 {
3179 /* log(2) */
3180 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3181 0.69314718055994529);
3182
3183 assert(lp_check_value(bld->type, x));
3184
3185 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3186 }
3187
3188
3189 /**
3190 * Generate polynomial.
3191 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3192 */
3193 LLVMValueRef
3194 lp_build_polynomial(struct lp_build_context *bld,
3195 LLVMValueRef x,
3196 const double *coeffs,
3197 unsigned num_coeffs)
3198 {
3199 const struct lp_type type = bld->type;
3200 LLVMValueRef even = NULL, odd = NULL;
3201 LLVMValueRef x2;
3202 unsigned i;
3203
3204 assert(lp_check_value(bld->type, x));
3205
3206 /* TODO: optimize the constant case */
3207 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3208 LLVMIsConstant(x)) {
3209 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3210 __FUNCTION__);
3211 }
3212
3213 /*
3214 * Calculate odd and even terms separately to decrease data dependency
3215 * Ex:
3216 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3217 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3218 */
3219 x2 = lp_build_mul(bld, x, x);
3220
3221 for (i = num_coeffs; i--; ) {
3222 LLVMValueRef coeff;
3223
3224 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3225
3226 if (i % 2 == 0) {
3227 if (even)
3228 even = lp_build_mad(bld, x2, even, coeff);
3229 else
3230 even = coeff;
3231 } else {
3232 if (odd)
3233 odd = lp_build_mad(bld, x2, odd, coeff);
3234 else
3235 odd = coeff;
3236 }
3237 }
3238
3239 if (odd)
3240 return lp_build_mad(bld, odd, x, even);
3241 else if (even)
3242 return even;
3243 else
3244 return bld->undef;
3245 }
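
/*
 * Scalar sketch of the even/odd split above, shown for three coefficients
 * (illustrative only):
 *
 *    x2   = x * x;
 *    even = c0 + x2 * c2;        // c0 + x^2*c2 + x^4*c4 + ...
 *    odd  = c1;                  // c1 + x^2*c3 + x^4*c5 + ...
 *    res  = odd * x + even;
 *
 * Compared with plain Horner evaluation this roughly halves the length of
 * the dependency chain, at the cost of one extra multiply for x^2.
 */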
3246
3247
3248 /**
3249 * Minimax polynomial fit of 2**x, in range [0, 1[
3250 */
3251 const double lp_build_exp2_polynomial[] = {
3252 #if EXP_POLY_DEGREE == 5
3253 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3254 0.693153073200168932794,
3255 0.240153617044375388211,
3256 0.0558263180532956664775,
3257 0.00898934009049466391101,
3258 0.00187757667519147912699
3259 #elif EXP_POLY_DEGREE == 4
3260 1.00000259337069434683,
3261 0.693003834469974940458,
3262 0.24144275689150793076,
3263 0.0520114606103070150235,
3264 0.0135341679161270268764
3265 #elif EXP_POLY_DEGREE == 3
3266 0.999925218562710312959,
3267 0.695833540494823811697,
3268 0.226067155427249155588,
3269 0.0780245226406372992967
3270 #elif EXP_POLY_DEGREE == 2
3271 1.00172476321474503578,
3272 0.657636275736077639316,
3273 0.33718943461968720704
3274 #else
3275 #error
3276 #endif
3277 };
3278
3279
3280 LLVMValueRef
3281 lp_build_exp2(struct lp_build_context *bld,
3282 LLVMValueRef x)
3283 {
3284 LLVMBuilderRef builder = bld->gallivm->builder;
3285 const struct lp_type type = bld->type;
3286 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3287 LLVMValueRef ipart = NULL;
3288 LLVMValueRef fpart = NULL;
3289 LLVMValueRef expipart = NULL;
3290 LLVMValueRef expfpart = NULL;
3291 LLVMValueRef res = NULL;
3292
3293 assert(lp_check_value(bld->type, x));
3294
3295 /* TODO: optimize the constant case */
3296 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3297 LLVMIsConstant(x)) {
3298 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3299 __FUNCTION__);
3300 }
3301
3302 assert(type.floating && type.width == 32);
3303
3304 /* We want to preserve NaN and make sure that for exp2, if x > 128
3305 * the result is INF, and if it's smaller than -126.9 the result is 0. */
3306 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3307 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3308 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3309 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3310
3311 /* ipart = floor(x) */
3312 /* fpart = x - ipart */
3313 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3314
3315 /* expipart = (float) (1 << ipart) */
3316 expipart = LLVMBuildAdd(builder, ipart,
3317 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3318 expipart = LLVMBuildShl(builder, expipart,
3319 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3320 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3321
3322 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3323 ARRAY_SIZE(lp_build_exp2_polynomial));
3324
3325 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3326
3327 return res;
3328 }
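
/*
 * Scalar sketch of the scheme above for 32-bit floats (illustrative only;
 * the real code additionally preserves NaN through the clamp, and poly()
 * stands for the minimax polynomial lp_build_exp2_polynomial):
 *
 *    x = CLAMP(x, -126.99999f, 128.0f);
 *    i = (int)floorf(x);                      // integer part
 *    f = x - (float)i;                        // fractional part in [0, 1)
 *    union { uint32_t u; float f; } two_i;
 *    two_i.u = (uint32_t)(i + 127) << 23;     // 2^i built from the exponent bits
 *    return two_i.f * poly(f);                // poly(f) ~= 2^f on [0, 1)
 */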
3329
3330
3331
3332 /**
3333 * Extract the exponent of an IEEE-754 floating point value.
3334 *
3335 * Optionally apply an integer bias.
3336 *
3337 * Result is an integer value with
3338 *
3339 * ifloor(log2(x)) + bias
3340 */
3341 LLVMValueRef
3342 lp_build_extract_exponent(struct lp_build_context *bld,
3343 LLVMValueRef x,
3344 int bias)
3345 {
3346 LLVMBuilderRef builder = bld->gallivm->builder;
3347 const struct lp_type type = bld->type;
3348 unsigned mantissa = lp_mantissa(type);
3349 LLVMValueRef res;
3350
3351 assert(type.floating);
3352
3353 assert(lp_check_value(bld->type, x));
3354
3355 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3356
3357 res = LLVMBuildLShr(builder, x,
3358 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3359 res = LLVMBuildAnd(builder, res,
3360 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3361 res = LLVMBuildSub(builder, res,
3362 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3363
3364 return res;
3365 }
3366
3367
3368 /**
3369 * Extract the mantissa of a floating point value.
3370 *
3371 * Result is a floating point value with
3372 *
3373 * x / 2**floor(log2(x))
3374 */
3375 LLVMValueRef
3376 lp_build_extract_mantissa(struct lp_build_context *bld,
3377 LLVMValueRef x)
3378 {
3379 LLVMBuilderRef builder = bld->gallivm->builder;
3380 const struct lp_type type = bld->type;
3381 unsigned mantissa = lp_mantissa(type);
3382 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3383 (1ULL << mantissa) - 1);
3384 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3385 LLVMValueRef res;
3386
3387 assert(lp_check_value(bld->type, x));
3388
3389 assert(type.floating);
3390
3391 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3392
3393 /* res = x / 2**ipart */
3394 res = LLVMBuildAnd(builder, x, mantmask, "");
3395 res = LLVMBuildOr(builder, res, one, "");
3396 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3397
3398 return res;
3399 }
3400
3401
3402
3403 /**
3404 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3405 * These coefficients can be generated with
3406 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3407 */
3408 const double lp_build_log2_polynomial[] = {
3409 #if LOG_POLY_DEGREE == 5
3410 2.88539008148777786488L,
3411 0.961796878841293367824L,
3412 0.577058946784739859012L,
3413 0.412914355135828735411L,
3414 0.308591899232910175289L,
3415 0.352376952300281371868L,
3416 #elif LOG_POLY_DEGREE == 4
3417 2.88539009343309178325L,
3418 0.961791550404184197881L,
3419 0.577440339438736392009L,
3420 0.403343858251329912514L,
3421 0.406718052498846252698L,
3422 #elif LOG_POLY_DEGREE == 3
3423 2.88538959748872753838L,
3424 0.961932915889597772928L,
3425 0.571118517972136195241L,
3426 0.493997535084709500285L,
3427 #else
3428 #error
3429 #endif
3430 };
3431
3432 /**
3433 * See http://www.devmaster.net/forums/showthread.php?p=43580
3434 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3435 * http://www.nezumi.demon.co.uk/consult/logx.htm
3436 *
3437 * If handle_edge_cases is true the function will perform computations
3438 * to match the required D3D10+ behavior for each of the edge cases.
3439 * That means that if input is:
3440 * - less than zero (to and including -inf) then NaN will be returned
3441 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3442 * - +infinity, then +infinity will be returned
3443 * - NaN, then NaN will be returned
3444 *
3445 * Those checks are fairly expensive so if you don't need them make sure
3446 * handle_edge_cases is false.
3447 */
3448 void
3449 lp_build_log2_approx(struct lp_build_context *bld,
3450 LLVMValueRef x,
3451 LLVMValueRef *p_exp,
3452 LLVMValueRef *p_floor_log2,
3453 LLVMValueRef *p_log2,
3454 boolean handle_edge_cases)
3455 {
3456 LLVMBuilderRef builder = bld->gallivm->builder;
3457 const struct lp_type type = bld->type;
3458 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3459 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3460
3461 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3462 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3463 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3464
3465 LLVMValueRef i = NULL;
3466 LLVMValueRef y = NULL;
3467 LLVMValueRef z = NULL;
3468 LLVMValueRef exp = NULL;
3469 LLVMValueRef mant = NULL;
3470 LLVMValueRef logexp = NULL;
3471 LLVMValueRef p_z = NULL;
3472 LLVMValueRef res = NULL;
3473
3474 assert(lp_check_value(bld->type, x));
3475
3476 if(p_exp || p_floor_log2 || p_log2) {
3477 /* TODO: optimize the constant case */
3478 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3479 LLVMIsConstant(x)) {
3480 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3481 __FUNCTION__);
3482 }
3483
3484 assert(type.floating && type.width == 32);
3485
3486 /*
3487 * We don't explicitly handle denormalized numbers. They will yield a
3488       * result in the neighbourhood of -127, which appears to be adequate.
3490 */
3491
3492 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3493
3494 /* exp = (float) exponent(x) */
3495 exp = LLVMBuildAnd(builder, i, expmask, "");
3496 }
3497
3498 if(p_floor_log2 || p_log2) {
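      /* floor(log2(x)) for normalized x: shift the exponent field down by
       * 23 bits, subtract the IEEE-754 single precision bias (127) and
       * convert to float.
       */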
3499 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3500 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3501 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3502 }
3503
3504 if (p_log2) {
3505 /* mant = 1 + (float) mantissa(x) */
3506 mant = LLVMBuildAnd(builder, i, mantmask, "");
3507 mant = LLVMBuildOr(builder, mant, one, "");
3508 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3509
3510 /* y = (mant - 1) / (mant + 1) */
3511 y = lp_build_div(bld,
3512 lp_build_sub(bld, mant, bld->one),
3513 lp_build_add(bld, mant, bld->one)
3514 );
3515
3516 /* z = y^2 */
3517 z = lp_build_mul(bld, y, y);
3518
3519 /* compute P(z) */
3520 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3521 ARRAY_SIZE(lp_build_log2_polynomial));
3522
3523 /* y * P(z) + logexp */
3524 res = lp_build_mad(bld, y, p_z, logexp);
3525
3526 if (type.floating && handle_edge_cases) {
3527 LLVMValueRef negmask, infmask, zmask;
3528 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3529 lp_build_const_vec(bld->gallivm, type, 0.0f));
3530 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3531 lp_build_const_vec(bld->gallivm, type, 0.0f));
3532 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3533 lp_build_const_vec(bld->gallivm, type, INFINITY));
3534
3535          /* If x is equal to +inf make sure we return +inf */
3536 res = lp_build_select(bld, infmask,
3537 lp_build_const_vec(bld->gallivm, type, INFINITY),
3538 res);
3539          /* If x is equal to 0, return -inf */
3540 res = lp_build_select(bld, zmask,
3541 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3542 res);
3543 /* If x is nan or less than 0, return nan */
3544 res = lp_build_select(bld, negmask,
3545 lp_build_const_vec(bld->gallivm, type, NAN),
3546 res);
3547 }
3548 }
3549
3550 if (p_exp) {
3551 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3552 *p_exp = exp;
3553 }
3554
3555 if (p_floor_log2)
3556 *p_floor_log2 = logexp;
3557
3558 if (p_log2)
3559 *p_log2 = res;
3560 }
3561
3562
3563 /*
3564 * log2 implementation which doesn't have special code to
3565 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3566 * the results for those cases are undefined.
3567 */
3568 LLVMValueRef
3569 lp_build_log2(struct lp_build_context *bld,
3570 LLVMValueRef x)
3571 {
3572 LLVMValueRef res;
3573 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3574 return res;
3575 }
3576
3577 /*
3578 * Version of log2 which handles all edge cases.
3579 * Look at documentation of lp_build_log2_approx for
3580 * description of the behavior for each of the edge cases.
3581 */
3582 LLVMValueRef
3583 lp_build_log2_safe(struct lp_build_context *bld,
3584 LLVMValueRef x)
3585 {
3586 LLVMValueRef res;
3587 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3588 return res;
3589 }
3590
3591
3592 /**
3593 * Faster (and less accurate) log2.
3594 *
3595 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3596 *
3597 * Piece-wise linear approximation, with exact results when x is a
3598 * power of two.
3599 *
3600 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3601 */
3602 LLVMValueRef
3603 lp_build_fast_log2(struct lp_build_context *bld,
3604 LLVMValueRef x)
3605 {
3606 LLVMBuilderRef builder = bld->gallivm->builder;
3607 LLVMValueRef ipart;
3608 LLVMValueRef fpart;
3609
3610 assert(lp_check_value(bld->type, x));
3611
3612 assert(bld->type.floating);
3613
3614 /* ipart = floor(log2(x)) - 1 */
3615 ipart = lp_build_extract_exponent(bld, x, -1);
3616 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3617
3618 /* fpart = x / 2**ipart */
3619 fpart = lp_build_extract_mantissa(bld, x);
3620
3621 /* ipart + fpart */
3622 return LLVMBuildFAdd(builder, ipart, fpart, "");
3623 }
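/*
 * Illustrative note (not from the original source): for x = 12.0f,
 * ipart = floor(log2(12)) - 1 = 2 and fpart = 12 / 2**3 = 1.5, giving
 * 3.5 versus the exact log2(12) ~= 3.585; for x = 8.0f the result is
 * exactly 3.0.
 */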
3624
3625
3626 /**
3627 * Fast implementation of iround(log2(x)).
3628 *
3629 * Not an approximation -- it should give accurate results all the time.
3630 */
3631 LLVMValueRef
3632 lp_build_ilog2(struct lp_build_context *bld,
3633 LLVMValueRef x)
3634 {
3635 LLVMBuilderRef builder = bld->gallivm->builder;
3636 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3637 LLVMValueRef ipart;
3638
3639 assert(bld->type.floating);
3640
3641 assert(lp_check_value(bld->type, x));
3642
3643 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3644 x = LLVMBuildFMul(builder, x, sqrt2, "");
3645
3646 /* ipart = floor(log2(x) + 0.5) */
3647 ipart = lp_build_extract_exponent(bld, x, 0);
3648
3649 return ipart;
3650 }
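/*
 * Illustrative note (not from the original source): for x = 10.0f,
 * x * sqrt(2) ~= 14.14, whose exponent is 3 = iround(log2(10)). Without the
 * sqrt(2) scaling the extracted exponent would be floor(log2(x)), which for
 * e.g. x = 7.9 gives 2 even though iround(log2(7.9)) is 3.
 */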
3651
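/*
 * Remainder of x divided by y (added note: per LLVM frem/srem semantics,
 * the result of the floating point and signed variants takes the sign of
 * the dividend x).
 */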
3652 LLVMValueRef
3653 lp_build_mod(struct lp_build_context *bld,
3654 LLVMValueRef x,
3655 LLVMValueRef y)
3656 {
3657 LLVMBuilderRef builder = bld->gallivm->builder;
3658 LLVMValueRef res;
3659 const struct lp_type type = bld->type;
3660
3661 assert(lp_check_value(type, x));
3662 assert(lp_check_value(type, y));
3663
3664 if (type.floating)
3665 res = LLVMBuildFRem(builder, x, y, "");
3666 else if (type.sign)
3667 res = LLVMBuildSRem(builder, x, y, "");
3668 else
3669 res = LLVMBuildURem(builder, x, y, "");
3670 return res;
3671 }
3672
3673
3674 /*
3675  * For floating point inputs, creates and returns a mask which is
3676  * all 1's for channels of x which are NaN.
3677  * Channels of x which are not NaN will be all 0's.
3678 */
3679 LLVMValueRef
3680 lp_build_isnan(struct lp_build_context *bld,
3681 LLVMValueRef x)
3682 {
3683 LLVMValueRef mask;
3684 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3685
3686 assert(bld->type.floating);
3687 assert(lp_check_value(bld->type, x));
3688
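   /* An ordered == compare of x with itself is true for every channel
    * except those holding NaN, so invert it to obtain the NaN mask.
    */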
3689 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3690 "isnotnan");
3691 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3692 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3693 return mask;
3694 }
3695
3696 /* Returns all 1's for channels holding finite floating point
3697  * numbers and all 0's for channels holding -inf,
3698  * +inf or NaN. */
3699 LLVMValueRef
3700 lp_build_isfinite(struct lp_build_context *bld,
3701 LLVMValueRef x)
3702 {
3703 LLVMBuilderRef builder = bld->gallivm->builder;
3704 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3705 struct lp_type int_type = lp_int_type(bld->type);
3706 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3707 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3708 0x7f800000);
3709
3710 if (!bld->type.floating) {
3711 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3712 }
3713 assert(bld->type.floating);
3714 assert(lp_check_value(bld->type, x));
3715 assert(bld->type.width == 32);
3716
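   /* x is finite iff its exponent field is not all ones (the encoding
    * used for Inf and NaN).
    */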
3717 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3718 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3719 intx, infornan32);
3720 }
3721
3722 /*
3723  * Returns a mask which is all 1's for channels that are NaN or infinity
3724  * and all 0's otherwise. The input has to be a floating point vector.
3725 */
3726 LLVMValueRef
3727 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3728 const struct lp_type type,
3729 LLVMValueRef x)
3730 {
3731 LLVMBuilderRef builder = gallivm->builder;
3732 struct lp_type int_type = lp_int_type(type);
3733 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3734 0x7f800000);
3735 LLVMValueRef ret;
3736
3737 assert(type.floating);
3738
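   /* Inf and NaN are exactly the values whose exponent field is all ones. */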
3739 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3740 ret = LLVMBuildAnd(builder, ret, const0, "");
3741 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3742 ret, const0);
3743
3744 return ret;
3745 }
3746
3747
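/*
 * Read the current x86 MXCSR register via the stmxcsr intrinsic into a
 * stack allocation and return a pointer to it, or NULL when SSE is not
 * available.
 */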
3748 LLVMValueRef
3749 lp_build_fpstate_get(struct gallivm_state *gallivm)
3750 {
3751 if (util_cpu_caps.has_sse) {
3752 LLVMBuilderRef builder = gallivm->builder;
3753 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3754 gallivm,
3755 LLVMInt32TypeInContext(gallivm->context),
3756 "mxcsr_ptr");
3757 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3758 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3759 lp_build_intrinsic(builder,
3760 "llvm.x86.sse.stmxcsr",
3761 LLVMVoidTypeInContext(gallivm->context),
3762 &mxcsr_ptr8, 1, 0);
3763 return mxcsr_ptr;
3764 }
3765 return 0;
3766 }
3767
3768 void
3769 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3770 boolean zero)
3771 {
3772 if (util_cpu_caps.has_sse) {
3773 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3774 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3775
3776 LLVMBuilderRef builder = gallivm->builder;
3777 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3778 LLVMValueRef mxcsr =
3779 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3780
3781 if (util_cpu_caps.has_daz) {
3782          /* Enable denormals-are-zero mode */
3783 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3784 }
3785 if (zero) {
3786 mxcsr = LLVMBuildOr(builder, mxcsr,
3787 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3788 } else {
3789 mxcsr = LLVMBuildAnd(builder, mxcsr,
3790 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3791 }
3792
3793 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3794 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3795 }
3796 }
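/*
 * Typical usage sketch (an assumption, not taken from the original source):
 * save the FP state, force denormals to zero around generated floating point
 * code, and restore it afterwards:
 *
 *    LLVMValueRef saved = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit float operations ...
 *    lp_build_fpstate_set(gallivm, saved);
 */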
3797
3798 void
3799 lp_build_fpstate_set(struct gallivm_state *gallivm,
3800 LLVMValueRef mxcsr_ptr)
3801 {
3802 if (util_cpu_caps.has_sse) {
3803 LLVMBuilderRef builder = gallivm->builder;
3804 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3805 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3806 lp_build_intrinsic(builder,
3807 "llvm.x86.sse.ldmxcsr",
3808 LLVMVoidTypeInContext(gallivm->context),
3809 &mxcsr_ptr, 1, 0);
3810 }
3811 }
3812