1 /**************************************************************************
2  *
3  * Copyright 2009-2010 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 
29 /**
30  * @file
31  * Helper
32  *
33  * LLVM IR doesn't support all the basic arithmetic operations we care about
34  * (most notably min/max and saturated operations), and it is often necessary
35  * to resort to machine-specific intrinsics directly. The functions here hide
36  * all these implementation details from the other modules.
37  *
38  * We also do simple expression simplification here. The reasons are:
39  * - it is very easy given we have all necessary information readily available
40  * - LLVM optimization passes fail to simplify several vector expressions
41  * - We often know value constraints which the optimization passes have no way
42  *   of knowing, such as when source arguments are known to be in [0, 1] range.
43  *
44  * @author Jose Fonseca <jfonseca@vmware.com>
45  */
46 
47 
48 #include <float.h>
49 
50 #include <llvm/Config/llvm-config.h>
51 
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
56 
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
67 
68 #if defined(PIPE_ARCH_SSE)
69 #include <xmmintrin.h>
70 #endif
71 
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #endif
75 
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #endif
79 
80 #define EXP_POLY_DEGREE 5
81 
82 #define LOG_POLY_DEGREE 4
83 
84 
85 /**
86  * Generate min(a, b)
87  * No checks are done for the special-case values a or b = 1 or 0.
88  * NaNs are handled according to the behavior specified by the
89  * nan_behavior argument.
90  */
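/*
 * For illustration of the NaN rules referred to below: with
 * GALLIVM_NAN_RETURN_OTHER the non-NaN operand wins, e.g.
 *
 *   min(NaN, 1.0) -> 1.0      min(1.0, NaN) -> 1.0
 *
 * while GALLIVM_NAN_BEHAVIOR_UNDEFINED makes no promise when either
 * input is a NaN.
 */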
91 static LLVMValueRef
92 lp_build_min_simple(struct lp_build_context *bld,
93                     LLVMValueRef a,
94                     LLVMValueRef b,
95                     enum gallivm_nan_behavior nan_behavior)
96 {
97    const struct lp_type type = bld->type;
98    const char *intrinsic = NULL;
99    unsigned intr_size = 0;
100    LLVMValueRef cond;
101 
102    assert(lp_check_value(type, a));
103    assert(lp_check_value(type, b));
104 
105    /* TODO: optimize the constant case */
106 
107    if (type.floating && util_get_cpu_caps()->has_sse) {
108       if (type.width == 32) {
109          if (type.length == 1) {
110             intrinsic = "llvm.x86.sse.min.ss";
111             intr_size = 128;
112          }
113          else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
114             intrinsic = "llvm.x86.sse.min.ps";
115             intr_size = 128;
116          }
117          else {
118             intrinsic = "llvm.x86.avx.min.ps.256";
119             intr_size = 256;
120          }
121       }
122       if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
123          if (type.length == 1) {
124             intrinsic = "llvm.x86.sse2.min.sd";
125             intr_size = 128;
126          }
127          else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
128             intrinsic = "llvm.x86.sse2.min.pd";
129             intr_size = 128;
130          }
131          else {
132             intrinsic = "llvm.x86.avx.min.pd.256";
133             intr_size = 256;
134          }
135       }
136    }
137    else if (type.floating && util_get_cpu_caps()->has_altivec) {
138       if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140                       __FUNCTION__);
141       }
142       if (type.width == 32 && type.length == 4) {
143          intrinsic = "llvm.ppc.altivec.vminfp";
144          intr_size = 128;
145       }
146    } else if (util_get_cpu_caps()->has_altivec) {
147       intr_size = 128;
148       if (type.width == 8) {
149          if (!type.sign) {
150             intrinsic = "llvm.ppc.altivec.vminub";
151          } else {
152             intrinsic = "llvm.ppc.altivec.vminsb";
153          }
154       } else if (type.width == 16) {
155          if (!type.sign) {
156             intrinsic = "llvm.ppc.altivec.vminuh";
157          } else {
158             intrinsic = "llvm.ppc.altivec.vminsh";
159          }
160       } else if (type.width == 32) {
161          if (!type.sign) {
162             intrinsic = "llvm.ppc.altivec.vminuw";
163          } else {
164             intrinsic = "llvm.ppc.altivec.vminsw";
165          }
166       }
167    }
168 
169    if (intrinsic) {
170       /* We need to handle NaNs for floating point numbers. If one of the
171        * inputs is NaN, the other should be returned (required by both D3D10+
172        * and OpenCL).
173        * The SSE intrinsics return the second operand in case of NaN by
174        * default, so we need special code to handle those.
175        */
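      /* For example, assuming the usual SSE semantics where the second
       * operand is returned whenever either input is a NaN:
       *   minps(a = NaN, b = 2.0) -> 2.0   (already the "other" operand)
       *   minps(a = 2.0, b = NaN) -> NaN   (hence the isnan(b) select below)
       */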
176       if (util_get_cpu_caps()->has_sse && type.floating &&
177           nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
178          LLVMValueRef isnan, min;
179          min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
180                                                    type,
181                                                    intr_size, a, b);
182          isnan = lp_build_isnan(bld, b);
183          return lp_build_select(bld, isnan, a, min);
184       } else {
185          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
186                                                     type,
187                                                     intr_size, a, b);
188       }
189    }
190 
191    if (type.floating) {
192       switch (nan_behavior) {
193       case GALLIVM_NAN_RETURN_OTHER: {
194          LLVMValueRef isnan = lp_build_isnan(bld, a);
195          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
196          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
197          return lp_build_select(bld, cond, a, b);
198       }
199          break;
200       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
201          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
202          return lp_build_select(bld, cond, a, b);
203       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
204          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
205          return lp_build_select(bld, cond, b, a);
206       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
207          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
208          return lp_build_select(bld, cond, a, b);
209          break;
210       default:
211          assert(0);
212          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
213          return lp_build_select(bld, cond, a, b);
214       }
215    } else {
216       cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
217       return lp_build_select(bld, cond, a, b);
218    }
219 }
220 
221 
222 LLVMValueRef
223 lp_build_fmuladd(LLVMBuilderRef builder,
224                  LLVMValueRef a,
225                  LLVMValueRef b,
226                  LLVMValueRef c)
227 {
228    LLVMTypeRef type = LLVMTypeOf(a);
229    assert(type == LLVMTypeOf(b));
230    assert(type == LLVMTypeOf(c));
231 
232    char intrinsic[32];
233    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
234    LLVMValueRef args[] = { a, b, c };
235    return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
236 }
237 
238 
239 /**
240  * Generate max(a, b)
241  * No checks are done for the special-case values a or b = 1 or 0.
242  * NaNs are handled according to the behavior specified by the
243  * nan_behavior argument.
244  */
245 static LLVMValueRef
246 lp_build_max_simple(struct lp_build_context *bld,
247                     LLVMValueRef a,
248                     LLVMValueRef b,
249                     enum gallivm_nan_behavior nan_behavior)
250 {
251    const struct lp_type type = bld->type;
252    const char *intrinsic = NULL;
253    unsigned intr_size = 0;
254    LLVMValueRef cond;
255 
256    assert(lp_check_value(type, a));
257    assert(lp_check_value(type, b));
258 
259    /* TODO: optimize the constant case */
260 
261    if (type.floating && util_get_cpu_caps()->has_sse) {
262       if (type.width == 32) {
263          if (type.length == 1) {
264             intrinsic = "llvm.x86.sse.max.ss";
265             intr_size = 128;
266          }
267          else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
268             intrinsic = "llvm.x86.sse.max.ps";
269             intr_size = 128;
270          }
271          else {
272             intrinsic = "llvm.x86.avx.max.ps.256";
273             intr_size = 256;
274          }
275       }
276       if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
277          if (type.length == 1) {
278             intrinsic = "llvm.x86.sse2.max.sd";
279             intr_size = 128;
280          }
281          else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
282             intrinsic = "llvm.x86.sse2.max.pd";
283             intr_size = 128;
284          }
285          else {
286             intrinsic = "llvm.x86.avx.max.pd.256";
287             intr_size = 256;
288          }
289       }
290    }
291    else if (type.floating && util_get_cpu_caps()->has_altivec) {
292       if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
293          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
294                       __FUNCTION__);
295       }
296       if (type.width == 32 && type.length == 4) {
297          intrinsic = "llvm.ppc.altivec.vmaxfp";
298          intr_size = 128;
299       }
300    } else if (util_get_cpu_caps()->has_altivec) {
301      intr_size = 128;
302      if (type.width == 8) {
303        if (!type.sign) {
304          intrinsic = "llvm.ppc.altivec.vmaxub";
305        } else {
306          intrinsic = "llvm.ppc.altivec.vmaxsb";
307        }
308      } else if (type.width == 16) {
309        if (!type.sign) {
310          intrinsic = "llvm.ppc.altivec.vmaxuh";
311        } else {
312          intrinsic = "llvm.ppc.altivec.vmaxsh";
313        }
314      } else if (type.width == 32) {
315        if (!type.sign) {
316          intrinsic = "llvm.ppc.altivec.vmaxuw";
317        } else {
318          intrinsic = "llvm.ppc.altivec.vmaxsw";
319        }
320      }
321    }
322 
323    if (intrinsic) {
324       if (util_get_cpu_caps()->has_sse && type.floating &&
325           nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
326          LLVMValueRef isnan, max;
327          max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
328                                                    type,
329                                                    intr_size, a, b);
330          isnan = lp_build_isnan(bld, b);
331          return lp_build_select(bld, isnan, a, max);
332       } else {
333          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
334                                                     type,
335                                                     intr_size, a, b);
336       }
337    }
338 
339    if (type.floating) {
340       switch (nan_behavior) {
341       case GALLIVM_NAN_RETURN_OTHER: {
342          LLVMValueRef isnan = lp_build_isnan(bld, a);
343          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
344          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
345          return lp_build_select(bld, cond, a, b);
346       }
347          break;
348       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
349          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
350          return lp_build_select(bld, cond, a, b);
351       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
352          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
353          return lp_build_select(bld, cond, b, a);
354       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
355          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
356          return lp_build_select(bld, cond, a, b);
357          break;
358       default:
359          assert(0);
360          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
361          return lp_build_select(bld, cond, a, b);
362       }
363    } else {
364       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
365       return lp_build_select(bld, cond, a, b);
366    }
367 }
368 
369 
370 /**
371  * Generate 1 - a, or ~a depending on bld->type.
372  */
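/*
 * For example, for an 8-bit unsigned normalized type, where 1.0 is
 * represented as 255, this is just a bitwise not:
 *
 *   comp(a) = ~a = 255 - a
 */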
373 LLVMValueRef
374 lp_build_comp(struct lp_build_context *bld,
375               LLVMValueRef a)
376 {
377    LLVMBuilderRef builder = bld->gallivm->builder;
378    const struct lp_type type = bld->type;
379 
380    assert(lp_check_value(type, a));
381 
382    if (a == bld->one)
383       return bld->zero;
384    if (a == bld->zero)
385       return bld->one;
386 
387    if (type.norm && !type.floating && !type.fixed && !type.sign) {
388       if (LLVMIsConstant(a))
389          return LLVMConstNot(a);
390       else
391          return LLVMBuildNot(builder, a, "");
392    }
393 
394    if (type.floating)
395       return LLVMBuildFSub(builder, bld->one, a, "");
396    else
397       return LLVMBuildSub(builder, bld->one, a, "");
398 }
399 
400 
401 /**
402  * Generate a + b
403  */
404 LLVMValueRef
405 lp_build_add(struct lp_build_context *bld,
406              LLVMValueRef a,
407              LLVMValueRef b)
408 {
409    LLVMBuilderRef builder = bld->gallivm->builder;
410    const struct lp_type type = bld->type;
411    LLVMValueRef res;
412 
413    assert(lp_check_value(type, a));
414    assert(lp_check_value(type, b));
415 
416    if (a == bld->zero)
417       return b;
418    if (b == bld->zero)
419       return a;
420    if (a == bld->undef || b == bld->undef)
421       return bld->undef;
422 
423    if (type.norm) {
424       const char *intrinsic = NULL;
425 
426       if (!type.sign && (a == bld->one || b == bld->one))
427         return bld->one;
428 
429       if (!type.floating && !type.fixed) {
430          if (LLVM_VERSION_MAJOR >= 8) {
431             char intrin[32];
432             intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
433             lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
434             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
435          }
436          if (type.width * type.length == 128) {
437             if (util_get_cpu_caps()->has_sse2) {
438                if (type.width == 8)
439                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
440                if (type.width == 16)
441                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
442             } else if (util_get_cpu_caps()->has_altivec) {
443                if (type.width == 8)
444                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
445                if (type.width == 16)
446                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
447             }
448          }
449          if (type.width * type.length == 256) {
450             if (util_get_cpu_caps()->has_avx2) {
451                if (type.width == 8)
452                   intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
453                if (type.width == 16)
454                   intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
455             }
456          }
457       }
458 
459       if (intrinsic)
460          return lp_build_intrinsic_binary(builder, intrinsic,
461                        lp_build_vec_type(bld->gallivm, bld->type), a, b);
462    }
463 
464    if (type.norm && !type.floating && !type.fixed) {
465       if (type.sign) {
466          uint64_t sign = (uint64_t)1 << (type.width - 1);
467          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
468          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
469          /* a_clamp_max is the maximum a for positive b,
470             a_clamp_min is the minimum a for negative b. */
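         /* E.g. for 16-bit signed lanes max_val = 32767, so with b = 1000
          * a is clamped to at most 32767 - 1000 = 31767 and a + b cannot
          * overflow; the min_val clamp mirrors this for negative b. */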
471          LLVMValueRef a_clamp_max =
472             lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""),
473                                 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
474          LLVMValueRef a_clamp_min =
475             lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""),
476                                 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
477          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b,
478                                      bld->zero), a_clamp_max, a_clamp_min);
479       }
480    }
481 
482    if (type.floating)
483       res = LLVMBuildFAdd(builder, a, b, "");
484    else
485       res = LLVMBuildAdd(builder, a, b, "");
486 
487    /* clamp to ceiling of 1.0 */
488    if (bld->type.norm && (bld->type.floating || bld->type.fixed))
489       res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
490 
491    if (type.norm && !type.floating && !type.fixed) {
492       if (!type.sign) {
493          /*
494           * Newer llvm versions no longer support the intrinsics, but recognize
495           * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
496           * code, it is important that we match the pattern llvm uses (and pray
497           * llvm doesn't change it - and hope they decide on the same pattern
498           * for all backends supporting it...).
499           * NOTE: cmp/select does sext/trunc of the mask. This does not seem to
500           * interfere with llvm's ability to recognize the pattern but seems
501           * a bit brittle.
502           * NOTE: llvm 9+ always uses the (non arch specific) intrinsic.
503           */
504          LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
505          res = lp_build_select(bld, overflowed,
506                                LLVMConstAllOnes(bld->int_vec_type), res);
507       }
508    }
509 
510    /* XXX clamp to floor of -1 or 0??? */
511 
512    return res;
513 }
514 
515 
516 /** Return the scalar sum of the elements of a.
517  * This operation should be avoided whenever possible.
518  */
519 LLVMValueRef
520 lp_build_horizontal_add(struct lp_build_context *bld,
521                         LLVMValueRef a)
522 {
523    LLVMBuilderRef builder = bld->gallivm->builder;
524    const struct lp_type type = bld->type;
525    LLVMValueRef index, res;
526    unsigned i, length;
527    LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
528    LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
529    LLVMValueRef vecres, elem2;
530 
531    assert(lp_check_value(type, a));
532 
533    if (type.length == 1) {
534       return a;
535    }
536 
537    assert(!bld->type.norm);
538 
539    /*
540     * For byte vectors we could do much better with psadbw.
541     * Using repeated shuffle/adds here. Note that with multiple vectors
542     * this can be done more efficiently as outlined in the Intel
543     * optimization manual.
544     * Note: could cause data rearrangement if used with smaller element
545     * sizes.
546     */
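   /*
    * Sketch of the reduction for a 4-wide vector {a0, a1, a2, a3}:
    *   step 1: {a0, a1} + {a2, a3} -> {a0+a2, a1+a3}
    *   final:  extract the two elements and add -> a0+a1+a2+a3
    */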
547 
548    vecres = a;
549    length = type.length / 2;
550    while (length > 1) {
551       LLVMValueRef vec1, vec2;
552       for (i = 0; i < length; i++) {
553          shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
554          shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
555       }
556       vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
557                                     LLVMConstVector(shuffles1, length), "");
558       vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
559                                     LLVMConstVector(shuffles2, length), "");
560       if (type.floating) {
561          vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
562       }
563       else {
564          vecres = LLVMBuildAdd(builder, vec1, vec2, "");
565       }
566       length = length >> 1;
567    }
568 
569    /* always have vector of size 2 here */
570    assert(length == 1);
571 
572    index = lp_build_const_int32(bld->gallivm, 0);
573    res = LLVMBuildExtractElement(builder, vecres, index, "");
574    index = lp_build_const_int32(bld->gallivm, 1);
575    elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
576 
577    if (type.floating)
578       res = LLVMBuildFAdd(builder, res, elem2, "");
579     else
580       res = LLVMBuildAdd(builder, res, elem2, "");
581 
582    return res;
583 }
584 
585 
586 /**
587  * Return the horizontal sums of 4 float vectors as a float4 vector.
588  * This uses the technique outlined in the Intel Optimization Manual.
589  */
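/*
 * Sketch of the data flow below for src[0..3] = x, y, z, w (4 floats each):
 *   tmp[0] = {x0, x1, y0, y1}   tmp[1] = {x2, x3, y2, y3}
 *   tmp[2] = {z0, z1, w0, w1}   tmp[3] = {z2, z3, w2, w3}
 *   sumtmp[0] = {x0+x2, x1+x3, y0+y2, y1+y3}
 *   sumtmp[1] = {z0+z2, z1+z3, w0+w2, w1+w3}
 *   result    = {sum(x), sum(y), sum(z), sum(w)}
 */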
590 static LLVMValueRef
591 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
592                             LLVMValueRef src[4])
593 {
594    struct gallivm_state *gallivm = bld->gallivm;
595    LLVMBuilderRef builder = gallivm->builder;
596    LLVMValueRef shuffles[4];
597    LLVMValueRef tmp[4];
598    LLVMValueRef sumtmp[2], shuftmp[2];
599 
600    /* lower half of regs */
601    shuffles[0] = lp_build_const_int32(gallivm, 0);
602    shuffles[1] = lp_build_const_int32(gallivm, 1);
603    shuffles[2] = lp_build_const_int32(gallivm, 4);
604    shuffles[3] = lp_build_const_int32(gallivm, 5);
605    tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
606                                    LLVMConstVector(shuffles, 4), "");
607    tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
608                                    LLVMConstVector(shuffles, 4), "");
609 
610    /* upper half of regs */
611    shuffles[0] = lp_build_const_int32(gallivm, 2);
612    shuffles[1] = lp_build_const_int32(gallivm, 3);
613    shuffles[2] = lp_build_const_int32(gallivm, 6);
614    shuffles[3] = lp_build_const_int32(gallivm, 7);
615    tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
616                                    LLVMConstVector(shuffles, 4), "");
617    tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
618                                    LLVMConstVector(shuffles, 4), "");
619 
620    sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
621    sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
622 
623    shuffles[0] = lp_build_const_int32(gallivm, 0);
624    shuffles[1] = lp_build_const_int32(gallivm, 2);
625    shuffles[2] = lp_build_const_int32(gallivm, 4);
626    shuffles[3] = lp_build_const_int32(gallivm, 6);
627    shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
628                                        LLVMConstVector(shuffles, 4), "");
629 
630    shuffles[0] = lp_build_const_int32(gallivm, 1);
631    shuffles[1] = lp_build_const_int32(gallivm, 3);
632    shuffles[2] = lp_build_const_int32(gallivm, 5);
633    shuffles[3] = lp_build_const_int32(gallivm, 7);
634    shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
635                                        LLVMConstVector(shuffles, 4), "");
636 
637    return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
638 }
639 
640 
641 /*
642  * Partially horizontally add 2-4 float vectors of length nx4,
643  * i.e. only four adjacent values in each vector will be added,
644  * assuming the values are really grouped in 4, which also determines
645  * the output order.
646  *
647  * Return a vector of the same length as the initial vectors,
648  * with the excess elements (if any) being undefined.
649  * The element order is independent of the number of input vectors.
650  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
651  * the output order thus will be
652  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
653  */
654 LLVMValueRef
655 lp_build_hadd_partial4(struct lp_build_context *bld,
656                        LLVMValueRef vectors[],
657                        unsigned num_vecs)
658 {
659    struct gallivm_state *gallivm = bld->gallivm;
660    LLVMBuilderRef builder = gallivm->builder;
661    LLVMValueRef ret_vec;
662    LLVMValueRef tmp[4];
663    const char *intrinsic = NULL;
664 
665    assert(num_vecs >= 2 && num_vecs <= 4);
666    assert(bld->type.floating);
667 
668    /* only use this with at least 2 vectors, as it is sort of expensive
669     * (depending on cpu) and we always need two horizontal adds anyway,
670     * so a shuffle/add approach might be better.
671     */
672 
673    tmp[0] = vectors[0];
674    tmp[1] = vectors[1];
675 
676    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
677    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
678 
679    if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
680        bld->type.length == 4) {
681       intrinsic = "llvm.x86.sse3.hadd.ps";
682    }
683    else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
684             bld->type.length == 8) {
685       intrinsic = "llvm.x86.avx.hadd.ps.256";
686    }
687    if (intrinsic) {
688       tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
689                                        lp_build_vec_type(gallivm, bld->type),
690                                        tmp[0], tmp[1]);
691       if (num_vecs > 2) {
692          tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
693                                           lp_build_vec_type(gallivm, bld->type),
694                                           tmp[2], tmp[3]);
695       }
696       else {
697          tmp[1] = tmp[0];
698       }
699       return lp_build_intrinsic_binary(builder, intrinsic,
700                                        lp_build_vec_type(gallivm, bld->type),
701                                        tmp[0], tmp[1]);
702    }
703 
704    if (bld->type.length == 4) {
705       ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
706    }
707    else {
708       LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
709       unsigned j;
710       unsigned num_iter = bld->type.length / 4;
711       struct lp_type parttype = bld->type;
712       parttype.length = 4;
713       for (j = 0; j < num_iter; j++) {
714          LLVMValueRef partsrc[4];
715          unsigned i;
716          for (i = 0; i < 4; i++) {
717             partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
718          }
719          partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
720       }
721       ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
722    }
723    return ret_vec;
724 }
725 
726 
727 /**
728  * Generate a - b
729  */
730 LLVMValueRef
731 lp_build_sub(struct lp_build_context *bld,
732              LLVMValueRef a,
733              LLVMValueRef b)
734 {
735    LLVMBuilderRef builder = bld->gallivm->builder;
736    const struct lp_type type = bld->type;
737    LLVMValueRef res;
738 
739    assert(lp_check_value(type, a));
740    assert(lp_check_value(type, b));
741 
742    if (b == bld->zero)
743       return a;
744    if (a == bld->undef || b == bld->undef)
745       return bld->undef;
746    if (a == b)
747       return bld->zero;
748 
749    if (type.norm) {
750       const char *intrinsic = NULL;
751 
752       if (!type.sign && b == bld->one)
753         return bld->zero;
754 
755       if (!type.floating && !type.fixed) {
756          if (LLVM_VERSION_MAJOR >= 8) {
757             char intrin[32];
758             intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
759             lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
760             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
761          }
762          if (type.width * type.length == 128) {
763             if (util_get_cpu_caps()->has_sse2) {
764                if (type.width == 8)
765                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
766                if (type.width == 16)
767                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
768             } else if (util_get_cpu_caps()->has_altivec) {
769                if (type.width == 8)
770                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
771                if (type.width == 16)
772                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
773             }
774          }
775          if (type.width * type.length == 256) {
776             if (util_get_cpu_caps()->has_avx2) {
777                if (type.width == 8)
778                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
779                if (type.width == 16)
780                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
781             }
782          }
783       }
784 
785       if (intrinsic)
786          return lp_build_intrinsic_binary(builder, intrinsic,
787                       lp_build_vec_type(bld->gallivm, bld->type), a, b);
788    }
789 
790    if (type.norm && !type.floating && !type.fixed) {
791       if (type.sign) {
792          uint64_t sign = (uint64_t)1 << (type.width - 1);
793          LLVMValueRef max_val =
794             lp_build_const_int_vec(bld->gallivm, type, sign - 1);
795          LLVMValueRef min_val =
796             lp_build_const_int_vec(bld->gallivm, type, sign);
797          /* a_clamp_max is the maximum a for negative b,
798             a_clamp_min is the minimum a for positive b. */
799          LLVMValueRef a_clamp_max =
800             lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""),
801                                 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
802          LLVMValueRef a_clamp_min =
803             lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""),
804                                 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
805          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b,
806                                                bld->zero),
807                              a_clamp_min, a_clamp_max);
808       } else {
809          /*
810           * This must match the llvm pattern for saturated unsigned sub.
811           * (lp_build_max_simple actually does the job with its current
812           * definition, but do it explicitly here.)
813           * NOTE: cmp/select does sext/trunc of the mask. This does not seem to
814           * interfere with llvm's ability to recognize the pattern but seems
815           * a bit brittle.
816           * NOTE: llvm 9+ always uses the (non arch specific) intrinsic.
817           */
818          LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
819          a = lp_build_select(bld, no_ov, a, b);
820       }
821    }
822 
823    if (type.floating)
824       res = LLVMBuildFSub(builder, a, b, "");
825    else
826       res = LLVMBuildSub(builder, a, b, "");
827 
828    if (bld->type.norm && (bld->type.floating || bld->type.fixed))
829       res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
830 
831    return res;
832 }
833 
834 
835 /**
836  * Normalized multiplication.
837  *
838  * There are several approaches for (using 8-bit normalized multiplication as
839  * an example):
840  *
841  * - alpha plus one
842  *
843  *     makes the following approximation to the division (Sree)
844  *
845  *       a*b/255 ~= (a*(b + 1)) >> 8
846  *
847  *     which is the fastest method that satisfies the following OpenGL
848  *     criteria of
849  *
850  *       0*0 = 0 and 255*255 = 255
851  *
852  * - geometric series
853  *
854  *     takes the geometric series approximation to the division
855  *
856  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) + ...
857  *
858  *     in this case using just the first two terms to fit in 16-bit arithmetic
859  *
860  *       t/255 ~= (t + (t >> 8)) >> 8
861  *
862  *     note that by itself it doesn't satisfy the OpenGL criteria, as it
863  *     yields 255*255 = 254, so the special case b = 255 must be accounted
864  *     for, or rounding must be used.
865  *
866  * - geometric series plus rounding
867  *
868  *     when using the geometric series division, instead of truncating the
869  *     result, use rounding in the approximation (Jim Blinn)
870  *
871  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
872  *
873  *     which achieves exact results.
874  *
875  *
876  *
877  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
878  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
879  * @sa Michael Herf, The "double blend trick", May 2000,
880  *     http://www.stereopsis.com/doubleblend.html
881  */
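/*
 * Worked example of the three 8-bit approaches above for a = b = 255
 * (the exact result is 255*255/255 = 255), with t = 255*255 = 65025:
 *
 *   alpha plus one:          (255 * 256) >> 8             = 255
 *   geometric series:        (t + (t >> 8)) >> 8          = 254
 *   series plus rounding:    (t + (t >> 8) + 0x80) >> 8   = 255
 */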
882 LLVMValueRef
883 lp_build_mul_norm(struct gallivm_state *gallivm,
884                   struct lp_type wide_type,
885                   LLVMValueRef a, LLVMValueRef b)
886 {
887    LLVMBuilderRef builder = gallivm->builder;
888    struct lp_build_context bld;
889    unsigned n;
890    LLVMValueRef half;
891    LLVMValueRef ab;
892 
893    assert(!wide_type.floating);
894    assert(lp_check_value(wide_type, a));
895    assert(lp_check_value(wide_type, b));
896 
897    lp_build_context_init(&bld, gallivm, wide_type);
898 
899    n = wide_type.width / 2;
900    if (wide_type.sign) {
901       --n;
902    }
903 
904    /*
905     * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
906     * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
907     */
908 
909    /*
910     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
911     */
912 
913    ab = LLVMBuildMul(builder, a, b, "");
914    ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
915 
916    /*
917     * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
918     */
919 
920    half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
921    if (wide_type.sign) {
922       LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
923       LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
924       half = lp_build_select(&bld, sign, minus_half, half);
925    }
926    ab = LLVMBuildAdd(builder, ab, half, "");
927 
928    /* Final division */
929    ab = lp_build_shr_imm(&bld, ab, n);
930 
931    return ab;
932 }
933 
934 
935 /**
936  * Generate a * b
937  */
938 LLVMValueRef
939 lp_build_mul(struct lp_build_context *bld,
940              LLVMValueRef a,
941              LLVMValueRef b)
942 {
943    LLVMBuilderRef builder = bld->gallivm->builder;
944    const struct lp_type type = bld->type;
945 
946    assert(lp_check_value(type, a));
947    assert(lp_check_value(type, b));
948 
949    if (a == bld->zero)
950       return bld->zero;
951    if (a == bld->one)
952       return b;
953    if (b == bld->zero)
954       return bld->zero;
955    if (b == bld->one)
956       return a;
957    if (a == bld->undef || b == bld->undef)
958       return bld->undef;
959 
960    if (!type.floating && !type.fixed && type.norm) {
961       struct lp_type wide_type = lp_wider_type(type);
962       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
963 
964       lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
965       lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
966 
967       /* PMULLW, PSRLW, PADDW */
968       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
969       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
970 
971       ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
972 
973       return ab;
974    }
975 
976    LLVMValueRef shift = type.fixed
977       ? lp_build_const_int_vec(bld->gallivm, type, type.width/2) : NULL;
978 
979    LLVMValueRef res;
980    if (type.floating)
981       res = LLVMBuildFMul(builder, a, b, "");
982    else
983       res = LLVMBuildMul(builder, a, b, "");
984    if (shift) {
985       if (type.sign)
986          res = LLVMBuildAShr(builder, res, shift, "");
987       else
988          res = LLVMBuildLShr(builder, res, shift, "");
989    }
990 
991    return res;
992 }
993 
994 
995 /*
996  * Widening mul, valid for 32x32 bit -> 64bit only.
997  * Result is low 32bits, high bits returned in res_hi.
998  *
999  * Emits code that is meant to be compiled for the host CPU.
1000  */
1001 LLVMValueRef
1002 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1003                          LLVMValueRef a,
1004                          LLVMValueRef b,
1005                          LLVMValueRef *res_hi)
1006 {
1007    struct gallivm_state *gallivm = bld->gallivm;
1008    LLVMBuilderRef builder = gallivm->builder;
1009 
1010    assert(bld->type.width == 32);
1011    assert(bld->type.floating == 0);
1012    assert(bld->type.fixed == 0);
1013    assert(bld->type.norm == 0);
1014 
1015    /*
1016     * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1017     * for x86 simd is atrocious (even if the high bits weren't required),
1018     * trying to handle real 64bit inputs (which of course can't happen due
1019     * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1020     * apparently llvm does not recognize this widening mul). This includes 6
1021     * (instead of 2) pmuludq plus extra adds and shifts
1022     * The same story applies to signed mul, albeit fixing this requires sse41.
1023     * https://llvm.org/bugs/show_bug.cgi?id=30845
1024     * So, whip up our own code, albeit only for length 4 and 8 (which
1025     * should be good enough)...
1026     * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1027     * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1028     * for signed), which the fallback code does not, without this llvm
1029     * will likely still produce atrocious code.
1030     */
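   /*
    * Rough shape of the code below: pmuludq/pmuldq multiply the even-numbered
    * 32-bit lanes into 64-bit products, so the odd lanes are first shuffled
    * into even positions, both products are computed, and the low/high 32-bit
    * halves are then interleaved back into result order.
    */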
1031    if (LLVM_VERSION_MAJOR < 7 &&
1032        (bld->type.length == 4 || bld->type.length == 8) &&
1033        ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
1034         util_get_cpu_caps()->has_sse4_1)) {
1035       const char *intrinsic = NULL;
1036       LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1037       LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1038       struct lp_type type_wide = lp_wider_type(bld->type);
1039       LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1040       unsigned i;
1041       for (i = 0; i < bld->type.length; i += 2) {
1042          shuf[i] = lp_build_const_int32(gallivm, i+1);
1043          shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1044       }
1045       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1046       aeven = a;
1047       beven = b;
1048       aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1049       bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1050 
1051       if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
1052          if (bld->type.sign) {
1053             intrinsic = "llvm.x86.avx2.pmul.dq";
1054          } else {
1055             intrinsic = "llvm.x86.avx2.pmulu.dq";
1056          }
1057          muleven = lp_build_intrinsic_binary(builder, intrinsic,
1058                                              wider_type, aeven, beven);
1059          mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1060                                             wider_type, aodd, bodd);
1061       }
1062       else {
1063          /* for consistent naming look elsewhere... */
1064          if (bld->type.sign) {
1065             intrinsic = "llvm.x86.sse41.pmuldq";
1066          } else {
1067             intrinsic = "llvm.x86.sse2.pmulu.dq";
1068          }
1069          /*
1070           * XXX If we only have AVX but not AVX2 this is a pain.
1071           * lp_build_intrinsic_binary_anylength() can't handle it
1072           * (due to src and dst type not being identical).
1073           */
1074          if (bld->type.length == 8) {
1075             LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1076             LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1077             LLVMValueRef muleven2[2], mulodd2[2];
1078             struct lp_type type_wide_half = type_wide;
1079             LLVMTypeRef wtype_half;
1080             type_wide_half.length = 2;
1081             wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1082             aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1083             aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1084             bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1085             bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1086             aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1087             aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1088             boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1089             boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1090             muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1091                                                     wtype_half, aevenlo, bevenlo);
1092             mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1093                                                    wtype_half, aoddlo, boddlo);
1094             muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1095                                                     wtype_half, aevenhi, bevenhi);
1096             mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1097                                                    wtype_half, aoddhi, boddhi);
1098             muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1099             mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1100 
1101          }
1102          else {
1103             muleven = lp_build_intrinsic_binary(builder, intrinsic,
1104                                                 wider_type, aeven, beven);
1105             mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1106                                                wider_type, aodd, bodd);
1107          }
1108       }
1109       muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1110       mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1111 
1112       for (i = 0; i < bld->type.length; i += 2) {
1113          shuf[i] = lp_build_const_int32(gallivm, i + 1);
1114          shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1115       }
1116       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1117       *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1118 
1119       for (i = 0; i < bld->type.length; i += 2) {
1120          shuf[i] = lp_build_const_int32(gallivm, i);
1121          shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1122       }
1123       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1124       return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1125    }
1126    else {
1127       return lp_build_mul_32_lohi(bld, a, b, res_hi);
1128    }
1129 }
1130 
1131 
1132 /*
1133  * Widening mul, valid for <= 32 (8, 16, 32) -> 64
1134  * Result is low N bits, high bits returned in res_hi.
1135  *
1136  * Emits generic code.
1137  */
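/*
 * For example, for unsigned 32-bit lanes with a = 0xffffffff and b = 2 the
 * full product is 0x1fffffffe, so res_lo = 0xfffffffe and *res_hi = 0x1.
 */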
1138 LLVMValueRef
1139 lp_build_mul_32_lohi(struct lp_build_context *bld,
1140                      LLVMValueRef a,
1141                      LLVMValueRef b,
1142                      LLVMValueRef *res_hi)
1143 {
1144    struct gallivm_state *gallivm = bld->gallivm;
1145    LLVMBuilderRef builder = gallivm->builder;
1146    LLVMValueRef tmp, shift, res_lo;
1147    struct lp_type type_tmp;
1148    LLVMTypeRef wide_type, narrow_type;
1149 
1150    type_tmp = bld->type;
1151    narrow_type = lp_build_vec_type(gallivm, type_tmp);
1152    if (bld->type.width < 32)
1153       type_tmp.width = 32;
1154    else
1155       type_tmp.width *= 2;
1156    wide_type = lp_build_vec_type(gallivm, type_tmp);
1157    shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);
1158 
1159    if (bld->type.sign) {
1160       a = LLVMBuildSExt(builder, a, wide_type, "");
1161       b = LLVMBuildSExt(builder, b, wide_type, "");
1162    } else {
1163       a = LLVMBuildZExt(builder, a, wide_type, "");
1164       b = LLVMBuildZExt(builder, b, wide_type, "");
1165    }
1166    tmp = LLVMBuildMul(builder, a, b, "");
1167 
1168    res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1169 
1170    /* Since we truncate anyway, LShr and AShr are equivalent. */
1171    tmp = LLVMBuildLShr(builder, tmp, shift, "");
1172    *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1173 
1174    return res_lo;
1175 }
1176 
1177 
1178 /* a * b + c */
1179 LLVMValueRef
1180 lp_build_mad(struct lp_build_context *bld,
1181              LLVMValueRef a,
1182              LLVMValueRef b,
1183              LLVMValueRef c)
1184 {
1185    const struct lp_type type = bld->type;
1186    if (type.floating) {
1187       return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1188    } else {
1189       return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1190    }
1191 }
1192 
1193 
1194 /**
1195  * Small vector x scale multiplication optimization.
1196  */
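/*
 * E.g. lp_build_mul_imm(bld, a, 8) becomes a << 3 for integer types and
 * lp_build_mul_imm(bld, a, -1) becomes a negation; other constants fall
 * back to a regular multiply by a constant vector.
 */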
1197 LLVMValueRef
1198 lp_build_mul_imm(struct lp_build_context *bld,
1199                  LLVMValueRef a,
1200                  int b)
1201 {
1202    LLVMBuilderRef builder = bld->gallivm->builder;
1203    LLVMValueRef factor;
1204 
1205    assert(lp_check_value(bld->type, a));
1206 
1207    if (b == 0)
1208       return bld->zero;
1209 
1210    if (b == 1)
1211       return a;
1212 
1213    if (b == -1)
1214       return lp_build_negate(bld, a);
1215 
1216    if (b == 2 && bld->type.floating)
1217       return lp_build_add(bld, a, a);
1218 
1219    if (util_is_power_of_two_or_zero(b)) {
1220       unsigned shift = ffs(b) - 1;
1221 
1222       if (bld->type.floating) {
1223 #if 0
1224          /*
1225           * Power of two multiplication by directly manipulating the exponent.
1226           *
1227           * XXX: This might not always be faster; it will introduce a small
1228           * error for multiplication by zero, and it will produce wrong results
1229           * for Inf and NaN.
1230           */
1231          unsigned mantissa = lp_mantissa(bld->type);
1232          factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1233          a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1234          a = LLVMBuildAdd(builder, a, factor, "");
1235          a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1236          return a;
1237 #endif
1238       }
1239       else {
1240          factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1241          return LLVMBuildShl(builder, a, factor, "");
1242       }
1243    }
1244 
1245    factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1246    return lp_build_mul(bld, a, factor);
1247 }
1248 
1249 
1250 /**
1251  * Generate a / b
1252  */
1253 LLVMValueRef
1254 lp_build_div(struct lp_build_context *bld,
1255              LLVMValueRef a,
1256              LLVMValueRef b)
1257 {
1258    LLVMBuilderRef builder = bld->gallivm->builder;
1259    const struct lp_type type = bld->type;
1260 
1261    assert(lp_check_value(type, a));
1262    assert(lp_check_value(type, b));
1263 
1264    if (a == bld->zero)
1265       return bld->zero;
1266    if (a == bld->one && type.floating)
1267       return lp_build_rcp(bld, b);
1268    if (b == bld->zero)
1269       return bld->undef;
1270    if (b == bld->one)
1271       return a;
1272    if (a == bld->undef || b == bld->undef)
1273       return bld->undef;
1274 
1275    /* fast rcp is disabled (just uses div), so it makes no sense to try that */
1276    if (FALSE &&
1277       ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
1278        (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
1279       type.floating)
1280       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1281 
1282    if (type.floating)
1283       return LLVMBuildFDiv(builder, a, b, "");
1284    else if (type.sign)
1285       return LLVMBuildSDiv(builder, a, b, "");
1286    else
1287       return LLVMBuildUDiv(builder, a, b, "");
1288 }
1289 
1290 
1291 /**
1292  * Linear interpolation helper.
1293  *
1294  * @param flags  LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1295  *        interpolating normalized values encoded in twice-as-wide integers.
1296  *
1297  * @sa http://www.stereopsis.com/doubleblend.html
1298  */
1299 static inline LLVMValueRef
1300 lp_build_lerp_simple(struct lp_build_context *bld,
1301                      LLVMValueRef x,
1302                      LLVMValueRef v0,
1303                      LLVMValueRef v1,
1304                      unsigned flags)
1305 {
1306    unsigned half_width = bld->type.width/2;
1307    LLVMBuilderRef builder = bld->gallivm->builder;
1308    LLVMValueRef delta;
1309    LLVMValueRef res;
1310 
1311    assert(lp_check_value(bld->type, x));
1312    assert(lp_check_value(bld->type, v0));
1313    assert(lp_check_value(bld->type, v1));
1314 
1315    delta = lp_build_sub(bld, v1, v0);
1316 
1317    if (bld->type.floating) {
1318       assert(flags == 0);
1319       return lp_build_mad(bld, x, delta, v0);
1320    }
1321 
1322    if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1323       if (!bld->type.sign) {
1324          if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1325             /*
1326              * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1327              * most-significant bit to the least-significant bit, so that
1328              * later we can just divide by 2**n instead of 2**n - 1.
1329              */
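            /* E.g. with 8-bit weights held in 16-bit lanes (n = 8):
             * x = 255 becomes 255 + (255 >> 7) = 256, so the later shift
             * by 8 effectively divides by 256 rather than by 255. */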
1330 
1331             x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1332          }
1333 
1334          /* (x * delta) >> n */
1335          /*
1336           * For this multiply, higher internal precision is required to pass
1337           * CTS; the most efficient path to that is pmulhrsw on ssse3 and
1338           * above.  This could be open-coded on other arches if conformance
1339           * were required.
1340           */
1341          if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) {
1342             res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1343             res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1344          } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) {
1345             res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1346             res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1347          } else {
1348             res = lp_build_mul(bld, x, delta);
1349             res = lp_build_shr_imm(bld, res, half_width);
1350          }
1351       } else {
1352          /*
1353           * The rescaling trick above doesn't work for signed numbers, so
1354           * use the 2**n - 1 division approximation in lp_build_mul_norm
1355           * instead.
1356           */
1357          assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1358          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1359       }
1360    } else {
1361       assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1362       res = lp_build_mul(bld, x, delta);
1363    }
1364 
1365    if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1366       /*
1367        * At this point both res and v0 only use the lower half of the bits,
1368        * the rest is zero. Instead of add / mask, do add with half wide type.
1369        */
1370       struct lp_type narrow_type;
1371       struct lp_build_context narrow_bld;
1372 
1373       memset(&narrow_type, 0, sizeof narrow_type);
1374       narrow_type.sign   = bld->type.sign;
1375       narrow_type.width  = bld->type.width/2;
1376       narrow_type.length = bld->type.length*2;
1377 
1378       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1379       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1380       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1381       res = lp_build_add(&narrow_bld, v0, res);
1382       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1383    } else {
1384       res = lp_build_add(bld, v0, res);
1385 
1386       if (bld->type.fixed) {
1387          /*
1388           * We need to mask out the high order bits when lerping 8-bit
1389           * normalized colors stored in 16 bits.
1390           */
1391          /* XXX: This step is necessary for lerping 8-bit colors stored in
1392           * 16 bits, but it will be wrong for true fixed point use cases.
1393           * Basically we need a more powerful lp_type, capable of further
1394           * distinguishing the value's interpretation from its storage.
1395           */
1396          LLVMValueRef low_bits;
1397          low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1398          res = LLVMBuildAnd(builder, res, low_bits, "");
1399       }
1400    }
1401 
1402    return res;
1403 }
1404 
1405 
1406 /**
1407  * Linear interpolation.
1408  */
1409 LLVMValueRef
1410 lp_build_lerp(struct lp_build_context *bld,
1411               LLVMValueRef x,
1412               LLVMValueRef v0,
1413               LLVMValueRef v1,
1414               unsigned flags)
1415 {
1416    const struct lp_type type = bld->type;
1417    LLVMValueRef res;
1418 
1419    assert(lp_check_value(type, x));
1420    assert(lp_check_value(type, v0));
1421    assert(lp_check_value(type, v1));
1422 
1423    assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1424 
1425    if (type.norm) {
1426       struct lp_type wide_type;
1427       struct lp_build_context wide_bld;
1428       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1429 
1430       assert(type.length >= 2);
1431 
1432       /*
1433        * Create a wider integer type, enough to hold the
1434        * intermediate result of the multiplication.
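       * (E.g. a vector of 16 x uint8 becomes two vectors of 8 x uint16;
       * each half is lerped separately and the results are packed back.)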
1435        */
1436       memset(&wide_type, 0, sizeof wide_type);
1437       wide_type.sign   = type.sign;
1438       wide_type.width  = type.width*2;
1439       wide_type.length = type.length/2;
1440 
1441       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1442 
1443       lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1444       lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1445       lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1446 
1447       /*
1448        * Lerp both halves.
1449        */
1450 
1451       flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1452 
1453       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1454       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1455 
1456       res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1457    } else {
1458       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1459    }
1460 
1461    return res;
1462 }
1463 
1464 
1465 /**
1466  * Bilinear interpolation.
1467  *
1468  * Value indices are given as v_{yx}: y selects the row, x the column.
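 *
 * Equivalent to computing
 *   (1-y)*((1-x)*v00 + x*v01) + y*((1-x)*v10 + x*v11).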
1469  */
1470 LLVMValueRef
1471 lp_build_lerp_2d(struct lp_build_context *bld,
1472                  LLVMValueRef x,
1473                  LLVMValueRef y,
1474                  LLVMValueRef v00,
1475                  LLVMValueRef v01,
1476                  LLVMValueRef v10,
1477                  LLVMValueRef v11,
1478                  unsigned flags)
1479 {
1480    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1481    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1482    return lp_build_lerp(bld, y, v0, v1, flags);
1483 }
1484 
1485 
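/**
 * Trilinear interpolation.
 *
 * Value indices are in v_{zyx}, by analogy with lp_build_lerp_2d above.
 */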
1486 LLVMValueRef
1487 lp_build_lerp_3d(struct lp_build_context *bld,
1488                  LLVMValueRef x,
1489                  LLVMValueRef y,
1490                  LLVMValueRef z,
1491                  LLVMValueRef v000,
1492                  LLVMValueRef v001,
1493                  LLVMValueRef v010,
1494                  LLVMValueRef v011,
1495                  LLVMValueRef v100,
1496                  LLVMValueRef v101,
1497                  LLVMValueRef v110,
1498                  LLVMValueRef v111,
1499                  unsigned flags)
1500 {
1501    LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1502    LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1503    return lp_build_lerp(bld, z, v0, v1, flags);
1504 }
1505 
1506 
1507 /**
1508  * Generate min(a, b)
1509  * Do checks for special cases, but not for NaNs.
1510  */
1511 LLVMValueRef
1512 lp_build_min(struct lp_build_context *bld,
1513              LLVMValueRef a,
1514              LLVMValueRef b)
1515 {
1516    assert(lp_check_value(bld->type, a));
1517    assert(lp_check_value(bld->type, b));
1518 
1519    if (a == bld->undef || b == bld->undef)
1520       return bld->undef;
1521 
1522    if (a == b)
1523       return a;
1524 
1525    if (bld->type.norm) {
1526       if (!bld->type.sign) {
1527          if (a == bld->zero || b == bld->zero) {
1528             return bld->zero;
1529          }
1530       }
1531       if (a == bld->one)
1532          return b;
1533       if (b == bld->one)
1534          return a;
1535    }
1536 
1537    return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1538 }
1539 
1540 
1541 /**
1542  * Generate min(a, b)
1543  * NaNs are handled according to the behavior specified by the
1544  * nan_behavior argument.
1545  */
1546 LLVMValueRef
1547 lp_build_min_ext(struct lp_build_context *bld,
1548                  LLVMValueRef a,
1549                  LLVMValueRef b,
1550                  enum gallivm_nan_behavior nan_behavior)
1551 {
1552    assert(lp_check_value(bld->type, a));
1553    assert(lp_check_value(bld->type, b));
1554 
1555    if (a == bld->undef || b == bld->undef)
1556       return bld->undef;
1557 
1558    if (a == b)
1559       return a;
1560 
1561    if (bld->type.norm) {
1562       if (!bld->type.sign) {
1563          if (a == bld->zero || b == bld->zero) {
1564             return bld->zero;
1565          }
1566       }
1567       if (a == bld->one)
1568          return b;
1569       if (b == bld->one)
1570          return a;
1571    }
1572 
1573    return lp_build_min_simple(bld, a, b, nan_behavior);
1574 }
1575 
1576 
1577 /**
1578  * Generate max(a, b)
1579  * Do checks for special cases, but NaN behavior is undefined.
1580  */
1581 LLVMValueRef
1582 lp_build_max(struct lp_build_context *bld,
1583              LLVMValueRef a,
1584              LLVMValueRef b)
1585 {
1586    assert(lp_check_value(bld->type, a));
1587    assert(lp_check_value(bld->type, b));
1588 
1589    if (a == bld->undef || b == bld->undef)
1590       return bld->undef;
1591 
1592    if (a == b)
1593       return a;
1594 
1595    if (bld->type.norm) {
1596       if (a == bld->one || b == bld->one)
1597          return bld->one;
1598       if (!bld->type.sign) {
1599          if (a == bld->zero) {
1600             return b;
1601          }
1602          if (b == bld->zero) {
1603             return a;
1604          }
1605       }
1606    }
1607 
1608    return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1609 }
1610 
1611 
1612 /**
1613  * Generate max(a, b)
1614  * Checks for special cases.
1615  * NaNs are handled according to the behavior specified by the
1616  * nan_behavior argument.
1617  */
1618 LLVMValueRef
1619 lp_build_max_ext(struct lp_build_context *bld,
1620                   LLVMValueRef a,
1621                   LLVMValueRef b,
1622                   enum gallivm_nan_behavior nan_behavior)
1623 {
1624    assert(lp_check_value(bld->type, a));
1625    assert(lp_check_value(bld->type, b));
1626 
1627    if (a == bld->undef || b == bld->undef)
1628       return bld->undef;
1629 
1630    if (a == b)
1631       return a;
1632 
1633    if (bld->type.norm) {
1634       if (a == bld->one || b == bld->one)
1635          return bld->one;
1636       if (!bld->type.sign) {
1637          if (a == bld->zero) {
1638             return b;
1639          }
1640          if (b == bld->zero) {
1641             return a;
1642          }
1643       }
1644    }
1645 
1646    return lp_build_max_simple(bld, a, b, nan_behavior);
1647 }
1648 
1649 
1650 /**
1651  * Generate clamp(a, min, max)
1652  * NaN behavior (for any of a, min, max) is undefined.
1653  * Do checks for special cases.
1654  */
1655 LLVMValueRef
1656 lp_build_clamp(struct lp_build_context *bld,
1657                LLVMValueRef a,
1658                LLVMValueRef min,
1659                LLVMValueRef max)
1660 {
1661    assert(lp_check_value(bld->type, a));
1662    assert(lp_check_value(bld->type, min));
1663    assert(lp_check_value(bld->type, max));
1664 
1665    a = lp_build_min(bld, a, max);
1666    a = lp_build_max(bld, a, min);
1667    return a;
1668 }
1669 
1670 
1671 /**
1672  * Generate clamp(a, 0, 1)
1673  * A NaN will get converted to zero.
1674  */
1675 LLVMValueRef
1676 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1677                                 LLVMValueRef a)
1678 {
1679    a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1680    a = lp_build_min(bld, a, bld->one);
1681    return a;
1682 }
1683 
1684 
1685 /**
1686  * Generate abs(a)
1687  */
1688 LLVMValueRef
1689 lp_build_abs(struct lp_build_context *bld,
1690              LLVMValueRef a)
1691 {
1692    LLVMBuilderRef builder = bld->gallivm->builder;
1693    const struct lp_type type = bld->type;
1694    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1695 
1696    assert(lp_check_value(type, a));
1697 
1698    if (!type.sign)
1699       return a;
1700 
1701    if (type.floating) {
1702       char intrinsic[32];
1703       lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1704       return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1705    }
1706 
1707    if (type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1708       switch(type.width) {
1709       case 8:
1710          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1711       case 16:
1712          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1713       case 32:
1714          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1715       }
1716    }
1717    else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
1718       switch(type.width) {
1719       case 8:
1720          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1721       case 16:
1722          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1723       case 32:
1724          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1725       }
1726    }
1727 
1728    return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1729                           a, LLVMBuildNeg(builder, a, ""));
1730 }
1731 
1732 
1733 LLVMValueRef
1734 lp_build_negate(struct lp_build_context *bld,
1735                 LLVMValueRef a)
1736 {
1737    LLVMBuilderRef builder = bld->gallivm->builder;
1738 
1739    assert(lp_check_value(bld->type, a));
1740 
1741    if (bld->type.floating)
1742       a = LLVMBuildFNeg(builder, a, "");
1743    else
1744       a = LLVMBuildNeg(builder, a, "");
1745 
1746    return a;
1747 }
1748 
1749 
1750 /** Return -1, 0 or +1 depending on the sign of a */
1751 LLVMValueRef
1752 lp_build_sgn(struct lp_build_context *bld,
1753              LLVMValueRef a)
1754 {
1755    LLVMBuilderRef builder = bld->gallivm->builder;
1756    const struct lp_type type = bld->type;
1757    LLVMValueRef cond;
1758    LLVMValueRef res;
1759 
1760    assert(lp_check_value(type, a));
1761 
1762    /* Handle non-zero case */
1763    if (!type.sign) {
1764       /* if not zero then sign must be positive */
1765       res = bld->one;
1766    }
1767    else if (type.floating) {
1768       LLVMTypeRef vec_type;
1769       LLVMTypeRef int_type;
1770       LLVMValueRef mask;
1771       LLVMValueRef sign;
1772       LLVMValueRef one;
1773       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1774 
1775       int_type = lp_build_int_vec_type(bld->gallivm, type);
1776       vec_type = lp_build_vec_type(bld->gallivm, type);
1777       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1778 
1779       /* Take the sign bit and add it to 1 constant */
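      /* (For 32-bit floats, e.g. a = -2.5f: sign = 0x80000000, one = 0x3f800000,
       * so res = 0xbf800000 = -1.0f; for positive a the sign bit is 0 and res
       * stays +1.0f.) */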
1780       sign = LLVMBuildBitCast(builder, a, int_type, "");
1781       sign = LLVMBuildAnd(builder, sign, mask, "");
1782       one = LLVMConstBitCast(bld->one, int_type);
1783       res = LLVMBuildOr(builder, sign, one, "");
1784       res = LLVMBuildBitCast(builder, res, vec_type, "");
1785    }
1786    else
1787    {
1788       /* signed int/norm/fixed point */
1789       /* could use psign with sse3 and appropriate vectors here */
1790       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1791       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1792       res = lp_build_select(bld, cond, bld->one, minus_one);
1793    }
1794 
1795    /* Handle zero */
1796    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1797    res = lp_build_select(bld, cond, bld->zero, res);
1798 
1799    return res;
1800 }
1801 
1802 
1803 /**
1804  * Set the sign of float vector 'a' according to 'sign'.
1805  * If sign==0, return abs(a).
1806  * If sign==1, return -abs(a).
1807  * Other values for sign produce undefined results.
1808  */
1809 LLVMValueRef
1810 lp_build_set_sign(struct lp_build_context *bld,
1811                   LLVMValueRef a, LLVMValueRef sign)
1812 {
1813    LLVMBuilderRef builder = bld->gallivm->builder;
1814    const struct lp_type type = bld->type;
1815    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1816    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1817    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1818    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1819                              ~((unsigned long long) 1 << (type.width - 1)));
1820    LLVMValueRef val, res;
1821 
1822    assert(type.floating);
1823    assert(lp_check_value(type, a));
1824 
1825    /* val = reinterpret_cast<int>(a) */
1826    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1827    /* val = val & mask */
1828    val = LLVMBuildAnd(builder, val, mask, "");
1829    /* sign = sign << shift */
1830    sign = LLVMBuildShl(builder, sign, shift, "");
1831    /* res = val | sign */
1832    res = LLVMBuildOr(builder, val, sign, "");
1833    /* res = reinterpret_cast<float>(res) */
1834    res = LLVMBuildBitCast(builder, res, vec_type, "");
1835 
1836    return res;
1837 }
1838 
1839 
1840 /**
1841  * Convert vector of (or scalar) int to vector of (or scalar) float.
1842  */
1843 LLVMValueRef
1844 lp_build_int_to_float(struct lp_build_context *bld,
1845                       LLVMValueRef a)
1846 {
1847    LLVMBuilderRef builder = bld->gallivm->builder;
1848    const struct lp_type type = bld->type;
1849    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1850 
1851    assert(type.floating);
1852 
1853    return LLVMBuildSIToFP(builder, a, vec_type, "");
1854 }
1855 
1856 
1857 static boolean
1858 arch_rounding_available(const struct lp_type type)
1859 {
1860    if ((util_get_cpu_caps()->has_sse4_1 &&
1861        (type.length == 1 || type.width*type.length == 128)) ||
1862        (util_get_cpu_caps()->has_avx && type.width*type.length == 256) ||
1863        (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512))
1864       return TRUE;
1865    else if ((util_get_cpu_caps()->has_altivec &&
1866             (type.width == 32 && type.length == 4)))
1867       return TRUE;
1868    else if (util_get_cpu_caps()->has_neon)
1869       return TRUE;
1870    else if (util_get_cpu_caps()->family == CPU_S390X)
1871       return TRUE;
1872 
1873    return FALSE;
1874 }
1875 
1876 enum lp_build_round_mode
1877 {
1878    LP_BUILD_ROUND_NEAREST = 0,
1879    LP_BUILD_ROUND_FLOOR = 1,
1880    LP_BUILD_ROUND_CEIL = 2,
1881    LP_BUILD_ROUND_TRUNCATE = 3
1882 };
1883 
1884 
1885 static inline LLVMValueRef
1886 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1887                              LLVMValueRef a)
1888 {
1889    LLVMBuilderRef builder = bld->gallivm->builder;
1890    const struct lp_type type = bld->type;
1891    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1892    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1893    const char *intrinsic;
1894    LLVMValueRef res;
1895 
1896    assert(type.floating);
1897    /* using the double precision conversions is a bit more complicated */
1898    assert(type.width == 32);
1899 
1900    assert(lp_check_value(type, a));
1901    assert(util_get_cpu_caps()->has_sse2);
1902 
1903    /* This is relying on MXCSR rounding mode, which should always be nearest. */
1904    if (type.length == 1) {
1905       LLVMTypeRef vec_type;
1906       LLVMValueRef undef;
1907       LLVMValueRef arg;
1908       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1909 
1910       vec_type = LLVMVectorType(bld->elem_type, 4);
1911 
1912       intrinsic = "llvm.x86.sse.cvtss2si";
1913 
1914       undef = LLVMGetUndef(vec_type);
1915 
1916       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1917 
1918       res = lp_build_intrinsic_unary(builder, intrinsic,
1919                                      ret_type, arg);
1920    }
1921    else {
1922       if (type.width* type.length == 128) {
1923          intrinsic = "llvm.x86.sse2.cvtps2dq";
1924       }
1925       else {
1926          assert(type.width*type.length == 256);
1927          assert(util_get_cpu_caps()->has_avx);
1928 
1929          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1930       }
1931       res = lp_build_intrinsic_unary(builder, intrinsic,
1932                                      ret_type, a);
1933    }
1934 
1935    return res;
1936 }
1937 
1938 
1939 /* Round a float vector to integral values, using the AltiVec
1940  * vrfi{n,m,p,z} intrinsics according to the requested rounding mode. */
1941 static inline LLVMValueRef
1942 lp_build_round_altivec(struct lp_build_context *bld,
1943                        LLVMValueRef a,
1944                        enum lp_build_round_mode mode)
1945 {
1946    LLVMBuilderRef builder = bld->gallivm->builder;
1947    const struct lp_type type = bld->type;
1948    const char *intrinsic = NULL;
1949 
1950    assert(type.floating);
1951 
1952    assert(lp_check_value(type, a));
1953    assert(util_get_cpu_caps()->has_altivec);
1954 
1955    (void)type;
1956 
1957    switch (mode) {
1958    case LP_BUILD_ROUND_NEAREST:
1959       intrinsic = "llvm.ppc.altivec.vrfin";
1960       break;
1961    case LP_BUILD_ROUND_FLOOR:
1962       intrinsic = "llvm.ppc.altivec.vrfim";
1963       break;
1964    case LP_BUILD_ROUND_CEIL:
1965       intrinsic = "llvm.ppc.altivec.vrfip";
1966       break;
1967    case LP_BUILD_ROUND_TRUNCATE:
1968       intrinsic = "llvm.ppc.altivec.vrfiz";
1969       break;
1970    }
1971 
1972    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1973 }
1974 
1975 
1976 static inline LLVMValueRef
1977 lp_build_round_arch(struct lp_build_context *bld,
1978                     LLVMValueRef a,
1979                     enum lp_build_round_mode mode)
1980 {
1981    if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon ||
1982        util_get_cpu_caps()->family == CPU_S390X) {
1983       LLVMBuilderRef builder = bld->gallivm->builder;
1984       const struct lp_type type = bld->type;
1985       const char *intrinsic_root;
1986       char intrinsic[32];
1987 
1988       assert(type.floating);
1989       assert(lp_check_value(type, a));
1990       (void)type;
1991 
1992       switch (mode) {
1993       case LP_BUILD_ROUND_NEAREST:
1994          intrinsic_root = "llvm.nearbyint";
1995          break;
1996       case LP_BUILD_ROUND_FLOOR:
1997          intrinsic_root = "llvm.floor";
1998          break;
1999       case LP_BUILD_ROUND_CEIL:
2000          intrinsic_root = "llvm.ceil";
2001          break;
2002       case LP_BUILD_ROUND_TRUNCATE:
2003          intrinsic_root = "llvm.trunc";
2004          break;
2005       default:
2006          unreachable("unhandled lp_build_round_mode");
2007       }
2008 
2009       lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2010       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2011    }
2012    else /* (util_get_cpu_caps()->has_altivec) */
2013      return lp_build_round_altivec(bld, a, mode);
2014 }
2015 
2016 
2017 /**
2018  * Return the integer part of a float (vector) value (== round toward zero).
2019  * The returned value is a float (vector).
2020  * Ex: trunc(-1.5) = -1.0
2021  */
2022 LLVMValueRef
2023 lp_build_trunc(struct lp_build_context *bld,
2024                LLVMValueRef a)
2025 {
2026    LLVMBuilderRef builder = bld->gallivm->builder;
2027    const struct lp_type type = bld->type;
2028 
2029    assert(type.floating);
2030    assert(lp_check_value(type, a));
2031 
2032    if (type.width == 16) {
2033       char intrinsic[64];
2034       lp_format_intrinsic(intrinsic, 64, "llvm.trunc", bld->vec_type);
2035       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2036    }
2037 
2038    if (arch_rounding_available(type)) {
2039       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2040    }
2041    else {
2042       const struct lp_type type = bld->type;
2043       struct lp_type inttype;
2044       struct lp_build_context intbld;
2045       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2046       LLVMValueRef trunc, res, anosign, mask;
2047       LLVMTypeRef int_vec_type = bld->int_vec_type;
2048       LLVMTypeRef vec_type = bld->vec_type;
2049 
2050       inttype = type;
2051       inttype.floating = 0;
2052       lp_build_context_init(&intbld, bld->gallivm, inttype);
2053 
2054       /* round by truncation */
2055       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2056       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2057 
2058       /* mask out sign bit */
2059       anosign = lp_build_abs(bld, a);
2060       /*
2061        * mask out all values if anosign > 2^24
2062        * This should work both for large ints (all rounding is no-op for them
2063        * because such floats are always exact) as well as special cases like
2064        * NaNs, Infs (taking advantage of the fact they use max exponent).
2065        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2066        */
2067       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2068       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2069       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2070       return lp_build_select(bld, mask, a, res);
2071    }
2072 }
2073 
2074 
2075 /**
2076  * Return float (vector) rounded to nearest integer (vector).  The returned
2077  * value is a float (vector).
2078  * Ex: round(0.9) = 1.0
2079  * Ex: round(-1.5) = -2.0
2080  */
2081 LLVMValueRef
2082 lp_build_round(struct lp_build_context *bld,
2083                LLVMValueRef a)
2084 {
2085    LLVMBuilderRef builder = bld->gallivm->builder;
2086    const struct lp_type type = bld->type;
2087 
2088    assert(type.floating);
2089    assert(lp_check_value(type, a));
2090 
2091    if (type.width == 16) {
2092       char intrinsic[64];
2093       lp_format_intrinsic(intrinsic, 64, "llvm.round", bld->vec_type);
2094       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2095    }
2096 
2097    if (arch_rounding_available(type)) {
2098       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2099    }
2100    else {
2101       const struct lp_type type = bld->type;
2102       struct lp_type inttype;
2103       struct lp_build_context intbld;
2104       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2105       LLVMValueRef res, anosign, mask;
2106       LLVMTypeRef int_vec_type = bld->int_vec_type;
2107       LLVMTypeRef vec_type = bld->vec_type;
2108 
2109       inttype = type;
2110       inttype.floating = 0;
2111       lp_build_context_init(&intbld, bld->gallivm, inttype);
2112 
2113       res = lp_build_iround(bld, a);
2114       res = LLVMBuildSIToFP(builder, res, vec_type, "");
2115 
2116       /* mask out sign bit */
2117       anosign = lp_build_abs(bld, a);
2118       /*
2119        * mask out all values if anosign > 2^24
2120        * This should work both for large ints (all rounding is no-op for them
2121        * because such floats are always exact) as well as special cases like
2122        * NaNs, Infs (taking advantage of the fact they use max exponent).
2123        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2124        */
2125       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2126       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2127       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2128       return lp_build_select(bld, mask, a, res);
2129    }
2130 }
2131 
2132 
2133 /**
2134  * Return floor of float (vector), result is a float (vector)
2135  * Ex: floor(1.1) = 1.0
2136  * Ex: floor(-1.1) = -2.0
2137  */
2138 LLVMValueRef
2139 lp_build_floor(struct lp_build_context *bld,
2140                LLVMValueRef a)
2141 {
2142    LLVMBuilderRef builder = bld->gallivm->builder;
2143    const struct lp_type type = bld->type;
2144 
2145    assert(type.floating);
2146    assert(lp_check_value(type, a));
2147 
2148    if (arch_rounding_available(type)) {
2149       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2150    }
2151    else {
2152       const struct lp_type type = bld->type;
2153       struct lp_type inttype;
2154       struct lp_build_context intbld;
2155       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2156       LLVMValueRef trunc, res, anosign, mask;
2157       LLVMTypeRef int_vec_type = bld->int_vec_type;
2158       LLVMTypeRef vec_type = bld->vec_type;
2159 
2160       if (type.width != 32) {
2161          char intrinsic[32];
2162          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2163          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2164       }
2165 
2166       assert(type.width == 32); /* might want to handle doubles at some point */
2167 
2168       inttype = type;
2169       inttype.floating = 0;
2170       lp_build_context_init(&intbld, bld->gallivm, inttype);
2171 
2172       /* round by truncation */
2173       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2174       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2175 
2176       if (type.sign) {
2177          LLVMValueRef tmp;
2178 
2179          /*
2180           * fix values if rounding is wrong (for non-special cases)
2181           * - this is the case if trunc > a
2182           */
2183          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2184          /* tmp = trunc > a ? 1.0 : 0.0 */
2185          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2186          tmp = lp_build_and(&intbld, mask, tmp);
2187          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2188          res = lp_build_sub(bld, res, tmp);
2189       }
2190 
2191       /* mask out sign bit */
2192       anosign = lp_build_abs(bld, a);
2193       /*
2194        * mask out all values if anosign > 2^24
2195        * This should work both for large ints (all rounding is no-op for them
2196        * because such floats are always exact) as well as special cases like
2197        * NaNs, Infs (taking advantage of the fact they use max exponent).
2198        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2199        */
2200       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2201       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2202       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2203       return lp_build_select(bld, mask, a, res);
2204    }
2205 }
2206 
2207 
2208 /**
2209  * Return ceiling of float (vector), returning float (vector).
2210  * Ex: ceil( 1.1) = 2.0
2211  * Ex: ceil(-1.1) = -1.0
2212  */
2213 LLVMValueRef
2214 lp_build_ceil(struct lp_build_context *bld,
2215               LLVMValueRef a)
2216 {
2217    LLVMBuilderRef builder = bld->gallivm->builder;
2218    const struct lp_type type = bld->type;
2219 
2220    assert(type.floating);
2221    assert(lp_check_value(type, a));
2222 
2223    if (arch_rounding_available(type)) {
2224       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2225    }
2226    else {
2227       const struct lp_type type = bld->type;
2228       struct lp_type inttype;
2229       struct lp_build_context intbld;
2230       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2231       LLVMValueRef trunc, res, anosign, mask, tmp;
2232       LLVMTypeRef int_vec_type = bld->int_vec_type;
2233       LLVMTypeRef vec_type = bld->vec_type;
2234 
2235       if (type.width != 32) {
2236          char intrinsic[32];
2237          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2238          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2239       }
2240 
2241       assert(type.width == 32); /* might want to handle doubles at some point */
2242 
2243       inttype = type;
2244       inttype.floating = 0;
2245       lp_build_context_init(&intbld, bld->gallivm, inttype);
2246 
2247       /* round by truncation */
2248       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2249       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2250 
2251       /*
2252        * fix values if rounding is wrong (for non-special cases)
2253        * - this is the case if trunc < a
2254        */
2255       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2256       /* tmp = trunc < a ? 1.0 : 0.0 */
2257       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2258       tmp = lp_build_and(&intbld, mask, tmp);
2259       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2260       res = lp_build_add(bld, trunc, tmp);
2261 
2262       /* mask out sign bit */
2263       anosign = lp_build_abs(bld, a);
2264       /*
2265        * mask out all values if anosign > 2^24
2266        * This should work both for large ints (all rounding is no-op for them
2267        * because such floats are always exact) as well as special cases like
2268        * NaNs, Infs (taking advantage of the fact they use max exponent).
2269        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2270        */
2271       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2272       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2273       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2274       return lp_build_select(bld, mask, a, res);
2275    }
2276 }
2277 
2278 
2279 /**
2280  * Return fractional part of 'a' computed as a - floor(a)
2281  * Typically used in texture coord arithmetic.
2282  */
2283 LLVMValueRef
2284 lp_build_fract(struct lp_build_context *bld,
2285                LLVMValueRef a)
2286 {
2287    assert(bld->type.floating);
2288    return lp_build_sub(bld, a, lp_build_floor(bld, a));
2289 }
2290 
2291 
2292 /**
2293  * Prevent returning 1.0 for very small negative values of 'a' by clamping
2294  * against 0.99999(9). (Will also return that value for NaNs.)
2295  */
2296 static inline LLVMValueRef
2297 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2298 {
2299    LLVMValueRef max;
2300 
2301    /* this is the largest number smaller than 1.0 representable as float */
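   /* (For 32-bit floats this is 1.0 - 2^-24, i.e. 0x3f7fffff = 0.99999994f.) */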
2302    max = lp_build_const_vec(bld->gallivm, bld->type,
2303                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2304    return lp_build_min_ext(bld, fract, max,
2305                            GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2306 }
2307 
2308 
2309 /**
2310  * Same as lp_build_fract, but guarantees that the result is always smaller
2311  * than one. Will also return the smaller-than-one value for infs, NaNs.
2312  */
2313 LLVMValueRef
2314 lp_build_fract_safe(struct lp_build_context *bld,
2315                     LLVMValueRef a)
2316 {
2317    return clamp_fract(bld, lp_build_fract(bld, a));
2318 }
2319 
2320 
2321 /**
2322  * Return the integer part of a float (vector) value (== round toward zero).
2323  * The returned value is an integer (vector).
2324  * Ex: itrunc(-1.5) = -1
2325  */
2326 LLVMValueRef
2327 lp_build_itrunc(struct lp_build_context *bld,
2328                 LLVMValueRef a)
2329 {
2330    LLVMBuilderRef builder = bld->gallivm->builder;
2331    const struct lp_type type = bld->type;
2332    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2333 
2334    assert(type.floating);
2335    assert(lp_check_value(type, a));
2336 
2337    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2338 }
2339 
2340 
2341 /**
2342  * Return float (vector) rounded to nearest integer (vector).  The returned
2343  * value is an integer (vector).
2344  * Ex: iround(0.9) = 1
2345  * Ex: iround(-1.5) = -2
2346  */
2347 LLVMValueRef
2348 lp_build_iround(struct lp_build_context *bld,
2349                 LLVMValueRef a)
2350 {
2351    LLVMBuilderRef builder = bld->gallivm->builder;
2352    const struct lp_type type = bld->type;
2353    LLVMTypeRef int_vec_type = bld->int_vec_type;
2354    LLVMValueRef res;
2355 
2356    assert(type.floating);
2357 
2358    assert(lp_check_value(type, a));
2359 
2360    if ((util_get_cpu_caps()->has_sse2 &&
2361        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2362        (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2363       return lp_build_iround_nearest_sse2(bld, a);
2364    }
2365    if (arch_rounding_available(type)) {
2366       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2367    }
2368    else {
2369       LLVMValueRef half;
2370 
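      /* Fallback: add +/-0.5 (taking the sign of a) and truncate. The constant
       * is the float just below 0.5, presumably so that values slightly below
       * a half-integer are not pushed across the boundary by the addition. */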
2371       half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2372 
2373       if (type.sign) {
2374          LLVMTypeRef vec_type = bld->vec_type;
2375          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2376                                     (unsigned long long)1 << (type.width - 1));
2377          LLVMValueRef sign;
2378 
2379          /* get sign bit */
2380          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2381          sign = LLVMBuildAnd(builder, sign, mask, "");
2382 
2383          /* sign * 0.5 */
2384          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2385          half = LLVMBuildOr(builder, sign, half, "");
2386          half = LLVMBuildBitCast(builder, half, vec_type, "");
2387       }
2388 
2389       res = LLVMBuildFAdd(builder, a, half, "");
2390    }
2391 
2392    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2393 
2394    return res;
2395 }
2396 
2397 
2398 /**
2399  * Return floor of float (vector), result is an int (vector)
2400  * Ex: ifloor(1.1) = 1
2401  * Ex: ifloor(-1.1) = -2
2402  */
2403 LLVMValueRef
2404 lp_build_ifloor(struct lp_build_context *bld,
2405                 LLVMValueRef a)
2406 {
2407    LLVMBuilderRef builder = bld->gallivm->builder;
2408    const struct lp_type type = bld->type;
2409    LLVMTypeRef int_vec_type = bld->int_vec_type;
2410    LLVMValueRef res;
2411 
2412    assert(type.floating);
2413    assert(lp_check_value(type, a));
2414 
2415    res = a;
2416    if (type.sign) {
2417       if (arch_rounding_available(type)) {
2418          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2419       }
2420       else {
2421          struct lp_type inttype;
2422          struct lp_build_context intbld;
2423          LLVMValueRef trunc, itrunc, mask;
2424 
2425          assert(type.floating);
2426          assert(lp_check_value(type, a));
2427 
2428          inttype = type;
2429          inttype.floating = 0;
2430          lp_build_context_init(&intbld, bld->gallivm, inttype);
2431 
2432          /* round by truncation */
2433          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2434          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2435 
2436          /*
2437           * fix values if rounding is wrong (for non-special cases)
2438           * - this is the case if trunc > a
2439           * The results of doing this with NaNs, very large values etc.
2440           * are undefined but this seems to be the case anyway.
2441           */
2442          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2443          /* cheapie minus one with mask since the mask is minus one / zero */
2444          return lp_build_add(&intbld, itrunc, mask);
2445       }
2446    }
2447 
2448    /* convert to int, rounding toward zero (truncate) */
2449    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2450 
2451    return res;
2452 }
2453 
2454 
2455 /**
2456  * Return ceiling of float (vector), returning int (vector).
2457  * Ex: iceil( 1.1) = 2
2458  * Ex: iceil(-1.1) = -1
2459  */
2460 LLVMValueRef
2461 lp_build_iceil(struct lp_build_context *bld,
2462                LLVMValueRef a)
2463 {
2464    LLVMBuilderRef builder = bld->gallivm->builder;
2465    const struct lp_type type = bld->type;
2466    LLVMTypeRef int_vec_type = bld->int_vec_type;
2467    LLVMValueRef res;
2468 
2469    assert(type.floating);
2470    assert(lp_check_value(type, a));
2471 
2472    if (arch_rounding_available(type)) {
2473       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2474    }
2475    else {
2476       struct lp_type inttype;
2477       struct lp_build_context intbld;
2478       LLVMValueRef trunc, itrunc, mask;
2479 
2480       assert(type.floating);
2481       assert(lp_check_value(type, a));
2482 
2483       inttype = type;
2484       inttype.floating = 0;
2485       lp_build_context_init(&intbld, bld->gallivm, inttype);
2486 
2487       /* round by truncation */
2488       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2489       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2490 
2491       /*
2492        * fix values if rounding is wrong (for non-special cases)
2493        * - this is the case if trunc < a
2494        * The results of doing this with NaNs, very large values etc.
2495        * are undefined but this seems to be the case anyway.
2496        */
2497       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2498       /* cheapie plus one with mask since the mask is minus one / zero */
2499       return lp_build_sub(&intbld, itrunc, mask);
2500    }
2501 
2502    /* convert to int, rounding toward zero (truncate) */
2503    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2504 
2505    return res;
2506 }
2507 
2508 
2509 /**
2510  * Combined ifloor() & fract().
2511  *
2512  * Preferable to calling the functions separately, as it will ensure that
2513  * the strategy (floor() vs. ifloor()) with the least redundant work is used.
2514  */
2515 void
2516 lp_build_ifloor_fract(struct lp_build_context *bld,
2517                       LLVMValueRef a,
2518                       LLVMValueRef *out_ipart,
2519                       LLVMValueRef *out_fpart)
2520 {
2521    LLVMBuilderRef builder = bld->gallivm->builder;
2522    const struct lp_type type = bld->type;
2523    LLVMValueRef ipart;
2524 
2525    assert(type.floating);
2526    assert(lp_check_value(type, a));
2527 
2528    if (arch_rounding_available(type)) {
2529       /*
2530        * floor() is easier.
2531        */
2532 
2533       ipart = lp_build_floor(bld, a);
2534       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2535       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2536    }
2537    else {
2538       /*
2539        * ifloor() is easier.
2540        */
2541 
2542       *out_ipart = lp_build_ifloor(bld, a);
2543       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2544       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2545    }
2546 }
2547 
2548 
2549 /**
2550  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2551  * always smaller than one.
2552  */
2553 void
2554 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2555                            LLVMValueRef a,
2556                            LLVMValueRef *out_ipart,
2557                            LLVMValueRef *out_fpart)
2558 {
2559    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2560    *out_fpart = clamp_fract(bld, *out_fpart);
2561 }
2562 
2563 
2564 LLVMValueRef
2565 lp_build_sqrt(struct lp_build_context *bld,
2566               LLVMValueRef a)
2567 {
2568    LLVMBuilderRef builder = bld->gallivm->builder;
2569    const struct lp_type type = bld->type;
2570    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2571    char intrinsic[32];
2572 
2573    assert(lp_check_value(type, a));
2574 
2575    assert(type.floating);
2576    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2577 
2578    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2579 }
2580 
2581 
2582 /**
2583  * Do one Newton-Raphson step to improve the reciprocal's precision:
2584  *
2585  *   x_{i+1} = x_i + x_i * (1 - a * x_i)
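 *   (i.e. one Newton iteration of f(x) = 1/x - a: x_{i+1} = x_i - f(x_i)/f'(x_i))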
2586  *
2587  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2588  * +/-Inf, yielding NaN instead.  Certain applications rely on the conformant
2589  * behavior, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing
2590  * the Earth's halo. It would be necessary to clamp the argument to prevent this.
2591  *
2592  * See also:
2593  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2594  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2595  */
2596 static inline LLVMValueRef
2597 lp_build_rcp_refine(struct lp_build_context *bld,
2598                     LLVMValueRef a,
2599                     LLVMValueRef rcp_a)
2600 {
2601    LLVMBuilderRef builder = bld->gallivm->builder;
2602    LLVMValueRef neg_a;
2603    LLVMValueRef res;
2604 
2605    neg_a = LLVMBuildFNeg(builder, a, "");
2606    res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2607    res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2608 
2609    return res;
2610 }
2611 
2612 
2613 LLVMValueRef
2614 lp_build_rcp(struct lp_build_context *bld,
2615              LLVMValueRef a)
2616 {
2617    LLVMBuilderRef builder = bld->gallivm->builder;
2618    const struct lp_type type = bld->type;
2619 
2620    assert(lp_check_value(type, a));
2621 
2622    if (a == bld->zero)
2623       return bld->undef;
2624    if (a == bld->one)
2625       return bld->one;
2626    if (a == bld->undef)
2627       return bld->undef;
2628 
2629    assert(type.floating);
2630 
2631    if (LLVMIsConstant(a))
2632       return LLVMBuildFDiv(builder, bld->one, a, "");
2633 
2634    /*
2635     * We don't use RCPPS because:
2636     * - it only has 10 bits of precision
2637     * - it doesn't even get the reciprocal of 1.0 exactly
2638     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2639     * - for recent processors the benefit over DIVPS is marginal, and case
2640     *   dependent
2641     *
2642     * We could still use it on certain processors if benchmarks show that the
2643     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2644     * particular uses that require less workarounds.
2645     */
2646 
2647    if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2648          (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
2649       const unsigned num_iterations = 0;
2650       LLVMValueRef res;
2651       unsigned i;
2652       const char *intrinsic = NULL;
2653 
2654       if (type.length == 4) {
2655          intrinsic = "llvm.x86.sse.rcp.ps";
2656       }
2657       else {
2658          intrinsic = "llvm.x86.avx.rcp.ps.256";
2659       }
2660 
2661       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2662 
2663       for (i = 0; i < num_iterations; ++i) {
2664          res = lp_build_rcp_refine(bld, a, res);
2665       }
2666 
2667       return res;
2668    }
2669 
2670    return LLVMBuildFDiv(builder, bld->one, a, "");
2671 }
2672 
2673 
2674 /**
2675  * Do one Newton-Raphson step to improve rsqrt precision:
2676  *
2677  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
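 *   (i.e. one Newton iteration of f(x) = 1/x^2 - a: x_{i+1} = x_i - f(x_i)/f'(x_i))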
2678  *
2679  * See also Intel 64 and IA-32 Architectures Optimization Manual.
2680  */
2681 static inline LLVMValueRef
2682 lp_build_rsqrt_refine(struct lp_build_context *bld,
2683                       LLVMValueRef a,
2684                       LLVMValueRef rsqrt_a)
2685 {
2686    LLVMBuilderRef builder = bld->gallivm->builder;
2687    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2688    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2689    LLVMValueRef res;
2690 
2691    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2692    res = LLVMBuildFMul(builder, a, res, "");
2693    res = LLVMBuildFSub(builder, three, res, "");
2694    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2695    res = LLVMBuildFMul(builder, half, res, "");
2696 
2697    return res;
2698 }
2699 
2700 
2701 /**
2702  * Generate 1/sqrt(a).
2703  * Result is undefined for values < 0, infinity for +0.
2704  */
2705 LLVMValueRef
2706 lp_build_rsqrt(struct lp_build_context *bld,
2707                LLVMValueRef a)
2708 {
2709    const struct lp_type type = bld->type;
2710 
2711    assert(lp_check_value(type, a));
2712 
2713    assert(type.floating);
2714 
2715    /*
2716     * This should be faster but all denormals will end up as infinity.
2717     */
2718    if (0 && lp_build_fast_rsqrt_available(type)) {
2719       const unsigned num_iterations = 1;
2720       LLVMValueRef res;
2721       unsigned i;
2722 
2723       /* rsqrt(1.0) != 1.0 here */
2724       res = lp_build_fast_rsqrt(bld, a);
2725 
2726       if (num_iterations) {
2727          /*
2728           * Newton-Raphson will result in NaN instead of infinity for zero,
2729           * and NaN instead of zero for infinity.
2730           * Also, need to ensure rsqrt(1.0) == 1.0.
2731           * All numbers smaller than FLT_MIN will result in +infinity
2732           * (rsqrtps treats all denormals as zero).
2733           */
2734          LLVMValueRef cmp;
2735          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2736          LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2737 
2738          for (i = 0; i < num_iterations; ++i) {
2739             res = lp_build_rsqrt_refine(bld, a, res);
2740          }
2741          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2742          res = lp_build_select(bld, cmp, inf, res);
2743          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2744          res = lp_build_select(bld, cmp, bld->zero, res);
2745          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2746          res = lp_build_select(bld, cmp, bld->one, res);
2747       }
2748 
2749       return res;
2750    }
2751 
2752    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2753 }
2754 
2755 
2756 /**
2757  * Returns whether a fast (but inaccurate) rsqrt instruction is available.
2758  * (The caller may want to avoid calling rsqrt_fast if it's not available:
2759  * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that has
2760  * to be emulated it would result in sqrt/div/mul, so it is obviously
2761  * better to just call sqrt directly, skipping both the div and the mul.)
2762  */
2763 boolean
2764 lp_build_fast_rsqrt_available(struct lp_type type)
2765 {
2766    assert(type.floating);
2767 
2768    if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2769        (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2770       return true;
2771    }
2772    return false;
2773 }
2774 
2775 
2776 /**
2777  * Generate 1/sqrt(a).
2778  * Result is undefined for values < 0, infinity for +0.
2779  * Precision is limited, only ~10 bits guaranteed
2780  * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2781  */
2782 LLVMValueRef
2783 lp_build_fast_rsqrt(struct lp_build_context *bld,
2784                     LLVMValueRef a)
2785 {
2786    LLVMBuilderRef builder = bld->gallivm->builder;
2787    const struct lp_type type = bld->type;
2788 
2789    assert(lp_check_value(type, a));
2790 
2791    if (lp_build_fast_rsqrt_available(type)) {
2792       const char *intrinsic = NULL;
2793 
2794       if (type.length == 4) {
2795          intrinsic = "llvm.x86.sse.rsqrt.ps";
2796       }
2797       else {
2798          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2799       }
2800       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2801    }
2802    else {
2803       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2804    }
2805    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2806 }
2807 
2808 
2809 /**
2810  * Generate sin(a) or cos(a) using polynomial approximation.
2811  * TODO: it might be worth recognizing sin and cos using same source
2812  * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2813  * would be way cheaper than calculating (nearly) everything twice...
2814  * Not sure it's common enough to be worth bothering however, scs
2815  * Not sure it's common enough to be worth bothering with, however; the scs
2816  * opcode could also benefit from calculating both.
2817 static LLVMValueRef
2818 lp_build_sin_or_cos(struct lp_build_context *bld,
2819                     LLVMValueRef a,
2820                     boolean cos)
2821 {
2822    struct gallivm_state *gallivm = bld->gallivm;
2823    LLVMBuilderRef b = gallivm->builder;
2824    struct lp_type int_type = lp_int_type(bld->type);
2825 
2826    /*
2827     *  take the absolute value,
2828     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2829     */
2830 
2831    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2832    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2833 
2834    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2835    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2836 
2837    /*
2838     * scale by 4/Pi
2839     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2840     */
2841 
2842    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2843    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2844 
2845    /*
2846     * store the integer part of y in mm0
2847     * emm2 = _mm_cvttps_epi32(y);
2848     */
2849 
2850    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2851 
2852    /*
2853     * j=(j+1) & (~1) (see the cephes sources)
2854     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2855     */
2856 
2857    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2858    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2859    /*
2860     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2861     */
2862    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2863    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2864 
2865    /*
2866     * y = _mm_cvtepi32_ps(emm2);
2867     */
2868    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2869 
2870    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2871    LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2872    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2873    LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2874 
2875    /*
2876     * Argument used for poly selection and sign bit determination
2877     * is different for sin vs. cos.
2878     */
2879    LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2880                                emm2_and;
2881 
2882    LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2883                                                               LLVMBuildNot(b, emm2_2, ""), ""),
2884                                               const_29, "sign_bit") :
2885                                  LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2886                                                               LLVMBuildShl(b, emm2_add,
2887                                                                            const_29, ""), ""),
2888                                               sign_mask, "sign_bit");
2889 
2890    /*
2891     * get the polynomial selection mask:
2892     * there is one polynomial for 0 <= x <= Pi/4
2893     * and another one for Pi/4 < x <= Pi/2.
2894     * Both branches will be computed.
2895     *
2896     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2897     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2898     */
2899 
2900    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2901    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2902                                              int_type, PIPE_FUNC_EQUAL,
2903                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2904 
2905    /*
2906     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2907     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2908     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2909     */
2910    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2911    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2912    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2913 
2914    /*
2915     * The magic pass: "Extended precision modular arithmetic"
2916     * x = ((x - y * DP1) - y * DP2) - y * DP3;
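    * (DP1 + DP2 + DP3 sum to approximately -Pi/4, so this effectively
    * computes x - y*Pi/4 with extra precision.)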
2917     */
2918    LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2919    LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2920    LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2921 
2922    /*
2923     * Evaluate the first polynomial  (0 <= x <= Pi/4)
2924     *
2925     * z = _mm_mul_ps(x,x);
2926     */
2927    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2928 
2929    /*
2930     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2931     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2932     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2933     */
2934    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2935    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2936    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2937 
2938    /*
2939     * y = *(v4sf*)_ps_coscof_p0;
2940     * y = _mm_mul_ps(y, z);
2941     */
2942    LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2943    LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2944    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2945    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2946 
2947 
2948    /*
2949     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2950     * y = _mm_sub_ps(y, tmp);
2951     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2952     */
2953    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2954    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2955    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2956    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2957    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
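   /*
    * At this point y_10 = 1 - z/2 + z^2*(coscof_p2 + z*coscof_p1 + z^2*coscof_p0),
    * i.e. the cosine approximation over the reduced range.
    */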
2958 
2959    /*
2960     * _PS_CONST(sincof_p0, -1.9515295891E-4);
2961     * _PS_CONST(sincof_p1,  8.3321608736E-3);
2962     * _PS_CONST(sincof_p2, -1.6666654611E-1);
2963     */
2964    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2965    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2966    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2967 
2968    /*
2969     * Evaluate the second polynomial  (Pi/4 <= x <= 0)
2970     *
2971     * y2 = *(v4sf*)_ps_sincof_p0;
2972     * y2 = _mm_mul_ps(y2, z);
2973     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2974     * y2 = _mm_mul_ps(y2, z);
2975     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2976     * y2 = _mm_mul_ps(y2, z);
2977     * y2 = _mm_mul_ps(y2, x);
2978     * y2 = _mm_add_ps(y2, x);
2979     */
2980 
2981    LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2982    LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2983    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2984    LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
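   /*
    * At this point y2_9 = x_3 + x_3*z*(sincof_p2 + z*sincof_p1 + z^2*sincof_p0),
    * i.e. the sine approximation over the reduced range.
    */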
2985 
2986    /*
2987     * Select the correct result from the two polynomials:
2988     * xmm3 = poly_mask;
2989     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2990     * y = _mm_andnot_ps(xmm3, y);
2991     * y = _mm_or_ps(y,y2);
2992     */
2993    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2994    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2995    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2996    LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2997    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2998    LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2999 
3000    /*
3001     * update the sign
3002     * y = _mm_xor_ps(y, sign_bit);
3003     */
3004    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3005    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3006 
3007    LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3008 
3009    /* clamp output to be within [-1, 1] */
3010    y_result = lp_build_clamp(bld, y_result,
3011                              lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
3012                              lp_build_const_vec(bld->gallivm, bld->type,  1.f));
3013    /* If a is -inf, inf or NaN then return NaN */
3014    y_result = lp_build_select(bld, isfinite, y_result,
3015                               lp_build_const_vec(bld->gallivm, bld->type,  NAN));
3016    return y_result;
3017 }
3018 
3019 
3020 /**
3021  * Generate sin(a)
3022  */
3023 LLVMValueRef
3024 lp_build_sin(struct lp_build_context *bld,
3025              LLVMValueRef a)
3026 {
3027    const struct lp_type type = bld->type;
3028 
3029    if (type.width == 16) {
3030       LLVMBuilderRef builder = bld->gallivm->builder;
3031       LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3032       char intrinsic[32];
3033       lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sin", vec_type);
3034       LLVMValueRef args[] = { a };
3035       return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3036    }
3037 
3038    return lp_build_sin_or_cos(bld, a, FALSE);
3039 }
3040 
3041 
3042 /**
3043  * Generate cos(a)
3044  */
3045 LLVMValueRef
3046 lp_build_cos(struct lp_build_context *bld,
3047              LLVMValueRef a)
3048 {
3049    const struct lp_type type = bld->type;
3050 
3051    if (type.width == 16) {
3052       LLVMBuilderRef builder = bld->gallivm->builder;
3053       LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3054       char intrinsic[32];
3055       lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.cos", vec_type);
3056       LLVMValueRef args[] = { a };
3057       return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3058    }
3059 
3060    return lp_build_sin_or_cos(bld, a, TRUE);
3061 }
3062 
3063 
3064 /**
3065  * Generate pow(x, y)
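 *
 * Computed as exp2(log2(x) * y), with x == 0 special-cased (via the select
 * below) to return 0, since log2(0) would otherwise propagate -inf.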
3066  */
3067 LLVMValueRef
3068 lp_build_pow(struct lp_build_context *bld,
3069              LLVMValueRef x,
3070              LLVMValueRef y)
3071 {
3072    /* TODO: optimize the constant case */
3073    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3074        LLVMIsConstant(x) && LLVMIsConstant(y)) {
3075       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3076                    __FUNCTION__);
3077    }
3078 
3079    LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
3080    LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2_safe(bld, x), y));
3081 
3082    res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
3083    return res;
3084 }
3085 
3086 
3087 /**
3088  * Generate exp(x)
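 * Computed as exp2(x * log2(e)).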
3089  */
3090 LLVMValueRef
3091 lp_build_exp(struct lp_build_context *bld,
3092              LLVMValueRef x)
3093 {
3094    /* log2(e) = 1/log(2) */
3095    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3096                                            1.4426950408889634);
3097 
3098    assert(lp_check_value(bld->type, x));
3099 
3100    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3101 }
3102 
3103 
3104 /**
3105  * Generate log(x)
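 * Computed as log2(x) * log(2).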
3106  * Behavior is undefined with infs, 0s and nans
3107  */
3108 LLVMValueRef
3109 lp_build_log(struct lp_build_context *bld,
3110              LLVMValueRef x)
3111 {
3112    /* log(2) */
3113    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3114                                           0.69314718055994529);
3115 
3116    assert(lp_check_value(bld->type, x));
3117 
3118    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3119 }
3120 
3121 
3122 /**
3123  * Generate log(x) that handles edge cases (infs, 0s and nans)
3124  */
3125 LLVMValueRef
3126 lp_build_log_safe(struct lp_build_context *bld,
3127                   LLVMValueRef x)
3128 {
3129    /* log(2) */
3130    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3131                                           0.69314718055994529);
3132 
3133    assert(lp_check_value(bld->type, x));
3134 
3135    return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3136 }
3137 
3138 
3139 /**
3140  * Generate polynomial.
3141  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3142  */
3143 LLVMValueRef
3144 lp_build_polynomial(struct lp_build_context *bld,
3145                     LLVMValueRef x,
3146                     const double *coeffs,
3147                     unsigned num_coeffs)
3148 {
3149    const struct lp_type type = bld->type;
3150    LLVMValueRef even = NULL, odd = NULL;
3151    LLVMValueRef x2;
3152    unsigned i;
3153 
3154    assert(lp_check_value(bld->type, x));
3155 
3156    /* TODO: optimize the constant case */
3157    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3158        LLVMIsConstant(x)) {
3159       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3160                    __FUNCTION__);
3161    }
3162 
3163    /*
3164     * Calculate odd and even terms separately to decrease data dependencies.
3165     * Ex:
3166     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
3167     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3168     */
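   /*
    * For example, with num_coeffs == 5 the loop below builds
    *    even = c[0] + x2*(c[2] + x2*c[4])
    *    odd  = c[1] + x2*c[3]
    * and the result is odd*x + even.
    */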
3169    x2 = lp_build_mul(bld, x, x);
3170 
3171    for (i = num_coeffs; i--; ) {
3172       LLVMValueRef coeff;
3173 
3174       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3175 
3176       if (i % 2 == 0) {
3177          if (even)
3178             even = lp_build_mad(bld, x2, even, coeff);
3179          else
3180             even = coeff;
3181       } else {
3182          if (odd)
3183             odd = lp_build_mad(bld, x2, odd, coeff);
3184          else
3185             odd = coeff;
3186       }
3187    }
3188 
3189    if (odd)
3190       return lp_build_mad(bld, odd, x, even);
3191    else if (even)
3192       return even;
3193    else
3194       return bld->undef;
3195 }
3196 
3197 
3198 /**
3199  * Minimax polynomial fit of 2**x, in range [0, 1[
3200  */
3201 static const double lp_build_exp2_polynomial[] = {
3202 #if EXP_POLY_DEGREE == 5
3203    1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3204    0.693153073200168932794,
3205    0.240153617044375388211,
3206    0.0558263180532956664775,
3207    0.00898934009049466391101,
3208    0.00187757667519147912699
3209 #elif EXP_POLY_DEGREE == 4
3210    1.00000259337069434683,
3211    0.693003834469974940458,
3212    0.24144275689150793076,
3213    0.0520114606103070150235,
3214    0.0135341679161270268764
3215 #elif EXP_POLY_DEGREE == 3
3216    0.999925218562710312959,
3217    0.695833540494823811697,
3218    0.226067155427249155588,
3219    0.0780245226406372992967
3220 #elif EXP_POLY_DEGREE == 2
3221    1.00172476321474503578,
3222    0.657636275736077639316,
3223    0.33718943461968720704
3224 #else
3225 #error
3226 #endif
3227 };
3228 
3229 
3230 LLVMValueRef
3231 lp_build_exp2(struct lp_build_context *bld,
3232               LLVMValueRef x)
3233 {
3234    LLVMBuilderRef builder = bld->gallivm->builder;
3235    const struct lp_type type = bld->type;
3236    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3237    LLVMValueRef ipart = NULL;
3238    LLVMValueRef fpart = NULL;
3239    LLVMValueRef expipart = NULL;
3240    LLVMValueRef expfpart = NULL;
3241    LLVMValueRef res = NULL;
3242 
3243    if (type.floating && type.width == 16) {
3244       char intrinsic[32];
3245       lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.exp2", vec_type);
3246       LLVMValueRef args[] = { x };
3247       return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3248    }
3249 
3250    assert(lp_check_value(bld->type, x));
3251 
3252    /* TODO: optimize the constant case */
3253    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3254        LLVMIsConstant(x)) {
3255       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3256                    __FUNCTION__);
3257    }
3258 
3259    assert(type.floating && type.width == 32);
3260 
3261    /* We want to preserve NaN, and make sure that for exp2, if x > 128
3262     * the result is +INF, and if x is smaller than -126.99999 the result is 0 */
3263    x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
3264                         GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3265    x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3266                         x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3267 
3268    /* ipart = floor(x) */
3269    /* fpart = x - ipart */
3270    lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3271 
3272    /* expipart = (float) (1 << ipart) */
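   /* That is, build the float 2**ipart directly by placing (ipart + 127) in
    * the IEEE-754 exponent field; the clamping of x above keeps this biased
    * exponent within the representable range. */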
3273    expipart = LLVMBuildAdd(builder, ipart,
3274                            lp_build_const_int_vec(bld->gallivm, type, 127), "");
3275    expipart = LLVMBuildShl(builder, expipart,
3276                            lp_build_const_int_vec(bld->gallivm, type, 23), "");
3277    expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3278 
3279    expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3280                                   ARRAY_SIZE(lp_build_exp2_polynomial));
3281 
3282    res = LLVMBuildFMul(builder, expipart, expfpart, "");
3283 
3284    return res;
3285 }
3286 
3287 
3288 /**
3289  * Extract the exponent of an IEEE-754 floating point value.
3290  *
3291  * Optionally apply an integer bias.
3292  *
3293  * Result is an integer value with
3294  *
3295  *   ifloor(log2(x)) + bias
3296  */
3297 LLVMValueRef
3298 lp_build_extract_exponent(struct lp_build_context *bld,
3299                           LLVMValueRef x,
3300                           int bias)
3301 {
3302    LLVMBuilderRef builder = bld->gallivm->builder;
3303    const struct lp_type type = bld->type;
3304    unsigned mantissa = lp_mantissa(type);
3305    LLVMValueRef res;
3306 
3307    assert(type.floating);
3308 
3309    assert(lp_check_value(bld->type, x));
3310 
3311    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3312 
3313    res = LLVMBuildLShr(builder, x,
3314                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3315    res = LLVMBuildAnd(builder, res,
3316                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
3317    res = LLVMBuildSub(builder, res,
3318                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3319 
3320    return res;
3321 }
3322 
3323 
3324 /**
3325  * Extract the mantissa of a floating point value.
3326  *
3327  * Result is a floating point value with
3328  *
3329  *   x / 2**floor(log2(x))
3330  */
3331 LLVMValueRef
3332 lp_build_extract_mantissa(struct lp_build_context *bld,
3333                           LLVMValueRef x)
3334 {
3335    LLVMBuilderRef builder = bld->gallivm->builder;
3336    const struct lp_type type = bld->type;
3337    unsigned mantissa = lp_mantissa(type);
3338    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3339                                                   (1ULL << mantissa) - 1);
3340    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3341    LLVMValueRef res;
3342 
3343    assert(lp_check_value(bld->type, x));
3344 
3345    assert(type.floating);
3346 
3347    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3348 
3349    /* res = x / 2**ipart */
3350    res = LLVMBuildAnd(builder, x, mantmask, "");
3351    res = LLVMBuildOr(builder, res, one, "");
3352    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3353 
3354    return res;
3355 }
3356 
3357 
3358 
3359 /**
3360  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3361  * These coefficients can be generated with
3362  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3363  */
3364 static const double lp_build_log2_polynomial[] = {
3365 #if LOG_POLY_DEGREE == 5
3366    2.88539008148777786488L,
3367    0.961796878841293367824L,
3368    0.577058946784739859012L,
3369    0.412914355135828735411L,
3370    0.308591899232910175289L,
3371    0.352376952300281371868L,
3372 #elif LOG_POLY_DEGREE == 4
3373    2.88539009343309178325L,
3374    0.961791550404184197881L,
3375    0.577440339438736392009L,
3376    0.403343858251329912514L,
3377    0.406718052498846252698L,
3378 #elif LOG_POLY_DEGREE == 3
3379    2.88538959748872753838L,
3380    0.961932915889597772928L,
3381    0.571118517972136195241L,
3382    0.493997535084709500285L,
3383 #else
3384 #error
3385 #endif
3386 };
3387 
3388 
3389 /**
3390  * See http://www.devmaster.net/forums/showthread.php?p=43580
3391  * http://en.wikipedia.org/wiki/Logarithm#Calculation
3392  * http://www.nezumi.demon.co.uk/consult/logx.htm
3393  *
3394  * If handle_edge_cases is true the function will perform computations
3395  * to match the required D3D10+ behavior for each of the edge cases.
3396  * That means that if input is:
3397  * - less than zero (down to and including -inf), then NaN will be returned
3398  * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3399  * - +infinity, then +infinity will be returned
3400  * - NaN, then NaN will be returned
3401  *
3402  * Those checks are fairly expensive so if you don't need them make sure
3403  * handle_edge_cases is false.
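 *
 * The approximation decomposes x = 2^exp * mant with mant in [1, 2), so that
 * log2(x) = exp + log2(mant), and log2(mant) is approximated from
 * y = (mant - 1) / (mant + 1) with a minimax polynomial in y^2.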
3404  */
3405 void
3406 lp_build_log2_approx(struct lp_build_context *bld,
3407                      LLVMValueRef x,
3408                      LLVMValueRef *p_exp,
3409                      LLVMValueRef *p_floor_log2,
3410                      LLVMValueRef *p_log2,
3411                      boolean handle_edge_cases)
3412 {
3413    LLVMBuilderRef builder = bld->gallivm->builder;
3414    const struct lp_type type = bld->type;
3415    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3416    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3417 
3418    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3419    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3420    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3421 
3422    LLVMValueRef i = NULL;
3423    LLVMValueRef y = NULL;
3424    LLVMValueRef z = NULL;
3425    LLVMValueRef exp = NULL;
3426    LLVMValueRef mant = NULL;
3427    LLVMValueRef logexp = NULL;
3428    LLVMValueRef p_z = NULL;
3429    LLVMValueRef res = NULL;
3430 
3431    if (bld->type.width == 16) {
3432       char intrinsic[32];
3433       lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.log2", bld->vec_type);
3434       LLVMValueRef args[] = { x };
3435       if (p_log2)
3436          *p_log2 = lp_build_intrinsic(builder, intrinsic, bld->vec_type, args, 1, 0);
3437       return;
3438    }
3439 
3440    assert(lp_check_value(bld->type, x));
3441 
3442    if (p_exp || p_floor_log2 || p_log2) {
3443       /* TODO: optimize the constant case */
3444       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3445           LLVMIsConstant(x)) {
3446          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3447                       __FUNCTION__);
3448       }
3449 
3450       assert(type.floating && type.width == 32);
3451 
3452       /*
3453        * We don't explicitly handle denormalized numbers. They will yield a
3454        * result in the neighbourhood of -127, which appears to be
3455        * adequate.
3456        */
3457 
3458       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3459 
3460       /* exp = (float) exponent(x) */
3461       exp = LLVMBuildAnd(builder, i, expmask, "");
3462    }
3463 
3464    if (p_floor_log2 || p_log2) {
3465       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3466       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3467       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3468    }
3469 
3470    if (p_log2) {
3471       /* mant = 1 + (float) mantissa(x) */
3472       mant = LLVMBuildAnd(builder, i, mantmask, "");
3473       mant = LLVMBuildOr(builder, mant, one, "");
3474       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3475 
3476       /* y = (mant - 1) / (mant + 1) */
3477       y = lp_build_div(bld,
3478          lp_build_sub(bld, mant, bld->one),
3479          lp_build_add(bld, mant, bld->one));
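      /*
       * Since mant = (1 + y) / (1 - y), log2(mant) = (2/ln(2)) * atanh(y)
       * = (2/ln(2)) * (y + y^3/3 + y^5/5 + ...); the polynomial P(z) below,
       * with z = y^2, approximates this series (note its leading coefficient
       * is ~2.8854 = 2/ln(2)).
       */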
3480 
3481       /* z = y^2 */
3482       z = lp_build_mul(bld, y, y);
3483 
3484       /* compute P(z) */
3485       p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3486                                 ARRAY_SIZE(lp_build_log2_polynomial));
3487 
3488       /* y * P(z) + logexp */
3489       res = lp_build_mad(bld, y, p_z, logexp);
3490 
3491       if (type.floating && handle_edge_cases) {
3492          LLVMValueRef negmask, infmask,  zmask;
3493          negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3494                                 lp_build_const_vec(bld->gallivm, type,  0.0f));
3495          zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3496                               lp_build_const_vec(bld->gallivm, type,  0.0f));
3497          infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3498                                 lp_build_const_vec(bld->gallivm, type,  INFINITY));
3499 
3500          /* If x is equal to +inf, make sure we return +inf */
3501          res = lp_build_select(bld, infmask,
3502                                lp_build_const_vec(bld->gallivm, type,  INFINITY),
3503                                res);
3504          /* If x is equal to 0, return -inf */
3505          res = lp_build_select(bld, zmask,
3506                                lp_build_const_vec(bld->gallivm, type,  -INFINITY),
3507                                res);
3508          /* If x is nan or less than 0, return nan */
3509          res = lp_build_select(bld, negmask,
3510                                lp_build_const_vec(bld->gallivm, type,  NAN),
3511                                res);
3512       }
3513    }
3514 
3515    if (p_exp) {
3516       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3517       *p_exp = exp;
3518    }
3519 
3520    if (p_floor_log2)
3521       *p_floor_log2 = logexp;
3522 
3523    if (p_log2)
3524       *p_log2 = res;
3525 }
3526 
3527 
3528 /*
3529  * log2 implementation which doesn't have special code to
3530  * handle edge cases (-inf, 0, inf, NaN). It's faster but
3531  * the results for those cases are undefined.
3532  */
3533 LLVMValueRef
3534 lp_build_log2(struct lp_build_context *bld,
3535               LLVMValueRef x)
3536 {
3537    LLVMValueRef res;
3538    lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3539    return res;
3540 }
3541 
3542 
3543 /*
3544  * Version of log2 which handles all edge cases.
3545  * Look at documentation of lp_build_log2_approx for
3546  * description of the behavior for each of the edge cases.
3547  */
3548 LLVMValueRef
3549 lp_build_log2_safe(struct lp_build_context *bld,
3550                    LLVMValueRef x)
3551 {
3552    LLVMValueRef res;
3553    lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3554    return res;
3555 }
3556 
3557 
3558 /**
3559  * Faster (and less accurate) log2.
3560  *
3561  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3562  *
3563  * Piece-wise linear approximation, with exact results when x is a
3564  * power of two.
3565  *
3566  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
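 *
 * For example, x = 8 gives 2 + 8/8 = 3 (exact), while x = 12 gives
 * 2 + 12/8 = 3.5 against a true log2(12) of roughly 3.585.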
3567  */
3568 LLVMValueRef
3569 lp_build_fast_log2(struct lp_build_context *bld,
3570                    LLVMValueRef x)
3571 {
3572    LLVMBuilderRef builder = bld->gallivm->builder;
3573    LLVMValueRef ipart;
3574    LLVMValueRef fpart;
3575 
3576    assert(lp_check_value(bld->type, x));
3577 
3578    assert(bld->type.floating);
3579 
3580    /* ipart = floor(log2(x)) - 1 */
3581    ipart = lp_build_extract_exponent(bld, x, -1);
3582    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3583 
3584    /* fpart = x / 2**ipart */
3585    fpart = lp_build_extract_mantissa(bld, x);
3586 
3587    /* ipart + fpart */
3588    return LLVMBuildFAdd(builder, ipart, fpart, "");
3589 }
3590 
3591 
3592 /**
3593  * Fast implementation of iround(log2(x)).
3594  *
3595  * Not an approximation -- it should give accurate results all the time.
3596  */
3597 LLVMValueRef
3598 lp_build_ilog2(struct lp_build_context *bld,
3599                LLVMValueRef x)
3600 {
3601    LLVMBuilderRef builder = bld->gallivm->builder;
3602    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3603    LLVMValueRef ipart;
3604 
3605    assert(bld->type.floating);
3606 
3607    assert(lp_check_value(bld->type, x));
3608 
3609    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
3610    x = LLVMBuildFMul(builder, x, sqrt2, "");
3611 
3612    /* ipart = floor(log2(x) + 0.5)  */
3613    ipart = lp_build_extract_exponent(bld, x, 0);
3614 
3615    return ipart;
3616 }
3617 
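/**
 * Generate x % y, using LLVM's frem/srem/urem instructions; for signed and
 * floating point inputs the remainder takes the sign of the dividend
 * (C-style remainder semantics).
 */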
3618 LLVMValueRef
3619 lp_build_mod(struct lp_build_context *bld,
3620              LLVMValueRef x,
3621              LLVMValueRef y)
3622 {
3623    LLVMBuilderRef builder = bld->gallivm->builder;
3624    LLVMValueRef res;
3625    const struct lp_type type = bld->type;
3626 
3627    assert(lp_check_value(type, x));
3628    assert(lp_check_value(type, y));
3629 
3630    if (type.floating)
3631       res = LLVMBuildFRem(builder, x, y, "");
3632    else if (type.sign)
3633       res = LLVMBuildSRem(builder, x, y, "");
3634    else
3635       res = LLVMBuildURem(builder, x, y, "");
3636    return res;
3637 }
3638 
3639 
3640 /*
3641  * For floating point inputs, creates and returns a mask
3642  * which is all 1's for channels that are NaN.
3643  * Channels of x that are not NaN will be 0.
3644  */
3645 LLVMValueRef
3646 lp_build_isnan(struct lp_build_context *bld,
3647                LLVMValueRef x)
3648 {
3649    LLVMValueRef mask;
3650    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3651 
3652    assert(bld->type.floating);
3653    assert(lp_check_value(bld->type, x));
3654 
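   /* Uses the x == x trick: an ordered equality compare is false only for
    * NaN lanes, so inverting and sign-extending it yields an all-1's mask
    * exactly where x is NaN. */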
3655    mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3656                         "isnotnan");
3657    mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3658    mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3659    return mask;
3660 }
3661 
3662 
3663 /* Returns all 1's for floating point numbers that are
3664  * finite, and all 0's for -inf,
3665  * +inf and NaNs. */
3666 LLVMValueRef
3667 lp_build_isfinite(struct lp_build_context *bld,
3668                   LLVMValueRef x)
3669 {
3670    LLVMBuilderRef builder = bld->gallivm->builder;
3671    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3672    struct lp_type int_type = lp_int_type(bld->type);
3673    LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3674    LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3675                                                     0x7f800000);
3676 
3677    if (!bld->type.floating) {
3678       return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3679    }
3680    assert(bld->type.floating);
3681    assert(lp_check_value(bld->type, x));
3682    assert(bld->type.width == 32);
3683 
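   /* A float is inf or NaN iff all exponent bits (0x7f800000) are set, so x
    * is finite iff the masked exponent differs from that all-ones pattern. */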
3684    intx = LLVMBuildAnd(builder, intx, infornan32, "");
3685    return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3686                            intx, infornan32);
3687 }
3688 
3689 
3690 /*
3691  * Returns all 1's for channels that are NaN or +/-inf, and all 0's otherwise.
3692  * The input has to be a floating point vector.
3693  */
3694 LLVMValueRef
3695 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3696                        const struct lp_type type,
3697                        LLVMValueRef x)
3698 {
3699    LLVMBuilderRef builder = gallivm->builder;
3700    struct lp_type int_type = lp_int_type(type);
3701    LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3702                                                 0x7f800000);
3703    LLVMValueRef ret;
3704 
3705    assert(type.floating);
3706 
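   /* Same exponent test as lp_build_isfinite(), with the comparison inverted:
    * all exponent bits set means inf or NaN. */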
3707    ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3708    ret = LLVMBuildAnd(builder, ret, const0, "");
3709    ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3710                           ret, const0);
3711 
3712    return ret;
3713 }
3714 
3715 
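/**
 * Save the current MXCSR register state (SSE only): emits stmxcsr into a
 * stack slot and returns a pointer to it, suitable for restoring later with
 * lp_build_fpstate_set().
 */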
3716 LLVMValueRef
3717 lp_build_fpstate_get(struct gallivm_state *gallivm)
3718 {
3719    if (util_get_cpu_caps()->has_sse) {
3720       LLVMBuilderRef builder = gallivm->builder;
3721       LLVMValueRef mxcsr_ptr = lp_build_alloca(
3722          gallivm,
3723          LLVMInt32TypeInContext(gallivm->context),
3724          "mxcsr_ptr");
3725       LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3726           LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3727       lp_build_intrinsic(builder,
3728                          "llvm.x86.sse.stmxcsr",
3729                          LLVMVoidTypeInContext(gallivm->context),
3730                          &mxcsr_ptr8, 1, 0);
3731       return mxcsr_ptr;
3732    }
3733    return 0;
3734 }
3735 
3736 void
3737 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3738                                   boolean zero)
3739 {
3740    if (util_get_cpu_caps()->has_sse) {
3741       /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3742       int daz_ftz = _MM_FLUSH_ZERO_MASK;
3743 
3744       LLVMBuilderRef builder = gallivm->builder;
3745       LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3746       LLVMValueRef mxcsr =
3747          LLVMBuildLoad2(builder, LLVMInt32TypeInContext(gallivm->context), mxcsr_ptr, "mxcsr");
3748 
3749       if (util_get_cpu_caps()->has_daz) {
3750          /* Enable denormals-are-zero (DAZ) mode */
3751          daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3752       }
3753       if (zero) {
3754          mxcsr = LLVMBuildOr(builder, mxcsr,
3755                              LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3756       } else {
3757          mxcsr = LLVMBuildAnd(builder, mxcsr,
3758                               LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3759       }
3760 
3761       LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3762       lp_build_fpstate_set(gallivm, mxcsr_ptr);
3763    }
3764 }
3765 
3766 
3767 void
3768 lp_build_fpstate_set(struct gallivm_state *gallivm,
3769                      LLVMValueRef mxcsr_ptr)
3770 {
3771    if (util_get_cpu_caps()->has_sse) {
3772       LLVMBuilderRef builder = gallivm->builder;
3773       mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3774                      LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3775       lp_build_intrinsic(builder,
3776                          "llvm.x86.sse.ldmxcsr",
3777                          LLVMVoidTypeInContext(gallivm->context),
3778                          &mxcsr_ptr, 1, 0);
3779    }
3780 }
3781