1 /**************************************************************************
2  *
3  * Copyright 2009-2010 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 
29 /**
30  * @file
31  * Helper
32  *
33  * LLVM IR doesn't support all basic arithmetic operations we care about (most
34  * notably min/max and saturated operations), and it is often necessary to
35  * resort to machine-specific intrinsics directly. The functions here hide all
36  * these implementation details from the other modules.
37  *
38  * We also do simple expression simplification here, because:
39  * - it is very easy given we have all necessary information readily available
40  * - LLVM optimization passes fail to simplify several vector expressions
41  * - we often know value constraints which the optimization passes have no way
42  *   of knowing, such as when source arguments are known to be in the [0, 1] range.
43  *
44  * @author Jose Fonseca <jfonseca@vmware.com>
45  */
46 
47 
48 #include <float.h>
49 
50 #include <llvm/Config/llvm-config.h>
51 
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
56 
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
67 
68 #if DETECT_ARCH_SSE
69 #include <xmmintrin.h>
70 #endif
71 
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #endif
75 
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #endif
79 
80 #define EXP_POLY_DEGREE 5
81 
82 #define LOG_POLY_DEGREE 4
83 
84 
85 /**
86  * Generate min(a, b)
87  * No checks for the special-case values of a or b (0 or 1) are done.
88  * NaNs are handled according to the behavior specified by the
89  * nan_behavior argument.
90  */
91 static LLVMValueRef
92 lp_build_min_simple(struct lp_build_context *bld,
93                     LLVMValueRef a,
94                     LLVMValueRef b,
95                     enum gallivm_nan_behavior nan_behavior)
96 {
97    const struct lp_type type = bld->type;
98    const char *intrinsic = NULL;
99    unsigned intr_size = 0;
100    LLVMValueRef cond;
101 
102    assert(lp_check_value(type, a));
103    assert(lp_check_value(type, b));
104 
105    /* TODO: optimize the constant case */
106 
107    if (type.floating && util_get_cpu_caps()->has_sse) {
108       if (type.width == 32) {
109          if (type.length == 1) {
110             intrinsic = "llvm.x86.sse.min.ss";
111             intr_size = 128;
112          }
113          else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
114             intrinsic = "llvm.x86.sse.min.ps";
115             intr_size = 128;
116          }
117          else {
118             intrinsic = "llvm.x86.avx.min.ps.256";
119             intr_size = 256;
120          }
121       }
122       if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
123          if (type.length == 1) {
124             intrinsic = "llvm.x86.sse2.min.sd";
125             intr_size = 128;
126          }
127          else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
128             intrinsic = "llvm.x86.sse2.min.pd";
129             intr_size = 128;
130          }
131          else {
132             intrinsic = "llvm.x86.avx.min.pd.256";
133             intr_size = 256;
134          }
135       }
136    }
137    else if (type.floating && util_get_cpu_caps()->has_altivec) {
138       if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140                       __func__);
141       }
142       if (type.width == 32 && type.length == 4) {
143          intrinsic = "llvm.ppc.altivec.vminfp";
144          intr_size = 128;
145       }
146    } else if (util_get_cpu_caps()->has_altivec) {
147       intr_size = 128;
148       if (type.width == 8) {
149          if (!type.sign) {
150             intrinsic = "llvm.ppc.altivec.vminub";
151          } else {
152             intrinsic = "llvm.ppc.altivec.vminsb";
153          }
154       } else if (type.width == 16) {
155          if (!type.sign) {
156             intrinsic = "llvm.ppc.altivec.vminuh";
157          } else {
158             intrinsic = "llvm.ppc.altivec.vminsh";
159          }
160       } else if (type.width == 32) {
161          if (!type.sign) {
162             intrinsic = "llvm.ppc.altivec.vminuw";
163          } else {
164             intrinsic = "llvm.ppc.altivec.vminsw";
165          }
166       }
167    }
168 
169    if (intrinsic) {
170       /* We need to handle NaNs for floating point numbers. If one of the
171        * inputs is NaN the other should be returned (required by both D3D10+
172        * and OpenCL).
173        * The SSE intrinsics return the second operand when either input is
174        * NaN, so we need special code to handle those cases.
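       * E.g. minps(NaN, 1.0) yields 1.0 (fine), but minps(1.0, NaN) yields
       * NaN, so the isnan(b)/select below substitutes a whenever b is NaN.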
175        */
176       if (util_get_cpu_caps()->has_sse && type.floating &&
177           nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
178          LLVMValueRef isnan, min;
179          min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
180                                                    type,
181                                                    intr_size, a, b);
182          isnan = lp_build_isnan(bld, b);
183          return lp_build_select(bld, isnan, a, min);
184       } else {
185          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
186                                                     type,
187                                                     intr_size, a, b);
188       }
189    }
190 
191    if (type.floating) {
192       switch (nan_behavior) {
193       case GALLIVM_NAN_RETURN_OTHER: {
194          LLVMValueRef isnan = lp_build_isnan(bld, a);
195          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
196          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
197          return lp_build_select(bld, cond, a, b);
198       }
199          break;
200       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
201          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
202          return lp_build_select(bld, cond, a, b);
203       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
204          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
205          return lp_build_select(bld, cond, b, a);
206       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
207          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
208          return lp_build_select(bld, cond, a, b);
209          break;
210       default:
211          assert(0);
212          cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
213          return lp_build_select(bld, cond, a, b);
214       }
215    } else {
216       cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
217       return lp_build_select(bld, cond, a, b);
218    }
219 }
220 
221 
222 LLVMValueRef
223 lp_build_fmuladd(LLVMBuilderRef builder,
224                  LLVMValueRef a,
225                  LLVMValueRef b,
226                  LLVMValueRef c)
227 {
228    LLVMTypeRef type = LLVMTypeOf(a);
229    assert(type == LLVMTypeOf(b));
230    assert(type == LLVMTypeOf(c));
231 
232    char intrinsic[32];
233    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
234    LLVMValueRef args[] = { a, b, c };
235    return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
236 }
237 
238 
239 /**
240  * Generate max(a, b)
241  * No checks for the special-case values of a or b (0 or 1) are done.
242  * NaNs are handled according to the behavior specified by the
243  * nan_behavior argument.
244  */
245 static LLVMValueRef
246 lp_build_max_simple(struct lp_build_context *bld,
247                     LLVMValueRef a,
248                     LLVMValueRef b,
249                     enum gallivm_nan_behavior nan_behavior)
250 {
251    const struct lp_type type = bld->type;
252    const char *intrinsic = NULL;
253    unsigned intr_size = 0;
254    LLVMValueRef cond;
255 
256    assert(lp_check_value(type, a));
257    assert(lp_check_value(type, b));
258 
259    /* TODO: optimize the constant case */
260 
261    if (type.floating && util_get_cpu_caps()->has_sse) {
262       if (type.width == 32) {
263          if (type.length == 1) {
264             intrinsic = "llvm.x86.sse.max.ss";
265             intr_size = 128;
266          }
267          else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
268             intrinsic = "llvm.x86.sse.max.ps";
269             intr_size = 128;
270          }
271          else {
272             intrinsic = "llvm.x86.avx.max.ps.256";
273             intr_size = 256;
274          }
275       }
276       if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
277          if (type.length == 1) {
278             intrinsic = "llvm.x86.sse2.max.sd";
279             intr_size = 128;
280          }
281          else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
282             intrinsic = "llvm.x86.sse2.max.pd";
283             intr_size = 128;
284          }
285          else {
286             intrinsic = "llvm.x86.avx.max.pd.256";
287             intr_size = 256;
288          }
289       }
290    }
291    else if (type.floating && util_get_cpu_caps()->has_altivec) {
292       if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
293          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
294                       __func__);
295       }
296       if (type.width == 32 && type.length == 4) {
297          intrinsic = "llvm.ppc.altivec.vmaxfp";
298          intr_size = 128;
299       }
300    } else if (util_get_cpu_caps()->has_altivec) {
301       intr_size = 128;
302       if (type.width == 8) {
303          if (!type.sign) {
304             intrinsic = "llvm.ppc.altivec.vmaxub";
305          } else {
306             intrinsic = "llvm.ppc.altivec.vmaxsb";
307          }
308       } else if (type.width == 16) {
309          if (!type.sign) {
310             intrinsic = "llvm.ppc.altivec.vmaxuh";
311          } else {
312             intrinsic = "llvm.ppc.altivec.vmaxsh";
313          }
314       } else if (type.width == 32) {
315          if (!type.sign) {
316             intrinsic = "llvm.ppc.altivec.vmaxuw";
317          } else {
318             intrinsic = "llvm.ppc.altivec.vmaxsw";
319          }
320       }
321    }
322 
323    if (intrinsic) {
324       if (util_get_cpu_caps()->has_sse && type.floating &&
325           nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
326          LLVMValueRef isnan, max;
327          max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
328                                                    type,
329                                                    intr_size, a, b);
330          isnan = lp_build_isnan(bld, b);
331          return lp_build_select(bld, isnan, a, max);
332       } else {
333          return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
334                                                     type,
335                                                     intr_size, a, b);
336       }
337    }
338 
339    if (type.floating) {
340       switch (nan_behavior) {
341       case GALLIVM_NAN_RETURN_OTHER: {
342          LLVMValueRef isnan = lp_build_isnan(bld, a);
343          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
344          cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
345          return lp_build_select(bld, cond, a, b);
346       }
347          break;
348       case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
349          cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
350          return lp_build_select(bld, cond, a, b);
351       case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
352          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
353          return lp_build_select(bld, cond, b, a);
354       case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
355          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
356          return lp_build_select(bld, cond, a, b);
357          break;
358       default:
359          assert(0);
360          cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
361          return lp_build_select(bld, cond, a, b);
362       }
363    } else {
364       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
365       return lp_build_select(bld, cond, a, b);
366    }
367 }
368 
369 
370 /**
371  * Generate 1 - a, or ~a depending on bld->type.
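 * (For unsigned normalized integers ~a gives the same result, since 1.0 is
 * encoded as 2**w - 1 and (2**w - 1) - a == ~a in w-bit arithmetic.)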
372  */
373 LLVMValueRef
374 lp_build_comp(struct lp_build_context *bld,
375               LLVMValueRef a)
376 {
377    LLVMBuilderRef builder = bld->gallivm->builder;
378    const struct lp_type type = bld->type;
379 
380    assert(lp_check_value(type, a));
381 
382    if (a == bld->one)
383       return bld->zero;
384    if (a == bld->zero)
385       return bld->one;
386 
387    if (type.norm && !type.floating && !type.fixed && !type.sign) {
388       if (LLVMIsConstant(a))
389          return LLVMConstNot(a);
390       else
391          return LLVMBuildNot(builder, a, "");
392    }
393 
394    if (type.floating)
395       return LLVMBuildFSub(builder, bld->one, a, "");
396    else
397       return LLVMBuildSub(builder, bld->one, a, "");
398 }
399 
400 
401 /**
402  * Generate a + b
403  */
404 LLVMValueRef
405 lp_build_add(struct lp_build_context *bld,
406              LLVMValueRef a,
407              LLVMValueRef b)
408 {
409    LLVMBuilderRef builder = bld->gallivm->builder;
410    const struct lp_type type = bld->type;
411    LLVMValueRef res;
412 
413    assert(lp_check_value(type, a));
414    assert(lp_check_value(type, b));
415 
416    if (a == bld->zero)
417       return b;
418    if (b == bld->zero)
419       return a;
420    if (a == bld->undef || b == bld->undef)
421       return bld->undef;
422 
423    if (type.norm) {
424       const char *intrinsic = NULL;
425 
426       if (!type.sign && (a == bld->one || b == bld->one))
427         return bld->one;
428 
429       if (!type.floating && !type.fixed) {
430          if (LLVM_VERSION_MAJOR >= 8) {
431             char intrin[32];
432             intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
433             lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
434             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
435          }
436          if (type.width * type.length == 128) {
437             if (util_get_cpu_caps()->has_sse2) {
438                if (type.width == 8)
439                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
440                if (type.width == 16)
441                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
442             } else if (util_get_cpu_caps()->has_altivec) {
443                if (type.width == 8)
444                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
445                if (type.width == 16)
446                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
447             }
448          }
449          if (type.width * type.length == 256) {
450             if (util_get_cpu_caps()->has_avx2) {
451                if (type.width == 8)
452                   intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
453                if (type.width == 16)
454                   intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
455             }
456          }
457       }
458 
459       if (intrinsic)
460          return lp_build_intrinsic_binary(builder, intrinsic,
461                        lp_build_vec_type(bld->gallivm, bld->type), a, b);
462    }
463 
464    if (type.norm && !type.floating && !type.fixed) {
465       if (type.sign) {
466          uint64_t sign = (uint64_t)1 << (type.width - 1);
467          LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
468          LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
469          /* a_clamp_max is the maximum a for positive b,
470             a_clamp_min is the minimum a for negative b. */
471          LLVMValueRef a_clamp_max =
472             lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""),
473                                 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
474          LLVMValueRef a_clamp_min =
475             lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""),
476                                 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
477          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b,
478                                      bld->zero), a_clamp_max, a_clamp_min);
479       }
480    }
481 
482    if (type.floating)
483       res = LLVMBuildFAdd(builder, a, b, "");
484    else
485       res = LLVMBuildAdd(builder, a, b, "");
486 
487    /* clamp to ceiling of 1.0 */
488    if (bld->type.norm && (bld->type.floating || bld->type.fixed))
489       res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
490 
491    if (type.norm && !type.floating && !type.fixed) {
492       if (!type.sign) {
493          /*
494           * newer llvm versions no longer support the intrinsics, but recognize
495           * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
496           * code, it is important we match the pattern llvm uses (and pray llvm
497           * doesn't change it - and hope they decide on the same pattern for
498           * all backends supporting it...).
499           * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
500           * interfere with llvm's ability to recognize the pattern but seems
501           * a bit brittle.
502           * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
503           */
504          LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
505          res = lp_build_select(bld, overflowed,
506                                LLVMConstAllOnes(bld->int_vec_type), res);
507       }
508    }
509 
510    /* XXX clamp to floor of -1 or 0??? */
511 
512    return res;
513 }
514 
515 
516 /** Return the scalar sum of the elements of a.
517  * Should avoid this operation whenever possible.
518  */
519 LLVMValueRef
520 lp_build_horizontal_add(struct lp_build_context *bld,
521                         LLVMValueRef a)
522 {
523    LLVMBuilderRef builder = bld->gallivm->builder;
524    const struct lp_type type = bld->type;
525    LLVMValueRef index, res;
526    unsigned i, length;
527    LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
528    LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
529    LLVMValueRef vecres, elem2;
530 
531    assert(lp_check_value(type, a));
532 
533    if (type.length == 1) {
534       return a;
535    }
536 
537    assert(!bld->type.norm);
538 
539    /*
540     * For byte vectors we could do much better with psadbw.
541     * Using repeated shuffle/adds here instead. Note with multiple vectors
542     * this can be done more efficiently as outlined in the intel
543     * optimization manual.
544     * Note: could cause data rearrangement if used with smaller element
545     * sizes.
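    *
    * Sketch of the reduction below for a float4 {a, b, c, d}:
    * split into {a, b} and {c, d}, add -> {a+c, b+d}, then extract the two
    * remaining lanes and add -> (a+c) + (b+d).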
546     */
547 
548    vecres = a;
549    length = type.length / 2;
550    while (length > 1) {
551       LLVMValueRef vec1, vec2;
552       for (i = 0; i < length; i++) {
553          shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
554          shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
555       }
556       vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
557                                     LLVMConstVector(shuffles1, length), "");
558       vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
559                                     LLVMConstVector(shuffles2, length), "");
560       if (type.floating) {
561          vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
562       }
563       else {
564          vecres = LLVMBuildAdd(builder, vec1, vec2, "");
565       }
566       length = length >> 1;
567    }
568 
569    /* always have vector of size 2 here */
570    assert(length == 1);
571 
572    index = lp_build_const_int32(bld->gallivm, 0);
573    res = LLVMBuildExtractElement(builder, vecres, index, "");
574    index = lp_build_const_int32(bld->gallivm, 1);
575    elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
576 
577    if (type.floating)
578       res = LLVMBuildFAdd(builder, res, elem2, "");
579    else
580       res = LLVMBuildAdd(builder, res, elem2, "");
581 
582    return res;
583 }
584 
585 
586 /**
587  * Return the horizontal sums of 4 float vectors as a float4 vector.
588  * This uses the technique as outlined in Intel Optimization Manual.
589  */
590 static LLVMValueRef
591 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
592                             LLVMValueRef src[4])
593 {
594    struct gallivm_state *gallivm = bld->gallivm;
595    LLVMBuilderRef builder = gallivm->builder;
596    LLVMValueRef shuffles[4];
597    LLVMValueRef tmp[4];
598    LLVMValueRef sumtmp[2], shuftmp[2];
599 
600    /* lower half of regs */
601    shuffles[0] = lp_build_const_int32(gallivm, 0);
602    shuffles[1] = lp_build_const_int32(gallivm, 1);
603    shuffles[2] = lp_build_const_int32(gallivm, 4);
604    shuffles[3] = lp_build_const_int32(gallivm, 5);
605    tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
606                                    LLVMConstVector(shuffles, 4), "");
607    tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
608                                    LLVMConstVector(shuffles, 4), "");
609 
610    /* upper half of regs */
611    shuffles[0] = lp_build_const_int32(gallivm, 2);
612    shuffles[1] = lp_build_const_int32(gallivm, 3);
613    shuffles[2] = lp_build_const_int32(gallivm, 6);
614    shuffles[3] = lp_build_const_int32(gallivm, 7);
615    tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
616                                    LLVMConstVector(shuffles, 4), "");
617    tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
618                                    LLVMConstVector(shuffles, 4), "");
619 
620    sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
621    sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
622 
623    shuffles[0] = lp_build_const_int32(gallivm, 0);
624    shuffles[1] = lp_build_const_int32(gallivm, 2);
625    shuffles[2] = lp_build_const_int32(gallivm, 4);
626    shuffles[3] = lp_build_const_int32(gallivm, 6);
627    shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
628                                        LLVMConstVector(shuffles, 4), "");
629 
630    shuffles[0] = lp_build_const_int32(gallivm, 1);
631    shuffles[1] = lp_build_const_int32(gallivm, 3);
632    shuffles[2] = lp_build_const_int32(gallivm, 5);
633    shuffles[3] = lp_build_const_int32(gallivm, 7);
634    shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
635                                        LLVMConstVector(shuffles, 4), "");
636 
637    return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
638 }
639 
640 
641 /*
642  * partially horizontally add 2-4 float vectors with length nx4,
643  * i.e. only four adjacent values in each vector will be added,
644  * assuming values are really grouped in 4 which also determines
645  * output order.
646  *
647  * Return a vector of the same length as the initial vectors,
648  * with the excess elements (if any) being undefined.
649  * The element order is independent of number of input vectors.
650  * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
651  * the output order thus will be
652  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
653  */
654 LLVMValueRef
655 lp_build_hadd_partial4(struct lp_build_context *bld,
656                        LLVMValueRef vectors[],
657                        unsigned num_vecs)
658 {
659    struct gallivm_state *gallivm = bld->gallivm;
660    LLVMBuilderRef builder = gallivm->builder;
661    LLVMValueRef ret_vec;
662    LLVMValueRef tmp[4];
663    const char *intrinsic = NULL;
664 
665    assert(num_vecs >= 2 && num_vecs <= 4);
666    assert(bld->type.floating);
667 
668    /* only use this with at least 2 vectors, as it is sort of expensive
669     * (depending on cpu) and we always need two horizontal adds anyway,
670     * so a shuffle/add approach might be better.
671     */
672 
673    tmp[0] = vectors[0];
674    tmp[1] = vectors[1];
675 
676    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
677    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
678 
679    if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
680        bld->type.length == 4) {
681       intrinsic = "llvm.x86.sse3.hadd.ps";
682    }
683    else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
684             bld->type.length == 8) {
685       intrinsic = "llvm.x86.avx.hadd.ps.256";
686    }
687    if (intrinsic) {
688       tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
689                                        lp_build_vec_type(gallivm, bld->type),
690                                        tmp[0], tmp[1]);
691       if (num_vecs > 2) {
692          tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
693                                           lp_build_vec_type(gallivm, bld->type),
694                                           tmp[2], tmp[3]);
695       }
696       else {
697          tmp[1] = tmp[0];
698       }
699       return lp_build_intrinsic_binary(builder, intrinsic,
700                                        lp_build_vec_type(gallivm, bld->type),
701                                        tmp[0], tmp[1]);
702    }
703 
704    if (bld->type.length == 4) {
705       ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
706    }
707    else {
708       LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
709       unsigned j;
710       unsigned num_iter = bld->type.length / 4;
711       struct lp_type parttype = bld->type;
712       parttype.length = 4;
713       for (j = 0; j < num_iter; j++) {
714          LLVMValueRef partsrc[4];
715          unsigned i;
716          for (i = 0; i < 4; i++) {
717             partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
718          }
719          partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
720       }
721       ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
722    }
723    return ret_vec;
724 }
725 
726 
727 /**
728  * Generate a - b
729  */
730 LLVMValueRef
731 lp_build_sub(struct lp_build_context *bld,
732              LLVMValueRef a,
733              LLVMValueRef b)
734 {
735    LLVMBuilderRef builder = bld->gallivm->builder;
736    const struct lp_type type = bld->type;
737    LLVMValueRef res;
738 
739    assert(lp_check_value(type, a));
740    assert(lp_check_value(type, b));
741 
742    if (b == bld->zero)
743       return a;
744    if (a == bld->undef || b == bld->undef)
745       return bld->undef;
746    if (a == b)
747       return bld->zero;
748 
749    if (type.norm) {
750       const char *intrinsic = NULL;
751 
752       if (!type.sign && b == bld->one)
753         return bld->zero;
754 
755       if (!type.floating && !type.fixed) {
756          if (LLVM_VERSION_MAJOR >= 8) {
757             char intrin[32];
758             intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
759             lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
760             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
761          }
762          if (type.width * type.length == 128) {
763             if (util_get_cpu_caps()->has_sse2) {
764                if (type.width == 8)
765                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
766                if (type.width == 16)
767                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
768             } else if (util_get_cpu_caps()->has_altivec) {
769                if (type.width == 8)
770                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
771                if (type.width == 16)
772                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
773             }
774          }
775          if (type.width * type.length == 256) {
776             if (util_get_cpu_caps()->has_avx2) {
777                if (type.width == 8)
778                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
779                if (type.width == 16)
780                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
781             }
782          }
783       }
784 
785       if (intrinsic)
786          return lp_build_intrinsic_binary(builder, intrinsic,
787                       lp_build_vec_type(bld->gallivm, bld->type), a, b);
788    }
789 
790    if (type.norm && !type.floating && !type.fixed) {
791       if (type.sign) {
792          uint64_t sign = (uint64_t)1 << (type.width - 1);
793          LLVMValueRef max_val =
794             lp_build_const_int_vec(bld->gallivm, type, sign - 1);
795          LLVMValueRef min_val =
796             lp_build_const_int_vec(bld->gallivm, type, sign);
797          /* a_clamp_max is the maximum a for negative b,
798             a_clamp_min is the minimum a for positive b. */
799          LLVMValueRef a_clamp_max =
800             lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""),
801                                 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
802          LLVMValueRef a_clamp_min =
803             lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""),
804                                 GALLIVM_NAN_BEHAVIOR_UNDEFINED);
805          a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b,
806                                                bld->zero),
807                              a_clamp_min, a_clamp_max);
808       } else {
809          /*
810           * This must match llvm pattern for saturated unsigned sub.
811           * (lp_build_max_simple actually does the job with its current
812           * definition but do it explicitly here.)
813           * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
814           * interfere with llvm's ability to recognize the pattern but seems
815           * a bit brittle.
816           * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
817           */
818          LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
819          a = lp_build_select(bld, no_ov, a, b);
820       }
821    }
822 
823    if (type.floating)
824       res = LLVMBuildFSub(builder, a, b, "");
825    else
826       res = LLVMBuildSub(builder, a, b, "");
827 
828    if (bld->type.norm && (bld->type.floating || bld->type.fixed))
829       res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
830 
831    return res;
832 }
833 
834 
835 /**
836  * Normalized multiplication.
837  *
838  * There are several approaches (using 8-bit normalized multiplication as
839  * an example):
840  *
841  * - alpha plus one
842  *
843  *     makes the following approximation to the division (Sree)
844  *
845  *       a*b/255 ~= (a*(b + 1)) >> 8
846  *
847  *     which is the fastest method that satisfies the following OpenGL
848  *     criteria of
849  *
850  *       0*0 = 0 and 255*255 = 255
851  *
852  * - geometric series
853  *
854  *     takes the geometric series approximation to the division
855  *
856  *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
857  *
858  *     in this case just the first two terms to fit in 16bit arithmetic
859  *
860  *       t/255 ~= (t + (t >> 8)) >> 8
861  *
862  *     note that just by itself it doesn't satisfy the OpenGL criteria,
863  *     as it yields 255*255 = 254, so the special case b = 255 must be
864  *     accounted for or roundoff must be used.
865  *
866  * - geometric series plus rounding
867  *
868  *     when using the geometric series division, instead of truncating the
869  *     result, use roundoff in the approximation (Jim Blinn)
870  *
871  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
872  *
873  *     which achieves exact results.
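 *
 *     e.g. for a = b = 255: t = 65025, t + (t >> 8) + 0x80 = 65025 + 254 + 128
 *     = 65407, and 65407 >> 8 = 255, so 255*255 = 255 as required (0*0 = 0
 *     holds trivially).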
874  *
875  *
876  *
877  * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
878  *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
879  * @sa Michael Herf, The "double blend trick", May 2000,
880  *     http://www.stereopsis.com/doubleblend.html
881  */
882 LLVMValueRef
883 lp_build_mul_norm(struct gallivm_state *gallivm,
884                   struct lp_type wide_type,
885                   LLVMValueRef a, LLVMValueRef b)
886 {
887    LLVMBuilderRef builder = gallivm->builder;
888    struct lp_build_context bld;
889    unsigned n;
890    LLVMValueRef half;
891    LLVMValueRef ab;
892 
893    assert(!wide_type.floating);
894    assert(lp_check_value(wide_type, a));
895    assert(lp_check_value(wide_type, b));
896 
897    lp_build_context_init(&bld, gallivm, wide_type);
898 
899    n = wide_type.width / 2;
900    if (wide_type.sign) {
901       --n;
902    }
903 
904    /*
905     * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
906     * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
907     */
908 
909    /*
910     * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
911     */
912 
913    ab = LLVMBuildMul(builder, a, b, "");
914    ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
915 
916    /*
917     * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
918     */
919 
920    half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
921    if (wide_type.sign) {
922       LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
923       LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
924       half = lp_build_select(&bld, sign, minus_half, half);
925    }
926    ab = LLVMBuildAdd(builder, ab, half, "");
927 
928    /* Final division */
929    ab = lp_build_shr_imm(&bld, ab, n);
930 
931    return ab;
932 }
933 
934 
935 /**
936  * Generate a * b
937  */
938 LLVMValueRef
939 lp_build_mul(struct lp_build_context *bld,
940              LLVMValueRef a,
941              LLVMValueRef b)
942 {
943    LLVMBuilderRef builder = bld->gallivm->builder;
944    const struct lp_type type = bld->type;
945 
946    assert(lp_check_value(type, a));
947    assert(lp_check_value(type, b));
948 
949    if (!type.floating || !type.nan_preserve) {
950       if (a == bld->zero)
951          return bld->zero;
952       if (b == bld->zero)
953          return bld->zero;
954    }
955 
956    if (a == bld->one)
957       return b;
958    if (b == bld->one)
959       return a;
960    if (a == bld->undef || b == bld->undef)
961       return bld->undef;
962 
963    if (!type.floating && !type.fixed && type.norm) {
964       struct lp_type wide_type = lp_wider_type(type);
965       LLVMValueRef al, ah, bl, bh, abl, abh, ab;
966 
967       lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
968       lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
969 
970       /* PMULLW, PSRLW, PADDW */
971       abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
972       abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
973 
974       ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
975 
976       return ab;
977    }
978 
979    LLVMValueRef shift = type.fixed
980       ? lp_build_const_int_vec(bld->gallivm, type, type.width/2) : NULL;
981 
982    LLVMValueRef res;
983    if (type.floating)
984       res = LLVMBuildFMul(builder, a, b, "");
985    else
986       res = LLVMBuildMul(builder, a, b, "");
987    if (shift) {
988       if (type.sign)
989          res = LLVMBuildAShr(builder, res, shift, "");
990       else
991          res = LLVMBuildLShr(builder, res, shift, "");
992    }
993 
994    return res;
995 }
996 
997 
998 /*
999  * Widening mul, valid for 32x32 bit -> 64bit only.
1000  * Result is low 32bits, high bits returned in res_hi.
1001  *
1002  * Emits code that is meant to be compiled for the host CPU.
1003  */
1004 LLVMValueRef
1005 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1006                          LLVMValueRef a,
1007                          LLVMValueRef b,
1008                          LLVMValueRef *res_hi)
1009 {
1010    struct gallivm_state *gallivm = bld->gallivm;
1011    LLVMBuilderRef builder = gallivm->builder;
1012 
1013    assert(bld->type.width == 32);
1014    assert(bld->type.floating == 0);
1015    assert(bld->type.fixed == 0);
1016    assert(bld->type.norm == 0);
1017 
1018    /*
1019     * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1020     * for x86 simd is atrocious (even if the high bits weren't required),
1021     * trying to handle real 64bit inputs (which of course can't happen due
1022     * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1023     * apparently llvm does not recognize this widening mul). This includes 6
1024     * (instead of 2) pmuludq plus extra adds and shifts
1025     * The same story applies to signed mul, albeit fixing this requires sse41.
1026     * https://llvm.org/bugs/show_bug.cgi?id=30845
1027     * So, whip up our own code, albeit only for length 4 and 8 (which
1028     * should be good enough)...
1029     * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1030     * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1031     * for signed), which the fallback code does not, without this llvm
1032     * will likely still produce atrocious code.
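    *
    * Rough shape of the code below: pmuludq/pmuldq multiply the even lanes
    * (0, 2, ...) into 64-bit products; the odd lanes are first shuffled down
    * to even positions and multiplied the same way; the products are then
    * bitcast back to 32-bit lanes and shuffled into separate low-half and
    * high-half result vectors.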
1033     */
1034    if (LLVM_VERSION_MAJOR < 7 &&
1035        (bld->type.length == 4 || bld->type.length == 8) &&
1036        ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
1037         util_get_cpu_caps()->has_sse4_1)) {
1038       const char *intrinsic = NULL;
1039       LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1040       LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1041       struct lp_type type_wide = lp_wider_type(bld->type);
1042       LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1043       unsigned i;
1044       for (i = 0; i < bld->type.length; i += 2) {
1045          shuf[i] = lp_build_const_int32(gallivm, i+1);
1046          shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1047       }
1048       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1049       aeven = a;
1050       beven = b;
1051       aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1052       bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1053 
1054       if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
1055          if (bld->type.sign) {
1056             intrinsic = "llvm.x86.avx2.pmul.dq";
1057          } else {
1058             intrinsic = "llvm.x86.avx2.pmulu.dq";
1059          }
1060          muleven = lp_build_intrinsic_binary(builder, intrinsic,
1061                                              wider_type, aeven, beven);
1062          mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1063                                             wider_type, aodd, bodd);
1064       }
1065       else {
1066          /* for consistent naming look elsewhere... */
1067          if (bld->type.sign) {
1068             intrinsic = "llvm.x86.sse41.pmuldq";
1069          } else {
1070             intrinsic = "llvm.x86.sse2.pmulu.dq";
1071          }
1072          /*
1073           * XXX If we only have AVX but not AVX2 this is a pain.
1074           * lp_build_intrinsic_binary_anylength() can't handle it
1075           * (due to src and dst type not being identical).
1076           */
1077          if (bld->type.length == 8) {
1078             LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1079             LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1080             LLVMValueRef muleven2[2], mulodd2[2];
1081             struct lp_type type_wide_half = type_wide;
1082             LLVMTypeRef wtype_half;
1083             type_wide_half.length = 2;
1084             wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1085             aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1086             aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1087             bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1088             bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1089             aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1090             aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1091             boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1092             boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1093             muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1094                                                     wtype_half, aevenlo, bevenlo);
1095             mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1096                                                    wtype_half, aoddlo, boddlo);
1097             muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1098                                                     wtype_half, aevenhi, bevenhi);
1099             mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1100                                                    wtype_half, aoddhi, boddhi);
1101             muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1102             mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1103 
1104          }
1105          else {
1106             muleven = lp_build_intrinsic_binary(builder, intrinsic,
1107                                                 wider_type, aeven, beven);
1108             mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1109                                                wider_type, aodd, bodd);
1110          }
1111       }
1112       muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1113       mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1114 
1115       for (i = 0; i < bld->type.length; i += 2) {
1116          shuf[i] = lp_build_const_int32(gallivm, i + 1);
1117          shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1118       }
1119       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1120       *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1121 
1122       for (i = 0; i < bld->type.length; i += 2) {
1123          shuf[i] = lp_build_const_int32(gallivm, i);
1124          shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1125       }
1126       shuf_vec = LLVMConstVector(shuf, bld->type.length);
1127       return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1128    }
1129    else {
1130       return lp_build_mul_32_lohi(bld, a, b, res_hi);
1131    }
1132 }
1133 
1134 
1135 /*
1136  * Widening mul, valid for 8, 16 and 32 bit inputs.
1137  * Result is the low N bits, high bits returned in res_hi.
1138  *
1139  * Emits generic code.
1140  */
1141 LLVMValueRef
1142 lp_build_mul_32_lohi(struct lp_build_context *bld,
1143                      LLVMValueRef a,
1144                      LLVMValueRef b,
1145                      LLVMValueRef *res_hi)
1146 {
1147    struct gallivm_state *gallivm = bld->gallivm;
1148    LLVMBuilderRef builder = gallivm->builder;
1149    LLVMValueRef tmp, shift, res_lo;
1150    struct lp_type type_tmp;
1151    LLVMTypeRef wide_type, narrow_type;
1152 
1153    type_tmp = bld->type;
1154    narrow_type = lp_build_vec_type(gallivm, type_tmp);
1155    if (bld->type.width < 32)
1156       type_tmp.width = 32;
1157    else
1158       type_tmp.width *= 2;
1159    wide_type = lp_build_vec_type(gallivm, type_tmp);
1160    shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);
1161 
1162    if (bld->type.sign) {
1163       a = LLVMBuildSExt(builder, a, wide_type, "");
1164       b = LLVMBuildSExt(builder, b, wide_type, "");
1165    } else {
1166       a = LLVMBuildZExt(builder, a, wide_type, "");
1167       b = LLVMBuildZExt(builder, b, wide_type, "");
1168    }
1169    tmp = LLVMBuildMul(builder, a, b, "");
1170 
1171    res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1172 
1173    /* Since we truncate anyway, LShr and AShr are equivalent. */
1174    tmp = LLVMBuildLShr(builder, tmp, shift, "");
1175    *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1176 
1177    return res_lo;
1178 }
1179 
1180 
1181 /* a * b + c */
1182 LLVMValueRef
1183 lp_build_mad(struct lp_build_context *bld,
1184              LLVMValueRef a,
1185              LLVMValueRef b,
1186              LLVMValueRef c)
1187 {
1188    const struct lp_type type = bld->type;
1189    if (type.floating) {
1190       return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1191    } else {
1192       return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1193    }
1194 }
1195 
1196 
1197 /**
1198  * Optimized multiplication of a vector by a constant integer scale.
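 *
 * E.g. (a sketch of what gets emitted) multiplying an integer vector by 8
 * becomes a shift left by 3, and multiplying by -1 becomes a negate.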
1199  */
1200 LLVMValueRef
1201 lp_build_mul_imm(struct lp_build_context *bld,
1202                  LLVMValueRef a,
1203                  int b)
1204 {
1205    LLVMBuilderRef builder = bld->gallivm->builder;
1206    LLVMValueRef factor;
1207 
1208    assert(lp_check_value(bld->type, a));
1209 
1210    if (b == 0)
1211       return bld->zero;
1212 
1213    if (b == 1)
1214       return a;
1215 
1216    if (b == -1)
1217       return lp_build_negate(bld, a);
1218 
1219    if (b == 2 && bld->type.floating)
1220       return lp_build_add(bld, a, a);
1221 
1222    if (util_is_power_of_two_or_zero(b)) {
1223       unsigned shift = ffs(b) - 1;
1224 
1225       if (bld->type.floating) {
1226 #if 0
1227          /*
1228           * Power of two multiplication by directly manipulating the exponent.
1229           *
1230           * XXX: This might not be always faster, it will introduce a small
1231           * error for multiplication by zero, and it will produce wrong results
1232           * for Inf and NaN.
1233           */
1234          unsigned mantissa = lp_mantissa(bld->type);
1235          factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1236          a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1237          a = LLVMBuildAdd(builder, a, factor, "");
1238          a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1239          return a;
1240 #endif
1241       }
1242       else {
1243          factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1244          return LLVMBuildShl(builder, a, factor, "");
1245       }
1246    }
1247 
1248    factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1249    return lp_build_mul(bld, a, factor);
1250 }
1251 
1252 
1253 /**
1254  * Generate a / b
1255  */
1256 LLVMValueRef
1257 lp_build_div(struct lp_build_context *bld,
1258              LLVMValueRef a,
1259              LLVMValueRef b)
1260 {
1261    LLVMBuilderRef builder = bld->gallivm->builder;
1262    const struct lp_type type = bld->type;
1263 
1264    assert(lp_check_value(type, a));
1265    assert(lp_check_value(type, b));
1266 
1267    if (a == bld->zero)
1268       return bld->zero;
1269    if (a == bld->one && type.floating)
1270       return lp_build_rcp(bld, b);
1271    if (b == bld->zero)
1272       return bld->undef;
1273    if (b == bld->one)
1274       return a;
1275    if (a == bld->undef || b == bld->undef)
1276       return bld->undef;
1277 
1278    /* fast rcp is disabled (it just uses div), so it makes no sense to try that */
1279    if (false &&
1280       ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
1281        (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
1282       type.floating)
1283       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1284 
1285    if (type.floating)
1286       return LLVMBuildFDiv(builder, a, b, "");
1287    else if (type.sign)
1288       return LLVMBuildSDiv(builder, a, b, "");
1289    else
1290       return LLVMBuildUDiv(builder, a, b, "");
1291 }
1292 
1293 
1294 /**
1295  * Linear interpolation helper.
1296  *
1297  * @param flags  LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1298  *        interpolating normalized values encoded in integers twice as wide.
1299  *
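 * Computes res = v0 + x*(v1 - v0); in the wide-normalized integer cases the
 * weight x occupies the lower half of the bits, so the x*(v1 - v0) product is
 * scaled back down (roughly a shift by half the type width) before adding v0.
 *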
1300  * @sa http://www.stereopsis.com/doubleblend.html
1301  */
1302 static inline LLVMValueRef
1303 lp_build_lerp_simple(struct lp_build_context *bld,
1304                      LLVMValueRef x,
1305                      LLVMValueRef v0,
1306                      LLVMValueRef v1,
1307                      unsigned flags)
1308 {
1309    unsigned half_width = bld->type.width/2;
1310    LLVMBuilderRef builder = bld->gallivm->builder;
1311    LLVMValueRef delta;
1312    LLVMValueRef res;
1313 
1314    assert(lp_check_value(bld->type, x));
1315    assert(lp_check_value(bld->type, v0));
1316    assert(lp_check_value(bld->type, v1));
1317 
1318    delta = lp_build_sub(bld, v1, v0);
1319 
1320    if (bld->type.floating) {
1321       assert(flags == 0);
1322       return lp_build_mad(bld, x, delta, v0);
1323    }
1324 
1325    if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1326       if (!bld->type.sign) {
1327          if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1328             /*
1329              * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1330              * most significant bit to the least significant bit, so that
1331              * later we can just divide by 2**n instead of 2**n - 1.
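             * E.g. with 8-bit weights widened to 16 bits (half_width == 8),
             * x == 255 becomes 255 + (255 >> 7) == 256, and the later >> 8
             * is then an exact division by 256.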
1332              */
1333 
1334             x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1335          }
1336 
1337          /* (x * delta) >> n */
1338          /*
1339           * For this multiply, higher internal precision is required to pass
1340           * CTS, the most efficient path to that is pmulhrsw on ssse3 and
1341           * above.  This could be opencoded on other arches if conformance was
1342           * required.
1343           */
1344          if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) {
1345             res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1346             res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1347          } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) {
1348             res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1349             res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1350          } else {
1351             res = lp_build_mul(bld, x, delta);
1352             res = lp_build_shr_imm(bld, res, half_width);
1353          }
1354       } else {
1355          /*
1356           * The rescaling trick above doesn't work for signed numbers, so
1357           * use the 2**n - 1 division approximation in lp_build_mul_norm
1358           * instead.
1359           */
1360          assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1361          res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1362       }
1363    } else {
1364       assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1365       res = lp_build_mul(bld, x, delta);
1366    }
1367 
1368    if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1369       /*
1370        * At this point both res and v0 only use the lower half of the bits,
1371        * the rest is zero. Instead of add / mask, do add with half wide type.
1372        */
1373       struct lp_type narrow_type;
1374       struct lp_build_context narrow_bld;
1375 
1376       memset(&narrow_type, 0, sizeof narrow_type);
1377       narrow_type.sign   = bld->type.sign;
1378       narrow_type.width  = bld->type.width/2;
1379       narrow_type.length = bld->type.length*2;
1380 
1381       lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1382       res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1383       v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1384       res = lp_build_add(&narrow_bld, v0, res);
1385       res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1386    } else {
1387       res = lp_build_add(bld, v0, res);
1388 
1389       if (bld->type.fixed) {
1390          /*
1391           * We need to mask out the high order bits when lerping 8-bit
1392           * normalized colors stored in 16 bits.
1393           */
1394          /* XXX: This step is necessary for lerping 8-bit colors stored in
1395           * 16 bits, but it will be wrong for true fixed point use cases.
1396           * Basically we need a more powerful lp_type, capable of further
1397           * distinguishing the values interpretation from the value storage.
1398           */
1399          LLVMValueRef low_bits;
1400          low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1401          res = LLVMBuildAnd(builder, res, low_bits, "");
1402       }
1403    }
1404 
1405    return res;
1406 }
1407 
1408 
1409 /**
1410  * Linear interpolation.
1411  */
1412 LLVMValueRef
1413 lp_build_lerp(struct lp_build_context *bld,
1414               LLVMValueRef x,
1415               LLVMValueRef v0,
1416               LLVMValueRef v1,
1417               unsigned flags)
1418 {
1419    const struct lp_type type = bld->type;
1420    LLVMValueRef res;
1421 
1422    assert(lp_check_value(type, x));
1423    assert(lp_check_value(type, v0));
1424    assert(lp_check_value(type, v1));
1425 
1426    assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1427 
1428    if (type.norm) {
1429       struct lp_type wide_type;
1430       struct lp_build_context wide_bld;
1431       LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1432 
1433       assert(type.length >= 2);
1434 
1435       /*
1436        * Create a wider integer type, enough to hold the
1437        * intermediate result of the multiplication.
1438        */
1439       memset(&wide_type, 0, sizeof wide_type);
1440       wide_type.sign   = type.sign;
1441       wide_type.width  = type.width*2;
1442       wide_type.length = type.length/2;
1443 
1444       lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1445 
1446       lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1447       lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1448       lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1449 
1450       /*
1451        * Lerp both halves.
1452        */
1453 
1454       flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1455 
1456       resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1457       resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1458 
1459       res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1460    } else {
1461       res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1462    }
1463 
1464    return res;
1465 }
1466 
1467 
1468 /**
1469  * Bilinear interpolation.
1470  *
1471  * Value indices are in v_{yx}.
1472  */
1473 LLVMValueRef
1474 lp_build_lerp_2d(struct lp_build_context *bld,
1475                  LLVMValueRef x,
1476                  LLVMValueRef y,
1477                  LLVMValueRef v00,
1478                  LLVMValueRef v01,
1479                  LLVMValueRef v10,
1480                  LLVMValueRef v11,
1481                  unsigned flags)
1482 {
1483    LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1484    LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1485    return lp_build_lerp(bld, y, v0, v1, flags);
1486 }
1487 
1488 
1489 LLVMValueRef
1490 lp_build_lerp_3d(struct lp_build_context *bld,
1491                  LLVMValueRef x,
1492                  LLVMValueRef y,
1493                  LLVMValueRef z,
1494                  LLVMValueRef v000,
1495                  LLVMValueRef v001,
1496                  LLVMValueRef v010,
1497                  LLVMValueRef v011,
1498                  LLVMValueRef v100,
1499                  LLVMValueRef v101,
1500                  LLVMValueRef v110,
1501                  LLVMValueRef v111,
1502                  unsigned flags)
1503 {
1504    LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1505    LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1506    return lp_build_lerp(bld, z, v0, v1, flags);
1507 }
1508 
1509 
1510 /**
1511  * Generate min(a, b)
1512  * Do checks for special cases but not for NaNs.
1513  */
1514 LLVMValueRef
1515 lp_build_min(struct lp_build_context *bld,
1516              LLVMValueRef a,
1517              LLVMValueRef b)
1518 {
1519    assert(lp_check_value(bld->type, a));
1520    assert(lp_check_value(bld->type, b));
1521 
1522    if (a == bld->undef || b == bld->undef)
1523       return bld->undef;
1524 
1525    if (a == b)
1526       return a;
1527 
1528    if (bld->type.norm) {
1529       if (!bld->type.sign) {
1530          if (a == bld->zero || b == bld->zero) {
1531             return bld->zero;
1532          }
1533       }
1534       if (a == bld->one)
1535          return b;
1536       if (b == bld->one)
1537          return a;
1538    }
1539 
1540    return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1541 }
1542 
1543 
1544 /**
1545  * Generate min(a, b)
1546  * NaN's are handled according to the behavior specified by the
1547  * nan_behavior argument.
1548  */
1549 LLVMValueRef
1550 lp_build_min_ext(struct lp_build_context *bld,
1551                  LLVMValueRef a,
1552                  LLVMValueRef b,
1553                  enum gallivm_nan_behavior nan_behavior)
1554 {
1555    assert(lp_check_value(bld->type, a));
1556    assert(lp_check_value(bld->type, b));
1557 
1558    if (a == bld->undef || b == bld->undef)
1559       return bld->undef;
1560 
1561    if (a == b)
1562       return a;
1563 
1564    if (bld->type.norm) {
1565       if (!bld->type.sign) {
1566          if (a == bld->zero || b == bld->zero) {
1567             return bld->zero;
1568          }
1569       }
1570       if (a == bld->one)
1571          return b;
1572       if (b == bld->one)
1573          return a;
1574    }
1575 
1576    return lp_build_min_simple(bld, a, b, nan_behavior);
1577 }
1578 
1579 
1580 /**
1581  * Generate max(a, b)
1582  * Do checks for special cases, but NaN behavior is undefined.
1583  */
1584 LLVMValueRef
1585 lp_build_max(struct lp_build_context *bld,
1586              LLVMValueRef a,
1587              LLVMValueRef b)
1588 {
1589    assert(lp_check_value(bld->type, a));
1590    assert(lp_check_value(bld->type, b));
1591 
1592    if (a == bld->undef || b == bld->undef)
1593       return bld->undef;
1594 
1595    if (a == b)
1596       return a;
1597 
1598    if (bld->type.norm) {
1599       if (a == bld->one || b == bld->one)
1600          return bld->one;
1601       if (!bld->type.sign) {
1602          if (a == bld->zero) {
1603             return b;
1604          }
1605          if (b == bld->zero) {
1606             return a;
1607          }
1608       }
1609    }
1610 
1611    return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1612 }
1613 
1614 
1615 /**
1616  * Generate max(a, b)
1617  * Checks for special cases.
1618  * NaN's are handled according to the behavior specified by the
1619  * nan_behavior argument.
1620  */
1621 LLVMValueRef
1622 lp_build_max_ext(struct lp_build_context *bld,
1623                   LLVMValueRef a,
1624                   LLVMValueRef b,
1625                   enum gallivm_nan_behavior nan_behavior)
1626 {
1627    assert(lp_check_value(bld->type, a));
1628    assert(lp_check_value(bld->type, b));
1629 
1630    if (a == bld->undef || b == bld->undef)
1631       return bld->undef;
1632 
1633    if (a == b)
1634       return a;
1635 
1636    if (bld->type.norm) {
1637       if (a == bld->one || b == bld->one)
1638          return bld->one;
1639       if (!bld->type.sign) {
1640          if (a == bld->zero) {
1641             return b;
1642          }
1643          if (b == bld->zero) {
1644             return a;
1645          }
1646       }
1647    }
1648 
1649    return lp_build_max_simple(bld, a, b, nan_behavior);
1650 }
1651 
1652 
1653 /**
1654  * Generate clamp(a, min, max)
1655  * NaN behavior (for any of a, min, max) is undefined.
1656  * Do checks for special cases.
1657  */
1658 LLVMValueRef
1659 lp_build_clamp(struct lp_build_context *bld,
1660                LLVMValueRef a,
1661                LLVMValueRef min,
1662                LLVMValueRef max)
1663 {
1664    assert(lp_check_value(bld->type, a));
1665    assert(lp_check_value(bld->type, min));
1666    assert(lp_check_value(bld->type, max));
1667 
1668    a = lp_build_min(bld, a, max);
1669    a = lp_build_max(bld, a, min);
1670    return a;
1671 }
1672 
1673 
1674 /**
1675  * Generate clamp(a, 0, 1)
1676  * A NaN will get converted to zero.
1677  */
1678 LLVMValueRef
1679 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1680                                 LLVMValueRef a)
1681 {
1682    a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1683    a = lp_build_min(bld, a, bld->one);
1684    return a;
1685 }
1686 
1687 
1688 /**
1689  * Generate abs(a)
1690  */
1691 LLVMValueRef
1692 lp_build_abs(struct lp_build_context *bld,
1693              LLVMValueRef a)
1694 {
1695    LLVMBuilderRef builder = bld->gallivm->builder;
1696    const struct lp_type type = bld->type;
1697    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1698 
1699    assert(lp_check_value(type, a));
1700 
1701    if (!type.sign)
1702       return a;
1703 
1704    if (type.floating) {
1705       char intrinsic[32];
1706       lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1707       return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1708    }
1709 
1710    if (type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1711       switch(type.width) {
1712       case 8:
1713          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1714       case 16:
1715          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1716       case 32:
1717          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1718       }
1719    }
1720    else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
1721       switch(type.width) {
1722       case 8:
1723          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1724       case 16:
1725          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1726       case 32:
1727          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1728       }
1729    }
1730 
1731    return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1732                           a, LLVMBuildNeg(builder, a, ""));
1733 }
1734 
1735 
1736 LLVMValueRef
1737 lp_build_negate(struct lp_build_context *bld,
1738                 LLVMValueRef a)
1739 {
1740    LLVMBuilderRef builder = bld->gallivm->builder;
1741 
1742    assert(lp_check_value(bld->type, a));
1743 
1744    if (bld->type.floating)
1745       a = LLVMBuildFNeg(builder, a, "");
1746    else
1747       a = LLVMBuildNeg(builder, a, "");
1748 
1749    return a;
1750 }
1751 
1752 
1753 /** Return -1, 0 or +1 depending on the sign of a */
1754 LLVMValueRef
1755 lp_build_sgn(struct lp_build_context *bld,
1756              LLVMValueRef a)
1757 {
1758    LLVMBuilderRef builder = bld->gallivm->builder;
1759    const struct lp_type type = bld->type;
1760    LLVMValueRef cond;
1761    LLVMValueRef res;
1762 
1763    assert(lp_check_value(type, a));
1764 
1765    /* Handle non-zero case */
1766    if (!type.sign) {
1767       /* if not zero then sign must be positive */
1768       res = bld->one;
1769    }
1770    else if (type.floating) {
1771       LLVMTypeRef vec_type;
1772       LLVMTypeRef int_type;
1773       LLVMValueRef mask;
1774       LLVMValueRef sign;
1775       LLVMValueRef one;
1776       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1777 
1778       int_type = lp_build_int_vec_type(bld->gallivm, type);
1779       vec_type = lp_build_vec_type(bld->gallivm, type);
1780       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1781 
1782       /* Take the sign bit and OR it into the 1.0 constant */
1783       sign = LLVMBuildBitCast(builder, a, int_type, "");
1784       sign = LLVMBuildAnd(builder, sign, mask, "");
1785       one = LLVMConstBitCast(bld->one, int_type);
1786       res = LLVMBuildOr(builder, sign, one, "");
1787       res = LLVMBuildBitCast(builder, res, vec_type, "");
1788    }
1789    else
1790    {
1791       /* signed int/norm/fixed point */
1792       /* could use psign with ssse3 and appropriate vectors here */
1793       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1794       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1795       res = lp_build_select(bld, cond, bld->one, minus_one);
1796    }
1797 
1798    /* Handle zero */
1799    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1800    res = lp_build_select(bld, cond, bld->zero, res);
1801 
1802    return res;
1803 }
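
/*
 * Illustrative scalar equivalent of the floating-point path above (informal
 * sketch, not part of the build): OR the sign bit of 'a' into the bit
 * pattern of 1.0, giving +/-1.0, then patch the zero case with a select.
 *
 *    static inline float sgnf(float a)
 *    {
 *       union { float f; uint32_t u; } one = { 1.0f }, v = { a };
 *       one.u |= v.u & 0x80000000u;       // copy a's sign bit onto 1.0f
 *       return a == 0.0f ? 0.0f : one.f;  // zero (and -0.0) maps to 0.0
 *    }
 */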
1804 
1805 
1806 /**
1807  * Set the sign of float vector 'a' according to 'sign'.
1808  * If sign==0, return abs(a).
1809  * If sign==1, return -abs(a).
1810  * Other values for sign produce undefined results.
1811  */
1812 LLVMValueRef
1813 lp_build_set_sign(struct lp_build_context *bld,
1814                   LLVMValueRef a, LLVMValueRef sign)
1815 {
1816    LLVMBuilderRef builder = bld->gallivm->builder;
1817    const struct lp_type type = bld->type;
1818    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1819    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1820    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1821    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1822                              ~((unsigned long long) 1 << (type.width - 1)));
1823    LLVMValueRef val, res;
1824 
1825    assert(type.floating);
1826    assert(lp_check_value(type, a));
1827 
1828    /* val = reinterpret_cast<int>(a) */
1829    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1830    /* val = val & mask */
1831    val = LLVMBuildAnd(builder, val, mask, "");
1832    /* sign = sign << shift */
1833    sign = LLVMBuildShl(builder, sign, shift, "");
1834    /* res = val | sign */
1835    res = LLVMBuildOr(builder, val, sign, "");
1836    /* res = reinterpret_cast<float>(res) */
1837    res = LLVMBuildBitCast(builder, res, vec_type, "");
1838 
1839    return res;
1840 }
1841 
1842 
1843 /**
1844  * Convert vector of (or scalar) int to vector of (or scalar) float.
1845  */
1846 LLVMValueRef
1847 lp_build_int_to_float(struct lp_build_context *bld,
1848                       LLVMValueRef a)
1849 {
1850    LLVMBuilderRef builder = bld->gallivm->builder;
1851    const struct lp_type type = bld->type;
1852    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1853 
1854    assert(type.floating);
1855 
1856    return LLVMBuildSIToFP(builder, a, vec_type, "");
1857 }
1858 
1859 
1860 static bool
1861 arch_rounding_available(const struct lp_type type)
1862 {
1863    if ((util_get_cpu_caps()->has_sse4_1 &&
1864        (type.length == 1 || (LLVM_VERSION_MAJOR >= 8 && type.length == 2) ||
1865         type.width * type.length == 128)) ||
1866        (util_get_cpu_caps()->has_avx && type.width * type.length == 256) ||
1867        (util_get_cpu_caps()->has_avx512f && type.width * type.length == 512))
1868       return true;
1869    else if ((util_get_cpu_caps()->has_altivec &&
1870             (type.width == 32 && type.length == 4)))
1871       return true;
1872    else if (util_get_cpu_caps()->has_neon)
1873       return true;
1874    else if (util_get_cpu_caps()->family == CPU_S390X)
1875       return true;
1876 
1877    return false;
1878 }
1879 
1880 enum lp_build_round_mode
1881 {
1882    LP_BUILD_ROUND_NEAREST = 0,
1883    LP_BUILD_ROUND_FLOOR = 1,
1884    LP_BUILD_ROUND_CEIL = 2,
1885    LP_BUILD_ROUND_TRUNCATE = 3
1886 };
1887 
1888 
1889 static inline LLVMValueRef
1890 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1891                              LLVMValueRef a)
1892 {
1893    LLVMBuilderRef builder = bld->gallivm->builder;
1894    const struct lp_type type = bld->type;
1895    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1896    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1897    const char *intrinsic;
1898    LLVMValueRef res;
1899 
1900    assert(type.floating);
1901    /* using the double precision conversions is a bit more complicated */
1902    assert(type.width == 32);
1903 
1904    assert(lp_check_value(type, a));
1905    assert(util_get_cpu_caps()->has_sse2);
1906 
1907    /* This is relying on MXCSR rounding mode, which should always be nearest. */
1908    if (type.length == 1) {
1909       LLVMTypeRef vec_type;
1910       LLVMValueRef undef;
1911       LLVMValueRef arg;
1912       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1913 
1914       vec_type = LLVMVectorType(bld->elem_type, 4);
1915 
1916       intrinsic = "llvm.x86.sse.cvtss2si";
1917 
1918       undef = LLVMGetUndef(vec_type);
1919 
1920       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1921 
1922       res = lp_build_intrinsic_unary(builder, intrinsic,
1923                                      ret_type, arg);
1924    }
1925    else {
1926       if (type.width* type.length == 128) {
1927          intrinsic = "llvm.x86.sse2.cvtps2dq";
1928       }
1929       else {
1930          assert(type.width*type.length == 256);
1931          assert(util_get_cpu_caps()->has_avx);
1932 
1933          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1934       }
1935       res = lp_build_intrinsic_unary(builder, intrinsic,
1936                                      ret_type, a);
1937    }
1938 
1939    return res;
1940 }
1941 
1942 
1943 /* Round a float vector using the AltiVec vrfi* round-to-integral intrinsics. */
1944 
1945 static inline LLVMValueRef
1946 lp_build_round_altivec(struct lp_build_context *bld,
1947                        LLVMValueRef a,
1948                        enum lp_build_round_mode mode)
1949 {
1950    LLVMBuilderRef builder = bld->gallivm->builder;
1951    const struct lp_type type = bld->type;
1952    const char *intrinsic = NULL;
1953 
1954    assert(type.floating);
1955 
1956    assert(lp_check_value(type, a));
1957    assert(util_get_cpu_caps()->has_altivec);
1958 
1959    (void)type;
1960 
1961    switch (mode) {
1962    case LP_BUILD_ROUND_NEAREST:
1963       intrinsic = "llvm.ppc.altivec.vrfin";
1964       break;
1965    case LP_BUILD_ROUND_FLOOR:
1966       intrinsic = "llvm.ppc.altivec.vrfim";
1967       break;
1968    case LP_BUILD_ROUND_CEIL:
1969       intrinsic = "llvm.ppc.altivec.vrfip";
1970       break;
1971    case LP_BUILD_ROUND_TRUNCATE:
1972       intrinsic = "llvm.ppc.altivec.vrfiz";
1973       break;
1974    }
1975 
1976    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1977 }
1978 
1979 
1980 static inline LLVMValueRef
1981 lp_build_round_arch(struct lp_build_context *bld,
1982                     LLVMValueRef a,
1983                     enum lp_build_round_mode mode)
1984 {
1985    if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon ||
1986        util_get_cpu_caps()->family == CPU_S390X) {
1987       LLVMBuilderRef builder = bld->gallivm->builder;
1988       const struct lp_type type = bld->type;
1989       const char *intrinsic_root;
1990       char intrinsic[32];
1991 
1992       assert(type.floating);
1993       assert(lp_check_value(type, a));
1994       (void)type;
1995 
1996       switch (mode) {
1997       case LP_BUILD_ROUND_NEAREST:
1998          intrinsic_root = "llvm.nearbyint";
1999          break;
2000       case LP_BUILD_ROUND_FLOOR:
2001          intrinsic_root = "llvm.floor";
2002          break;
2003       case LP_BUILD_ROUND_CEIL:
2004          intrinsic_root = "llvm.ceil";
2005          break;
2006       case LP_BUILD_ROUND_TRUNCATE:
2007          intrinsic_root = "llvm.trunc";
2008          break;
2009       default:
2010          unreachable("unhandled lp_build_round_mode");
2011       }
2012 
2013       lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2014       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2015    }
2016    else /* (util_get_cpu_caps()->has_altivec) */
2017      return lp_build_round_altivec(bld, a, mode);
2018 }
2019 
2020 
2021 /**
2022  * Return the integer part of a float (vector) value (== round toward zero).
2023  * The returned value is a float (vector).
2024  * Ex: trunc(-1.5) = -1.0
2025  */
2026 LLVMValueRef
2027 lp_build_trunc(struct lp_build_context *bld,
2028                LLVMValueRef a)
2029 {
2030    LLVMBuilderRef builder = bld->gallivm->builder;
2031    const struct lp_type type = bld->type;
2032 
2033    assert(type.floating);
2034    assert(lp_check_value(type, a));
2035 
2036    if (type.width == 16) {
2037       char intrinsic[64];
2038       lp_format_intrinsic(intrinsic, 64, "llvm.trunc", bld->vec_type);
2039       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2040    }
2041 
2042    if (arch_rounding_available(type)) {
2043       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2044    }
2045    else {
2046       const struct lp_type type = bld->type;
2047       struct lp_type inttype;
2048       struct lp_build_context intbld;
2049       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2050       LLVMValueRef trunc, res, anosign, mask;
2051       LLVMTypeRef int_vec_type = bld->int_vec_type;
2052       LLVMTypeRef vec_type = bld->vec_type;
2053 
2054       inttype = type;
2055       inttype.floating = 0;
2056       lp_build_context_init(&intbld, bld->gallivm, inttype);
2057 
2058       /* round by truncation */
2059       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2060       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2061 
2062       if (type.signed_zero_preserve) {
2063          char intrinsic[64];
2064          lp_format_intrinsic(intrinsic, 64, "llvm.copysign", bld->vec_type);
2065          res = lp_build_intrinsic_binary(builder, intrinsic, vec_type, res, a);
2066       }
2067 
2068       /* mask out sign bit */
2069       anosign = lp_build_abs(bld, a);
2070       /*
2071        * mask out all values if anosign > 2^24
2072        * This should work both for large ints (all rounding is no-op for them
2073        * because such floats are always exact) as well as special cases like
2074        * NaNs, Infs (taking advantage of the fact they use max exponent).
2075        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2076        */
2077       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2078       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2079       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2080       return lp_build_select(bld, mask, a, res);
2081    }
2082 }
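
/*
 * Illustrative scalar model of the fallback path above (informal sketch,
 * not part of the build):
 *
 *    static inline float trunc_emul(float a)
 *    {
 *       if (!(fabsf(a) < 16777216.0f))   // 2^24: large values, Inf and NaN pass through
 *          return a;
 *       return (float)(int)a;            // round toward zero via int conversion
 *    }
 *
 * The vector code does the |a| > 2^24 compare on the integer bit patterns,
 * which orders correctly for non-negative floats and also catches NaN/Inf.
 */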
2083 
2084 
2085 /**
2086  * Return float (vector) rounded to nearest integer (vector).  The returned
2087  * value is a float (vector).
2088  * Ex: round(0.9) = 1.0
2089  * Ex: round(-1.5) = -2.0
2090  */
2091 LLVMValueRef
2092 lp_build_round(struct lp_build_context *bld,
2093                LLVMValueRef a)
2094 {
2095    LLVMBuilderRef builder = bld->gallivm->builder;
2096    const struct lp_type type = bld->type;
2097 
2098    assert(type.floating);
2099    assert(lp_check_value(type, a));
2100 
2101    if (type.width == 16) {
2102       char intrinsic[64];
2103       lp_format_intrinsic(intrinsic, 64, "llvm.round", bld->vec_type);
2104       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2105    }
2106 
2107    if (arch_rounding_available(type)) {
2108       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2109    }
2110    else {
2111       const struct lp_type type = bld->type;
2112       struct lp_type inttype;
2113       struct lp_build_context intbld;
2114       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2115       LLVMValueRef res, anosign, mask;
2116       LLVMTypeRef int_vec_type = bld->int_vec_type;
2117       LLVMTypeRef vec_type = bld->vec_type;
2118 
2119       inttype = type;
2120       inttype.floating = 0;
2121       lp_build_context_init(&intbld, bld->gallivm, inttype);
2122 
2123       res = lp_build_iround(bld, a);
2124       res = LLVMBuildSIToFP(builder, res, vec_type, "");
2125 
2126       if (type.signed_zero_preserve) {
2127          LLVMValueRef sign_mask =
2128             lp_build_const_int_vec(bld->gallivm, type, 1llu << (type.width - 1));
2129          LLVMValueRef a_sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2130          a_sign = LLVMBuildAnd(builder, a_sign, sign_mask, "");
2131 
2132          res = LLVMBuildBitCast(builder, res, int_vec_type, "");
2133          res = LLVMBuildOr(builder, res, a_sign, "");
2134          res = LLVMBuildBitCast(builder, res, vec_type, "");
2135       }
2136 
2137       /* mask out sign bit */
2138       anosign = lp_build_abs(bld, a);
2139       /*
2140        * mask out all values if anosign > 2^24
2141        * This should work both for large ints (all rounding is no-op for them
2142        * because such floats are always exact) as well as special cases like
2143        * NaNs, Infs (taking advantage of the fact they use max exponent).
2144        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2145        */
2146       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2147       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2148       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2149       return lp_build_select(bld, mask, a, res);
2150    }
2151 }
2152 
2153 
2154 /**
2155  * Return floor of float (vector), result is a float (vector)
2156  * Ex: floor(1.1) = 1.0
2157  * Ex: floor(-1.1) = -2.0
2158  */
2159 LLVMValueRef
2160 lp_build_floor(struct lp_build_context *bld,
2161                LLVMValueRef a)
2162 {
2163    LLVMBuilderRef builder = bld->gallivm->builder;
2164    const struct lp_type type = bld->type;
2165 
2166    assert(type.floating);
2167    assert(lp_check_value(type, a));
2168 
2169    if (arch_rounding_available(type)) {
2170       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2171    }
2172    else {
2173       const struct lp_type type = bld->type;
2174       struct lp_type inttype;
2175       struct lp_build_context intbld;
2176       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2177       LLVMValueRef trunc, res, anosign, mask;
2178       LLVMTypeRef int_vec_type = bld->int_vec_type;
2179       LLVMTypeRef vec_type = bld->vec_type;
2180 
2181       if (type.width != 32) {
2182          char intrinsic[32];
2183          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2184          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2185       }
2186 
2187       assert(type.width == 32); /* might want to handle doubles at some point */
2188 
2189       inttype = type;
2190       inttype.floating = 0;
2191       lp_build_context_init(&intbld, bld->gallivm, inttype);
2192 
2193       /* round by truncation */
2194       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2195       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2196 
2197       if (type.sign) {
2198          LLVMValueRef tmp;
2199 
2200          /*
2201           * fix values if rounding is wrong (for non-special cases)
2202           * - this is the case if trunc > a
2203           */
2204          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2205          /* tmp = trunc > a ? 1.0 : 0.0 */
2206          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2207          tmp = lp_build_and(&intbld, mask, tmp);
2208          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2209          res = lp_build_sub(bld, res, tmp);
2210       }
2211 
2212       /* mask out sign bit */
2213       anosign = lp_build_abs(bld, a);
2214       /*
2215        * mask out all values if anosign > 2^24
2216        * This should work both for large ints (all rounding is no-op for them
2217        * because such floats are always exact) as well as special cases like
2218        * NaNs, Infs (taking advantage of the fact they use max exponent).
2219        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2220        */
2221       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2222       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2223       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2224       return lp_build_select(bld, mask, a, res);
2225    }
2226 }
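
/*
 * Illustrative scalar model of the fallback fixup above (informal sketch,
 * not part of the build): truncate, then subtract one whenever truncation
 * rounded up, i.e. for negative non-integers.
 *
 *    static inline float floor_emul(float a)
 *    {
 *       if (!(fabsf(a) < 16777216.0f))   // 2^24: large values, Inf and NaN pass through
 *          return a;
 *       float r = (float)(int)a;         // round toward zero
 *       return r > a ? r - 1.0f : r;     // the vector code does this with a compare mask
 *    }
 */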
2227 
2228 
2229 /**
2230  * Return ceiling of float (vector), returning float (vector).
2231  * Ex: ceil( 1.1) = 2.0
2232  * Ex: ceil(-1.1) = -1.0
2233  */
2234 LLVMValueRef
2235 lp_build_ceil(struct lp_build_context *bld,
2236               LLVMValueRef a)
2237 {
2238    LLVMBuilderRef builder = bld->gallivm->builder;
2239    const struct lp_type type = bld->type;
2240 
2241    assert(type.floating);
2242    assert(lp_check_value(type, a));
2243 
2244    if (arch_rounding_available(type)) {
2245       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2246    }
2247    else {
2248       const struct lp_type type = bld->type;
2249       struct lp_type inttype;
2250       struct lp_build_context intbld;
2251       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2252       LLVMValueRef trunc, res, anosign, mask, tmp;
2253       LLVMTypeRef int_vec_type = bld->int_vec_type;
2254       LLVMTypeRef vec_type = bld->vec_type;
2255 
2256       if (type.width != 32) {
2257          char intrinsic[32];
2258          lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2259          return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2260       }
2261 
2262       assert(type.width == 32); /* might want to handle doubles at some point */
2263 
2264       inttype = type;
2265       inttype.floating = 0;
2266       lp_build_context_init(&intbld, bld->gallivm, inttype);
2267 
2268       /* round by truncation */
2269       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2270       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2271 
2272       /*
2273        * fix values if rounding is wrong (for non-special cases)
2274        * - this is the case if trunc < a
2275        */
2276       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2277       /* tmp = trunc < a ? 1.0 : 0.0 */
2278       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2279       tmp = lp_build_and(&intbld, mask, tmp);
2280       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2281       res = lp_build_add(bld, trunc, tmp);
2282 
2283       /* mask out sign bit */
2284       anosign = lp_build_abs(bld, a);
2285       /*
2286        * mask out all values if anosign > 2^24
2287        * This should work both for large ints (all rounding is no-op for them
2288        * because such floats are always exact) as well as special cases like
2289        * NaNs, Infs (taking advantage of the fact they use max exponent).
2290        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2291        */
2292       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2293       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2294       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2295       return lp_build_select(bld, mask, a, res);
2296    }
2297 }
2298 
2299 
2300 /**
2301  * Return fractional part of 'a' computed as a - floor(a)
2302  * Typically used in texture coord arithmetic.
2303  */
2304 LLVMValueRef
2305 lp_build_fract(struct lp_build_context *bld,
2306                LLVMValueRef a)
2307 {
2308    assert(bld->type.floating);
2309    return lp_build_sub(bld, a, lp_build_floor(bld, a));
2310 }
2311 
2312 
2313 /**
2314  * Prevent returning 1.0 for very small negative values of 'a' by clamping
2315  * against 0.99999(9). (Will also return that value for NaNs.)
2316  */
2317 static inline LLVMValueRef
2318 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2319 {
2320    LLVMValueRef max;
2321 
2322    /* this is the largest number smaller than 1.0 representable as float */
2323    max = lp_build_const_vec(bld->gallivm, bld->type,
2324                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2325    return lp_build_min_ext(bld, fract, max,
2326                            GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2327 }
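
/*
 * Note (informal): for 32-bit floats the constant above evaluates to
 * 1.0 - 2^-24 = 0.99999994f, i.e. nextafterf(1.0f, 0.0f), the largest float
 * strictly less than 1.0; the expression just derives it generically from
 * the mantissa width of the type.
 */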
2328 
2329 
2330 /**
2331  * Same as lp_build_fract, but guarantees that the result is always smaller
2332  * than one. Will also return the smaller-than-one value for infs, NaNs.
2333  */
2334 LLVMValueRef
2335 lp_build_fract_safe(struct lp_build_context *bld,
2336                     LLVMValueRef a)
2337 {
2338    return clamp_fract(bld, lp_build_fract(bld, a));
2339 }
2340 
2341 
2342 /**
2343  * Return the integer part of a float (vector) value (== round toward zero).
2344  * The returned value is an integer (vector).
2345  * Ex: itrunc(-1.5) = -1
2346  */
2347 LLVMValueRef
2348 lp_build_itrunc(struct lp_build_context *bld,
2349                 LLVMValueRef a)
2350 {
2351    LLVMBuilderRef builder = bld->gallivm->builder;
2352    const struct lp_type type = bld->type;
2353    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2354 
2355    assert(type.floating);
2356    assert(lp_check_value(type, a));
2357 
2358    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2359 }
2360 
2361 
2362 /**
2363  * Return float (vector) rounded to nearest integer (vector).  The returned
2364  * value is an integer (vector).
2365  * Ex: iround(0.9) = 1
2366  * Ex: iround(-1.5) = -2
2367  */
2368 LLVMValueRef
2369 lp_build_iround(struct lp_build_context *bld,
2370                 LLVMValueRef a)
2371 {
2372    LLVMBuilderRef builder = bld->gallivm->builder;
2373    const struct lp_type type = bld->type;
2374    LLVMTypeRef int_vec_type = bld->int_vec_type;
2375    LLVMValueRef res;
2376 
2377    assert(type.floating);
2378 
2379    assert(lp_check_value(type, a));
2380 
2381    if ((util_get_cpu_caps()->has_sse2 &&
2382        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2383        (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2384       return lp_build_iround_nearest_sse2(bld, a);
2385    }
2386    if (arch_rounding_available(type)) {
2387       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2388    }
2389    else {
2390       LLVMValueRef half;
2391 
2392       half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2393 
2394       if (type.sign) {
2395          LLVMTypeRef vec_type = bld->vec_type;
2396          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2397                                     (unsigned long long)1 << (type.width - 1));
2398          LLVMValueRef sign;
2399 
2400          /* get sign bit */
2401          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2402          sign = LLVMBuildAnd(builder, sign, mask, "");
2403 
2404          /* sign * 0.5 */
2405          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2406          half = LLVMBuildOr(builder, sign, half, "");
2407          half = LLVMBuildBitCast(builder, half, vec_type, "");
2408       }
2409 
2410       res = LLVMBuildFAdd(builder, a, half, "");
2411    }
2412 
2413    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2414 
2415    return res;
2416 }
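
/*
 * Illustrative scalar model of the non-SSE2 fallback above (informal sketch,
 * not part of the build): add a bias just below 0.5, with the sign of 'a'
 * copied onto it, then truncate.
 *
 *    static inline int iround_emul(float a)
 *    {
 *       float half = nextafterf(0.5f, 0.0f);   // largest float below 0.5
 *       return (int)(a + copysignf(half, a));
 *    }
 *
 * Using exactly 0.5 would be wrong for inputs like 0.49999997f, where the
 * addition would round up to 1.0 before the truncation.
 */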
2417 
2418 
2419 /**
2420  * Return floor of float (vector), result is an int (vector)
2421  * Ex: ifloor(1.1) = 1
2422  * Ex: ifloor(-1.1) = -2
2423  */
2424 LLVMValueRef
2425 lp_build_ifloor(struct lp_build_context *bld,
2426                 LLVMValueRef a)
2427 {
2428    LLVMBuilderRef builder = bld->gallivm->builder;
2429    const struct lp_type type = bld->type;
2430    LLVMTypeRef int_vec_type = bld->int_vec_type;
2431    LLVMValueRef res;
2432 
2433    assert(type.floating);
2434    assert(lp_check_value(type, a));
2435 
2436    res = a;
2437    if (type.sign) {
2438       if (arch_rounding_available(type)) {
2439          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2440       }
2441       else {
2442          struct lp_type inttype;
2443          struct lp_build_context intbld;
2444          LLVMValueRef trunc, itrunc, mask;
2445 
2446          assert(type.floating);
2447          assert(lp_check_value(type, a));
2448 
2449          inttype = type;
2450          inttype.floating = 0;
2451          lp_build_context_init(&intbld, bld->gallivm, inttype);
2452 
2453          /* round by truncation */
2454          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2455          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2456 
2457          /*
2458           * fix values if rounding is wrong (for non-special cases)
2459           * - this is the case if trunc > a
2460           * The results of doing this with NaNs, very large values etc.
2461           * are undefined but this seems to be the case anyway.
2462           */
2463          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2464          /* cheapie minus one with mask since the mask is minus one / zero */
2465          return lp_build_add(&intbld, itrunc, mask);
2466       }
2467    }
2468 
2469    /* convert to int, truncating toward zero */
2470    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2471 
2472    return res;
2473 }
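
/*
 * Note on the mask trick above (informal): lp_build_cmp yields ~0 (i.e. -1)
 * per lane when true, so adding the mask subtracts one exactly in the lanes
 * where truncation rounded the wrong way. Scalar sketch, not part of the
 * build:
 *
 *    static inline int ifloor_emul(float a)
 *    {
 *       int   i = (int)a;            // truncate toward zero
 *       float t = (float)i;
 *       return i + (t > a ? -1 : 0); // -1 plays the role of the all-ones mask
 *    }
 */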
2474 
2475 
2476 /**
2477  * Return ceiling of float (vector), returning int (vector).
2478  * Ex: iceil( 1.1) = 2
2479  * Ex: iceil(-1.1) = -1
2480  */
2481 LLVMValueRef
2482 lp_build_iceil(struct lp_build_context *bld,
2483                LLVMValueRef a)
2484 {
2485    LLVMBuilderRef builder = bld->gallivm->builder;
2486    const struct lp_type type = bld->type;
2487    LLVMTypeRef int_vec_type = bld->int_vec_type;
2488    LLVMValueRef res;
2489 
2490    assert(type.floating);
2491    assert(lp_check_value(type, a));
2492 
2493    if (arch_rounding_available(type)) {
2494       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2495    }
2496    else {
2497       struct lp_type inttype;
2498       struct lp_build_context intbld;
2499       LLVMValueRef trunc, itrunc, mask;
2500 
2501       assert(type.floating);
2502       assert(lp_check_value(type, a));
2503 
2504       inttype = type;
2505       inttype.floating = 0;
2506       lp_build_context_init(&intbld, bld->gallivm, inttype);
2507 
2508       /* round by truncation */
2509       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2510       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2511 
2512       /*
2513        * fix values if rounding is wrong (for non-special cases)
2514        * - this is the case if trunc < a
2515        * The results of doing this with NaNs, very large values etc.
2516        * are undefined but this seems to be the case anyway.
2517        */
2518       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2519       /* cheapie plus one with mask since the mask is minus one / zero */
2520       return lp_build_sub(&intbld, itrunc, mask);
2521    }
2522 
2523    /* convert to int, truncating toward zero */
2524    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2525 
2526    return res;
2527 }
2528 
2529 
2530 /**
2531  * Combined ifloor() & fract().
2532  *
2533  * Preferred to calling the functions separately, as it will ensure that the
2534  * strategy (floor() vs ifloor()) that results in less redundant work is used.
2535  */
2536 void
2537 lp_build_ifloor_fract(struct lp_build_context *bld,
2538                       LLVMValueRef a,
2539                       LLVMValueRef *out_ipart,
2540                       LLVMValueRef *out_fpart)
2541 {
2542    LLVMBuilderRef builder = bld->gallivm->builder;
2543    const struct lp_type type = bld->type;
2544    LLVMValueRef ipart;
2545 
2546    assert(type.floating);
2547    assert(lp_check_value(type, a));
2548 
2549    if (arch_rounding_available(type)) {
2550       /*
2551        * floor() is easier.
2552        */
2553 
2554       ipart = lp_build_floor(bld, a);
2555       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2556       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2557    }
2558    else {
2559       /*
2560        * ifloor() is easier.
2561        */
2562 
2563       *out_ipart = lp_build_ifloor(bld, a);
2564       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2565       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2566    }
2567 }
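
/*
 * Typical usage sketch (hypothetical variable names, not from this file):
 * splitting a texture coordinate into an integer texel index and an
 * interpolation weight.
 *
 *    LLVMValueRef ipart, fpart;
 *    lp_build_ifloor_fract(&coord_bld, s_coord, &ipart, &fpart);
 *    // ipart feeds the texel fetch, fpart becomes the lerp weight
 */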
2568 
2569 
2570 /**
2571  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2572  * always smaller than one.
2573  */
2574 void
2575 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2576                            LLVMValueRef a,
2577                            LLVMValueRef *out_ipart,
2578                            LLVMValueRef *out_fpart)
2579 {
2580    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2581    *out_fpart = clamp_fract(bld, *out_fpart);
2582 }
2583 
2584 
2585 LLVMValueRef
2586 lp_build_sqrt(struct lp_build_context *bld,
2587               LLVMValueRef a)
2588 {
2589    LLVMBuilderRef builder = bld->gallivm->builder;
2590    const struct lp_type type = bld->type;
2591    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2592    char intrinsic[32];
2593 
2594    assert(lp_check_value(type, a));
2595 
2596    assert(type.floating);
2597    lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2598 
2599    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2600 }
2601 
2602 
2603 /**
2604  * Do one Newton-Raphson step to improve the reciprocal's precision:
2605  *
2606  *   x_{i+1} = x_i + x_i * (1 - a * x_i)
2607  *
2608  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2609  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
2610  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2611  * halo. It would be necessary to clamp the argument to prevent this.
2612  *
2613  * See also:
2614  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2615  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2616  */
2617 static inline LLVMValueRef
2618 lp_build_rcp_refine(struct lp_build_context *bld,
2619                     LLVMValueRef a,
2620                     LLVMValueRef rcp_a)
2621 {
2622    LLVMBuilderRef builder = bld->gallivm->builder;
2623    LLVMValueRef neg_a;
2624    LLVMValueRef res;
2625 
2626    neg_a = LLVMBuildFNeg(builder, a, "");
2627    res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2628    res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2629 
2630    return res;
2631 }
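
/*
 * Derivation note (informal): for f(x) = 1/x - a, Newton-Raphson gives
 * x_{i+1} = x_i * (2 - a * x_i) = x_i + x_i * (1 - a * x_i), which is exactly
 * the two fused multiply-adds above: e = fma(-a, x_i, 1) followed by
 * x_{i+1} = fma(e, x_i, x_i).
 */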
2632 
2633 
2634 LLVMValueRef
2635 lp_build_rcp(struct lp_build_context *bld,
2636              LLVMValueRef a)
2637 {
2638    LLVMBuilderRef builder = bld->gallivm->builder;
2639    const struct lp_type type = bld->type;
2640 
2641    assert(lp_check_value(type, a));
2642 
2643    if (a == bld->zero)
2644       return bld->undef;
2645    if (a == bld->one)
2646       return bld->one;
2647    if (a == bld->undef)
2648       return bld->undef;
2649 
2650    assert(type.floating);
2651 
2652    if (LLVMIsConstant(a))
2653       return LLVMBuildFDiv(builder, bld->one, a, "");
2654 
2655    /*
2656     * We don't use RCPPS because:
2657     * - it only has 10 bits of precision
2658     * - it doesn't even get the reciprocal of 1.0 exactly
2659     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2660     * - for recent processors the benefit over DIVPS is marginal and case
2661     *   dependent
2662     *
2663     * We could still use it on certain processors if benchmarks show that the
2664     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2665     * particular uses that require fewer workarounds.
2666     */
2667 
2668    if (false && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2669          (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
2670       const unsigned num_iterations = 0;
2671       LLVMValueRef res;
2672       unsigned i;
2673       const char *intrinsic = NULL;
2674 
2675       if (type.length == 4) {
2676          intrinsic = "llvm.x86.sse.rcp.ps";
2677       }
2678       else {
2679          intrinsic = "llvm.x86.avx.rcp.ps.256";
2680       }
2681 
2682       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2683 
2684       for (i = 0; i < num_iterations; ++i) {
2685          res = lp_build_rcp_refine(bld, a, res);
2686       }
2687 
2688       return res;
2689    }
2690 
2691    return LLVMBuildFDiv(builder, bld->one, a, "");
2692 }
2693 
2694 
2695 /**
2696  * Do one Newton-Raphson step to improve rsqrt precision:
2697  *
2698  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2699  *
2700  * See also Intel 64 and IA-32 Architectures Optimization Manual.
2701  */
2702 static inline LLVMValueRef
2703 lp_build_rsqrt_refine(struct lp_build_context *bld,
2704                       LLVMValueRef a,
2705                       LLVMValueRef rsqrt_a)
2706 {
2707    LLVMBuilderRef builder = bld->gallivm->builder;
2708    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2709    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2710    LLVMValueRef res;
2711 
2712    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2713    res = LLVMBuildFMul(builder, a, res, "");
2714    res = LLVMBuildFSub(builder, three, res, "");
2715    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2716    res = LLVMBuildFMul(builder, half, res, "");
2717 
2718    return res;
2719 }
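
/*
 * Derivation note (informal): applying Newton-Raphson to f(x) = 1/x^2 - a
 * yields x_{i+1} = 0.5 * x_i * (3 - a * x_i * x_i), i.e. the mul/sub/mul
 * sequence above; each step roughly doubles the number of correct bits of
 * the initial ~12-bit RSQRTPS estimate.
 */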
2720 
2721 
2722 /**
2723  * Generate 1/sqrt(a).
2724  * Result is undefined for values < 0, infinity for +0.
2725  */
2726 LLVMValueRef
2727 lp_build_rsqrt(struct lp_build_context *bld,
2728                LLVMValueRef a)
2729 {
2730    const struct lp_type type = bld->type;
2731 
2732    assert(lp_check_value(type, a));
2733 
2734    assert(type.floating);
2735 
2736    /*
2737     * This should be faster but all denormals will end up as infinity.
2738     */
2739    if (0 && lp_build_fast_rsqrt_available(type)) {
2740       const unsigned num_iterations = 1;
2741       LLVMValueRef res;
2742       unsigned i;
2743 
2744       /* rsqrt(1.0) != 1.0 here */
2745       res = lp_build_fast_rsqrt(bld, a);
2746 
2747       if (num_iterations) {
2748          /*
2749           * Newton-Raphson will result in NaN instead of infinity for zero,
2750           * and NaN instead of zero for infinity.
2751           * Also, need to ensure rsqrt(1.0) == 1.0.
2752           * All numbers smaller than FLT_MIN will result in +infinity
2753           * (rsqrtps treats all denormals as zero).
2754           */
2755          LLVMValueRef cmp;
2756          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2757          LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2758 
2759          for (i = 0; i < num_iterations; ++i) {
2760             res = lp_build_rsqrt_refine(bld, a, res);
2761          }
2762          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2763          res = lp_build_select(bld, cmp, inf, res);
2764          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2765          res = lp_build_select(bld, cmp, bld->zero, res);
2766          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2767          res = lp_build_select(bld, cmp, bld->one, res);
2768       }
2769 
2770       return res;
2771    }
2772 
2773    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2774 }
2775 
2776 
2777 /**
2778  * If there's a fast (inaccurate) rsqrt instruction available
2779  * (caller may want to avoid to call rsqrt_fast if it's not available,
2780  * i.e. for calculating x^0.5 it may do rsqrt_fast(x) * x but if
2781  * unavailable it would result in sqrt/div/mul so obviously
2782  * much better to just call sqrt, skipping both div and mul).
2783  */
2784 bool
2785 lp_build_fast_rsqrt_available(struct lp_type type)
2786 {
2787    assert(type.floating);
2788 
2789    if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2790        (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2791       return true;
2792    }
2793    return false;
2794 }
2795 
2796 
2797 /**
2798  * Generate 1/sqrt(a).
2799  * Result is undefined for values < 0, infinity for +0.
2800  * Precision is limited, only ~10 bits guaranteed
2801  * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2802  */
2803 LLVMValueRef
2804 lp_build_fast_rsqrt(struct lp_build_context *bld,
2805                     LLVMValueRef a)
2806 {
2807    LLVMBuilderRef builder = bld->gallivm->builder;
2808    const struct lp_type type = bld->type;
2809 
2810    assert(lp_check_value(type, a));
2811 
2812    if (lp_build_fast_rsqrt_available(type)) {
2813       const char *intrinsic = NULL;
2814 
2815       if (type.length == 4) {
2816          intrinsic = "llvm.x86.sse.rsqrt.ps";
2817       }
2818       else {
2819          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2820       }
2821       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2822    }
2823    else {
2824       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __func__);
2825    }
2826    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2827 }
2828 
2829 
2830 /**
2831  * Generate sin(a) or cos(a) using polynomial approximation.
2832  * TODO: it might be worth recognizing sin and cos using the same source
2833  * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2834  * would be way cheaper than calculating (nearly) everything twice...
2835  * Not sure it's common enough to be worth bothering, however; the scs
2836  * opcode could also benefit from calculating both, though.
2837  */
2838 static LLVMValueRef
2839 lp_build_sin_or_cos(struct lp_build_context *bld,
2840                     LLVMValueRef a,
2841                     bool cos)
2842 {
2843    struct gallivm_state *gallivm = bld->gallivm;
2844    LLVMBuilderRef b = gallivm->builder;
2845    struct lp_type int_type = lp_int_type(bld->type);
2846 
2847    /*
2848     *  take the absolute value,
2849     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2850     */
2851 
2852    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2853    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2854 
2855    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2856    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2857 
2858    /*
2859     * scale by 4/Pi
2860     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2861     */
2862 
2863    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2864    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2865 
2866    /*
2867     * store the integer part of y in mm0
2868     * emm2 = _mm_cvttps_epi32(y);
2869     */
2870 
2871    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2872 
2873    /*
2874     * j=(j+1) & (~1) (see the cephes sources)
2875     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2876     */
2877 
2878    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2879    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2880    /*
2881     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2882     */
2883    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2884    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2885 
2886    /*
2887     * y = _mm_cvtepi32_ps(emm2);
2888     */
2889    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2890 
2891    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2892    LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2893    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2894    LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2895 
2896    /*
2897     * Argument used for poly selection and sign bit determination
2898     * is different for sin vs. cos.
2899     */
2900    LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2901                                emm2_and;
2902 
2903    LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2904                                                               LLVMBuildNot(b, emm2_2, ""), ""),
2905                                               const_29, "sign_bit") :
2906                                  LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2907                                                               LLVMBuildShl(b, emm2_add,
2908                                                                            const_29, ""), ""),
2909                                               sign_mask, "sign_bit");
2910 
2911    /*
2912     * get the polynomial selection mask
2913     * there is one polynomial for 0 <= x <= Pi/4
2914     * and another one for Pi/4 < x <= Pi/2
2915     * Both branches will be computed.
2916     *
2917     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2918     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2919     */
2920 
2921    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2922    LLVMValueRef poly_mask = lp_build_compare(gallivm,
2923                                              int_type, PIPE_FUNC_EQUAL,
2924                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2925 
2926    /*
2927     * _PS_CONST(minus_cephes_DP1, -0.78515625);
2928     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2929     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2930     */
2931    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2932    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2933    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2934 
2935    /*
2936     * The magic pass: "Extended precision modular arithmetic"
2937     * x = ((x - y * DP1) - y * DP2) - y * DP3;
2938     */
2939    LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2940    LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2941    LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2942 
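   /*
    * Scalar sketch of the reduction above (a reference, not the generated
    * IR): Pi/4 is split into three progressively smaller pieces
    * (DP1 + DP2 + DP3 ~= Pi/4) so each fused multiply-add stays nearly
    * exact for moderately large |x|.  Here y stands for the rounded
    * quotient held in y_2:
    *
    *    static float reduce_ref(float x, float y)
    *    {
    *       x = x + y * -0.78515625f;                // y * DP1
    *       x = x + y * -2.4187564849853515625e-4f;  // y * DP2
    *       x = x + y * -3.77489497744594108e-8f;    // y * DP3
    *       return x;
    *    }
    */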
2943    /*
2944     * Evaluate the first polynomial (0 <= x <= Pi/4)
2945     *
2946     * z = _mm_mul_ps(x,x);
2947     */
2948    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2949 
2950    /*
2951     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2952     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2953     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2954     */
2955    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2956    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2957    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2958 
2959    /*
2960     * y = *(v4sf*)_ps_coscof_p0;
2961     * y = _mm_mul_ps(y, z);
2962     */
2963    LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2964    LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2965    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2966    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2967 
2968 
2969    /*
2970     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2971     * y = _mm_sub_ps(y, tmp);
2972     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2973     */
2974    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2975    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2976    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
2977    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2978    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2979 
2980    /*
2981     * _PS_CONST(sincof_p0, -1.9515295891E-4);
2982     * _PS_CONST(sincof_p1,  8.3321608736E-3);
2983     * _PS_CONST(sincof_p2, -1.6666654611E-1);
2984     */
2985    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2986    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2987    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2988 
2989    /*
2990     * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2991     *
2992     * y2 = *(v4sf*)_ps_sincof_p0;
2993     * y2 = _mm_mul_ps(y2, z);
2994     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2995     * y2 = _mm_mul_ps(y2, z);
2996     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2997     * y2 = _mm_mul_ps(y2, z);
2998     * y2 = _mm_mul_ps(y2, x);
2999     * y2 = _mm_add_ps(y2, x);
3000     */
3001 
3002    LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3003    LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3004    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3005    LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3006 
3007    /*
3008     * select the correct result from the two polynomials
3009     * xmm3 = poly_mask;
3010     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3011     * y = _mm_andnot_ps(xmm3, y);
3012     * y = _mm_or_ps(y,y2);
3013     */
3014    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3015    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3016    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3017    LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3018    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3019    LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
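   /*
    * Per channel the three bit operations above act as a select; a scalar
    * sketch of the same idea:
    *
    *    result = poly_mask ? y2 : y;   // poly_mask is all-ones or all-zeros
    */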
3020 
3021    /*
3022     * update the sign
3023     * y = _mm_xor_ps(y, sign_bit);
3024     */
3025    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3026    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3027 
3028    LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3029 
3030    /* clamp output to be within [-1, 1] */
3031    y_result = lp_build_clamp(bld, y_result,
3032                              lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
3033                              lp_build_const_vec(bld->gallivm, bld->type,  1.f));
3034    /* If a is -inf, inf or NaN then return NaN */
3035    y_result = lp_build_select(bld, isfinite, y_result,
3036                               lp_build_const_vec(bld->gallivm, bld->type,  NAN));
3037    return y_result;
3038 }
3039 
3040 
3041 /**
3042  * Generate sin(a)
3043  */
3044 LLVMValueRef
3045 lp_build_sin(struct lp_build_context *bld,
3046              LLVMValueRef a)
3047 {
3048    const struct lp_type type = bld->type;
3049 
3050    if (type.width == 16) {
3051       LLVMBuilderRef builder = bld->gallivm->builder;
3052       LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3053       char intrinsic[32];
3054       lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sin", vec_type);
3055       LLVMValueRef args[] = { a };
3056       return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3057    }
3058 
3059    return lp_build_sin_or_cos(bld, a, false);
3060 }
3061 
3062 
3063 /**
3064  * Generate cos(a)
3065  */
3066 LLVMValueRef
3067 lp_build_cos(struct lp_build_context *bld,
3068              LLVMValueRef a)
3069 {
3070    const struct lp_type type = bld->type;
3071 
3072    if (type.width == 16) {
3073       LLVMBuilderRef builder = bld->gallivm->builder;
3074       LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3075       char intrinsic[32];
3076       lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.cos", vec_type);
3077       LLVMValueRef args[] = { a };
3078       return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3079    }
3080 
3081    return lp_build_sin_or_cos(bld, a, true);
3082 }
3083 
3084 
3085 /**
3086  * Generate pow(x, y)
3087  */
3088 LLVMValueRef
3089 lp_build_pow(struct lp_build_context *bld,
3090              LLVMValueRef x,
3091              LLVMValueRef y)
3092 {
3093    /* TODO: optimize the constant case */
3094    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3095        LLVMIsConstant(x) && LLVMIsConstant(y)) {
3096       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3097                    __func__);
3098    }
3099 
3100    LLVMValueRef cmp = lp_build_cmp_ordered(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
3101    LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2_safe(bld, x), y));
3102 
3103    res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
3104    return res;
3105 }
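
/*
 * Scalar sketch of the approach above (a reference, assuming 32-bit
 * floats): pow(x, y) is rewritten as exp2(y * log2(x)), with x == 0
 * special-cased so the result is 0 instead of the NaN that would come out
 * of log2(0):
 *
 *    #include <math.h>
 *
 *    static float pow_ref(float x, float y)
 *    {
 *       if (x == 0.0f)
 *          return 0.0f;
 *       return exp2f(y * log2f(x));
 *    }
 */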
3106 
3107 
3108 /**
3109  * Generate exp(x)
3110  */
3111 LLVMValueRef
3112 lp_build_exp(struct lp_build_context *bld,
3113              LLVMValueRef x)
3114 {
3115    /* log2(e) = 1/log(2) */
3116    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3117                                            1.4426950408889634);
3118 
3119    assert(lp_check_value(bld->type, x));
3120 
3121    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3122 }
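
/*
 * Equivalent scalar identity, for reference: 2**(x * log2(e)) == e**x, so
 * lp_build_exp only rescales the argument and reuses lp_build_exp2:
 *
 *    return exp2f(x * 1.4426950408889634f);   // x * log2(e)
 */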
3123 
3124 
3125 /**
3126  * Generate log(x)
3127  * Behavior is undefined with infs, 0s and nans
3128  */
3129 LLVMValueRef
3130 lp_build_log(struct lp_build_context *bld,
3131              LLVMValueRef x)
3132 {
3133    /* log(2) */
3134    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3135                                           0.69314718055994529);
3136 
3137    assert(lp_check_value(bld->type, x));
3138 
3139    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3140 }
3141 
3142 
3143 /**
3144  * Generate log(x) that handles edge cases (infs, 0s and nans)
3145  */
3146 LLVMValueRef
3147 lp_build_log_safe(struct lp_build_context *bld,
3148                   LLVMValueRef x)
3149 {
3150    /* log(2) */
3151    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3152                                           0.69314718055994529);
3153 
3154    assert(lp_check_value(bld->type, x));
3155 
3156    return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3157 }
3158 
3159 
3160 /**
3161  * Generate polynomial.
3162  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3163  */
3164 LLVMValueRef
3165 lp_build_polynomial(struct lp_build_context *bld,
3166                     LLVMValueRef x,
3167                     const double *coeffs,
3168                     unsigned num_coeffs)
3169 {
3170    const struct lp_type type = bld->type;
3171    LLVMValueRef even = NULL, odd = NULL;
3172    LLVMValueRef x2;
3173    unsigned i;
3174 
3175    assert(lp_check_value(bld->type, x));
3176 
3177    /* TODO: optimize the constant case */
3178    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3179        LLVMIsConstant(x)) {
3180       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3181                    __func__);
3182    }
3183 
3184    /*
3185     * Calculate odd and even terms separately to decrease data dependency
3186     * Ex:
3187     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
3188     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3189     */
3190    x2 = lp_build_mul(bld, x, x);
3191 
3192    for (i = num_coeffs; i--; ) {
3193       LLVMValueRef coeff;
3194 
3195       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3196 
3197       if (i % 2 == 0) {
3198          if (even)
3199             even = lp_build_mad(bld, x2, even, coeff);
3200          else
3201             even = coeff;
3202       } else {
3203          if (odd)
3204             odd = lp_build_mad(bld, x2, odd, coeff);
3205          else
3206             odd = coeff;
3207       }
3208    }
3209 
3210    if (odd)
3211       return lp_build_mad(bld, odd, x, even);
3212    else if (even)
3213       return even;
3214    else
3215       return bld->undef;
3216 }
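
/*
 * Scalar sketch (in double precision, not the generated IR) of the
 * even/odd split used above: both Horner chains depend only on x*x, so
 * they can be evaluated independently and merged with one final
 * multiply-add.
 *
 *    static double poly_ref(double x, const double *c, unsigned n)
 *    {
 *       double x2 = x * x;
 *       double even = 0.0, odd = 0.0;
 *       for (int i = (int)n - 1; i >= 0; i--) {
 *          if (i % 2 == 0)
 *             even = even * x2 + c[i];
 *          else
 *             odd = odd * x2 + c[i];
 *       }
 *       return odd * x + even;
 *    }
 */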
3217 
3218 
3219 /**
3220  * Minimax polynomial fit of 2**x, in range [0, 1[
3221  */
3222 static const double lp_build_exp2_polynomial[] = {
3223 #if EXP_POLY_DEGREE == 5
3224    1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3225    0.693153073200168932794,
3226    0.240153617044375388211,
3227    0.0558263180532956664775,
3228    0.00898934009049466391101,
3229    0.00187757667519147912699
3230 #elif EXP_POLY_DEGREE == 4
3231    1.00000259337069434683,
3232    0.693003834469974940458,
3233    0.24144275689150793076,
3234    0.0520114606103070150235,
3235    0.0135341679161270268764
3236 #elif EXP_POLY_DEGREE == 3
3237    0.999925218562710312959,
3238    0.695833540494823811697,
3239    0.226067155427249155588,
3240    0.0780245226406372992967
3241 #elif EXP_POLY_DEGREE == 2
3242    1.00172476321474503578,
3243    0.657636275736077639316,
3244    0.33718943461968720704
3245 #else
3246 #error
3247 #endif
3248 };
3249 
3250 
3251 LLVMValueRef
3252 lp_build_exp2(struct lp_build_context *bld,
3253               LLVMValueRef x)
3254 {
3255    LLVMBuilderRef builder = bld->gallivm->builder;
3256    const struct lp_type type = bld->type;
3257    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3258    LLVMValueRef ipart = NULL;
3259    LLVMValueRef fpart = NULL;
3260    LLVMValueRef expipart = NULL;
3261    LLVMValueRef expfpart = NULL;
3262    LLVMValueRef res = NULL;
3263 
3264    if (type.floating && type.width == 16) {
3265       char intrinsic[32];
3266       lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.exp2", vec_type);
3267       LLVMValueRef args[] = { x };
3268       return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3269    }
3270 
3271    assert(lp_check_value(bld->type, x));
3272 
3273    /* TODO: optimize the constant case */
3274    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3275        LLVMIsConstant(x)) {
3276       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3277                    __func__);
3278    }
3279 
3280    assert(type.floating && type.width == 32);
3281 
3282    /* We want to preserve NaN and make sure that for exp2, if x > 128
3283     * the result is INF, and if it's smaller than -126.9 the result is 0. */
3284    x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
3285                         GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3286    x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3287                         x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3288 
3289    /* ipart = floor(x) */
3290    /* fpart = x - ipart */
3291    lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3292 
3293    /* expipart = (float) (1 << ipart) */
3294    expipart = LLVMBuildAdd(builder, ipart,
3295                            lp_build_const_int_vec(bld->gallivm, type, 127), "");
3296    expipart = LLVMBuildShl(builder, expipart,
3297                            lp_build_const_int_vec(bld->gallivm, type, 23), "");
3298    expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3299 
3300    expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3301                                   ARRAY_SIZE(lp_build_exp2_polynomial));
3302 
3303    res = LLVMBuildFMul(builder, expipart, expfpart, "");
3304 
3305    return res;
3306 }
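
/*
 * Scalar sketch of the same computation (assuming IEEE-754 single
 * precision; poly() stands for the minimax fit above and is not a real
 * helper in this file, and NaN propagation is handled separately in the
 * IR):
 *
 *    #include <math.h>
 *    #include <stdint.h>
 *
 *    static float exp2_ref(float x)
 *    {
 *       x = fminf(x, 128.0f);
 *       x = fmaxf(x, -126.99999f);
 *
 *       float fl = floorf(x);
 *       int32_t ipart = (int32_t)fl;   // integer part of the exponent
 *       float fpart = x - fl;          // fractional part, in [0, 1)
 *
 *       union { int32_t i; float f; } u;
 *       u.i = (ipart + 127) << 23;     // 2**ipart built from the exponent bits
 *
 *       return u.f * poly(fpart);      // 2**ipart * 2**fpart
 *    }
 */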
3307 
3308 
3309 /**
3310  * Extract the exponent of an IEEE-754 floating point value.
3311  *
3312  * Optionally apply an integer bias.
3313  *
3314  * Result is an integer value with
3315  *
3316  *   ifloor(log2(x)) + bias
3317  */
3318 LLVMValueRef
3319 lp_build_extract_exponent(struct lp_build_context *bld,
3320                           LLVMValueRef x,
3321                           int bias)
3322 {
3323    LLVMBuilderRef builder = bld->gallivm->builder;
3324    const struct lp_type type = bld->type;
3325    unsigned mantissa = lp_mantissa(type);
3326    LLVMValueRef res;
3327 
3328    assert(type.floating);
3329 
3330    assert(lp_check_value(bld->type, x));
3331 
3332    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3333 
3334    res = LLVMBuildLShr(builder, x,
3335                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3336    res = LLVMBuildAnd(builder, res,
3337                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
3338    res = LLVMBuildSub(builder, res,
3339                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3340 
3341    return res;
3342 }
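
/*
 * Scalar reference (assuming 32-bit floats): the biased exponent sits in
 * bits 23..30, so shifting and removing the bias yields
 * ifloor(log2(x)) + bias:
 *
 *    #include <stdint.h>
 *    #include <string.h>
 *
 *    static int32_t extract_exponent_ref(float x, int bias)
 *    {
 *       int32_t i;
 *       memcpy(&i, &x, sizeof i);
 *       return ((i >> 23) & 255) - (127 - bias);
 *    }
 */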
3343 
3344 
3345 /**
3346  * Extract the mantissa of a floating point value.
3347  *
3348  * Result is a floating point value with
3349  *
3350  *   x / 2**floor(log2(x))
3351  */
3352 LLVMValueRef
3353 lp_build_extract_mantissa(struct lp_build_context *bld,
3354                           LLVMValueRef x)
3355 {
3356    LLVMBuilderRef builder = bld->gallivm->builder;
3357    const struct lp_type type = bld->type;
3358    unsigned mantissa = lp_mantissa(type);
3359    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3360                                                   (1ULL << mantissa) - 1);
3361    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3362    LLVMValueRef res;
3363 
3364    assert(lp_check_value(bld->type, x));
3365 
3366    assert(type.floating);
3367 
3368    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3369 
3370    /* res = x / 2**ipart */
3371    res = LLVMBuildAnd(builder, x, mantmask, "");
3372    res = LLVMBuildOr(builder, res, one, "");
3373    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3374 
3375    return res;
3376 }
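
/*
 * Scalar reference (assuming 32-bit floats): masking off the exponent bits
 * and OR-ing in the exponent of 1.0f leaves the significand, i.e.
 * x / 2**floor(log2(x)) in [1, 2):
 *
 *    #include <stdint.h>
 *    #include <string.h>
 *
 *    static float extract_mantissa_ref(float x)
 *    {
 *       int32_t i, one;
 *       float f = 1.0f, r;
 *       memcpy(&i, &x, sizeof i);
 *       memcpy(&one, &f, sizeof one);
 *       i = (i & 0x007fffff) | one;
 *       memcpy(&r, &i, sizeof r);
 *       return r;
 *    }
 */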
3377 
3378 
3379 
3380 /**
3381  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3382  * These coefficients can be generated with
3383  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3384  */
3385 static const double lp_build_log2_polynomial[] = {
3386 #if LOG_POLY_DEGREE == 5
3387    2.88539008148777786488L,
3388    0.961796878841293367824L,
3389    0.577058946784739859012L,
3390    0.412914355135828735411L,
3391    0.308591899232910175289L,
3392    0.352376952300281371868L,
3393 #elif LOG_POLY_DEGREE == 4
3394    2.88539009343309178325L,
3395    0.961791550404184197881L,
3396    0.577440339438736392009L,
3397    0.403343858251329912514L,
3398    0.406718052498846252698L,
3399 #elif LOG_POLY_DEGREE == 3
3400    2.88538959748872753838L,
3401    0.961932915889597772928L,
3402    0.571118517972136195241L,
3403    0.493997535084709500285L,
3404 #else
3405 #error
3406 #endif
3407 };
3408 
3409 
3410 /**
3411  * See http://www.devmaster.net/forums/showthread.php?p=43580
3412  * http://en.wikipedia.org/wiki/Logarithm#Calculation
3413  * http://www.nezumi.demon.co.uk/consult/logx.htm
3414  *
3415  * If handle_edge_cases is true the function will perform computations
3416  * to match the required D3D10+ behavior for each of the edge cases.
3417  * That means that if the input is:
3418  * - less than zero (down to and including -inf), then NaN will be returned
3419  * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3420  * - +infinity, then +infinity will be returned
3421  * - NaN, then NaN will be returned
3422  *
3423  * Those checks are fairly expensive so if you don't need them make sure
3424  * handle_edge_cases is false.
3425  */
3426 void
3427 lp_build_log2_approx(struct lp_build_context *bld,
3428                      LLVMValueRef x,
3429                      LLVMValueRef *p_exp,
3430                      LLVMValueRef *p_floor_log2,
3431                      LLVMValueRef *p_log2,
3432                      bool handle_edge_cases)
3433 {
3434    LLVMBuilderRef builder = bld->gallivm->builder;
3435    const struct lp_type type = bld->type;
3436    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3437    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3438 
3439    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3440    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3441    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3442 
3443    LLVMValueRef i = NULL;
3444    LLVMValueRef y = NULL;
3445    LLVMValueRef z = NULL;
3446    LLVMValueRef exp = NULL;
3447    LLVMValueRef mant = NULL;
3448    LLVMValueRef logexp = NULL;
3449    LLVMValueRef p_z = NULL;
3450    LLVMValueRef res = NULL;
3451 
3452    if (bld->type.width == 16) {
3453       char intrinsic[32];
3454       lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.log2", bld->vec_type);
3455       LLVMValueRef args[] = { x };
3456       if (p_log2)
3457          *p_log2 = lp_build_intrinsic(builder, intrinsic, bld->vec_type, args, 1, 0);
3458       return;
3459    }
3460 
3461    assert(lp_check_value(bld->type, x));
3462 
3463    if (p_exp || p_floor_log2 || p_log2) {
3464       /* TODO: optimize the constant case */
3465       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3466           LLVMIsConstant(x)) {
3467          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3468                       __func__);
3469       }
3470 
3471       assert(type.floating && type.width == 32);
3472 
3473       /*
3474        * We don't explicitly handle denormalized numbers. They will yield a
3475        * result in the neighbourhood of -127, which appears to be
3476        * adequate.
3477        */
3478 
3479       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3480 
3481       /* exp = (float) exponent(x) */
3482       exp = LLVMBuildAnd(builder, i, expmask, "");
3483    }
3484 
3485    if (p_floor_log2 || p_log2) {
3486       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3487       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3488       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3489    }
3490 
3491    if (p_log2) {
3492       /* mant = 1 + (float) mantissa(x) */
3493       mant = LLVMBuildAnd(builder, i, mantmask, "");
3494       mant = LLVMBuildOr(builder, mant, one, "");
3495       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3496 
3497       /* y = (mant - 1) / (mant + 1) */
3498       y = lp_build_div(bld,
3499          lp_build_sub(bld, mant, bld->one),
3500          lp_build_add(bld, mant, bld->one));
3501 
3502       /* z = y^2 */
3503       z = lp_build_mul(bld, y, y);
3504 
3505       /* compute P(z) */
3506       p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3507                                 ARRAY_SIZE(lp_build_log2_polynomial));
3508 
3509       /* y * P(z) + logexp */
3510       res = lp_build_mad(bld, y, p_z, logexp);
3511 
3512       if (type.floating && handle_edge_cases) {
3513          LLVMValueRef negmask, infmask,  zmask;
3514          negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3515                                 lp_build_const_vec(bld->gallivm, type,  0.0f));
3516          zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3517                               lp_build_const_vec(bld->gallivm, type,  0.0f));
3518          infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3519                                 lp_build_const_vec(bld->gallivm, type,  INFINITY));
3520 
3521          /* If x is equal to inf make sure we return inf */
3522          res = lp_build_select(bld, infmask,
3523                                lp_build_const_vec(bld->gallivm, type,  INFINITY),
3524                                res);
3525          /* If x is equal to 0, return -inf */
3526          res = lp_build_select(bld, zmask,
3527                                lp_build_const_vec(bld->gallivm, type,  -INFINITY),
3528                                res);
3529          /* If x is nan or less than 0, return nan */
3530          res = lp_build_select(bld, negmask,
3531                                lp_build_const_vec(bld->gallivm, type,  NAN),
3532                                res);
3533       }
3534    }
3535 
3536    if (p_exp) {
3537       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3538       *p_exp = exp;
3539    }
3540 
3541    if (p_floor_log2)
3542       *p_floor_log2 = logexp;
3543 
3544    if (p_log2)
3545       *p_log2 = res;
3546 }
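
/*
 * Scalar sketch of the core path above (edge-case selects omitted,
 * assuming 32-bit floats and x > 0; P() stands for the minimax polynomial
 * table and is not a real helper in this file):
 *
 *    #include <stdint.h>
 *    #include <string.h>
 *
 *    static float log2_ref(float x)
 *    {
 *       int32_t i;
 *       memcpy(&i, &x, sizeof i);
 *
 *       float exponent = (float)(((i >> 23) & 255) - 127);  // floor(log2(x))
 *
 *       int32_t mi = (i & 0x007fffff) | 0x3f800000;         // mantissa in [1, 2)
 *       float m;
 *       memcpy(&m, &mi, sizeof m);
 *
 *       float y = (m - 1.0f) / (m + 1.0f);
 *       float z = y * y;
 *       return y * P(z) + exponent;   // log2(mantissa) + floor(log2(x))
 *    }
 */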
3547 
3548 
3549 /*
3550  * log2 implementation which doesn't have special code to
3551  * handle edge cases (-inf, 0, inf, NaN). It's faster but
3552  * the results for those cases are undefined.
3553  */
3554 LLVMValueRef
3555 lp_build_log2(struct lp_build_context *bld,
3556               LLVMValueRef x)
3557 {
3558    LLVMValueRef res;
3559    lp_build_log2_approx(bld, x, NULL, NULL, &res, false);
3560    return res;
3561 }
3562 
3563 
3564 /*
3565  * Version of log2 which handles all edge cases.
3566  * Look at documentation of lp_build_log2_approx for
3567  * description of the behavior for each of the edge cases.
3568  */
3569 LLVMValueRef
3570 lp_build_log2_safe(struct lp_build_context *bld,
3571                    LLVMValueRef x)
3572 {
3573    LLVMValueRef res;
3574    lp_build_log2_approx(bld, x, NULL, NULL, &res, true);
3575    return res;
3576 }
3577 
3578 
3579 /**
3580  * Faster (and less accurate) log2.
3581  *
3582  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3583  *
3584  * Piece-wise linear approximation, with exact results when x is a
3585  * power of two.
3586  *
3587  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3588  */
3589 LLVMValueRef
3590 lp_build_fast_log2(struct lp_build_context *bld,
3591                    LLVMValueRef x)
3592 {
3593    LLVMBuilderRef builder = bld->gallivm->builder;
3594    LLVMValueRef ipart;
3595    LLVMValueRef fpart;
3596 
3597    assert(lp_check_value(bld->type, x));
3598 
3599    assert(bld->type.floating);
3600 
3601    /* ipart = floor(log2(x)) - 1 */
3602    ipart = lp_build_extract_exponent(bld, x, -1);
3603    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3604 
3605    /* fpart = x / 2**ipart */
3606    fpart = lp_build_extract_mantissa(bld, x);
3607 
3608    /* ipart + fpart */
3609    return LLVMBuildFAdd(builder, ipart, fpart, "");
3610 }
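
/*
 * Scalar reference of the piece-wise linear approximation (assuming 32-bit
 * floats and x > 0, reusing the sketches given earlier for the exponent
 * and mantissa helpers):
 *
 *    float ipart = (float)extract_exponent_ref(x, -1);  // floor(log2(x)) - 1
 *    float fpart = extract_mantissa_ref(x);             // x / 2**floor(log2(x)), in [1, 2)
 *    return ipart + fpart;   // exact whenever x is a power of two
 */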
3611 
3612 
3613 /**
3614  * Fast implementation of iround(log2(x)).
3615  *
3616  * Not an approximation -- it should give accurate results all the time.
3617  */
3618 LLVMValueRef
3619 lp_build_ilog2(struct lp_build_context *bld,
3620                LLVMValueRef x)
3621 {
3622    LLVMBuilderRef builder = bld->gallivm->builder;
3623    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3624    LLVMValueRef ipart;
3625 
3626    assert(bld->type.floating);
3627 
3628    assert(lp_check_value(bld->type, x));
3629 
3630    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
3631    x = LLVMBuildFMul(builder, x, sqrt2, "");
3632 
3633    /* ipart = floor(log2(x) + 0.5)  */
3634    ipart = lp_build_extract_exponent(bld, x, 0);
3635 
3636    return ipart;
3637 }
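
/*
 * Scalar reference (assuming x > 0, reusing the exponent sketch above):
 * scaling by sqrt(2) before extracting the exponent turns floor() into
 * round-to-nearest on log2(x):
 *
 *    return extract_exponent_ref(x * (float)M_SQRT2, 0);
 */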
3638 
3639 LLVMValueRef
3640 lp_build_mod(struct lp_build_context *bld,
3641              LLVMValueRef x,
3642              LLVMValueRef y)
3643 {
3644    LLVMBuilderRef builder = bld->gallivm->builder;
3645    LLVMValueRef res;
3646    const struct lp_type type = bld->type;
3647 
3648    assert(lp_check_value(type, x));
3649    assert(lp_check_value(type, y));
3650 
3651    if (type.floating)
3652       res = LLVMBuildFRem(builder, x, y, "");
3653    else if (type.sign)
3654       res = LLVMBuildSRem(builder, x, y, "");
3655    else
3656       res = LLVMBuildURem(builder, x, y, "");
3657    return res;
3658 }
3659 
3660 
3661 /*
3662  * For floating inputs it creates and returns a mask
3663  * which is all 1's for channels which are NaN.
3664  * Channels inside x which are not NaN will be 0.
3665  */
3666 LLVMValueRef
3667 lp_build_isnan(struct lp_build_context *bld,
3668                LLVMValueRef x)
3669 {
3670    LLVMValueRef mask;
3671    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3672 
3673    assert(bld->type.floating);
3674    assert(lp_check_value(bld->type, x));
3675 
3676    mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3677                         "isnotnan");
3678    mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3679    mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3680    return mask;
3681 }
3682 
3683 
3684 /* Returns all 1's for floating point numbers that are
3685  * finite numbers and returns all zeros for -inf,
3686  * inf and NaNs. */
3687 LLVMValueRef
3688 lp_build_isfinite(struct lp_build_context *bld,
3689                   LLVMValueRef x)
3690 {
3691    LLVMBuilderRef builder = bld->gallivm->builder;
3692    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3693    struct lp_type int_type = lp_int_type(bld->type);
3694    LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3695    LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3696                                                     0x7f800000);
3697 
3698    if (!bld->type.floating) {
3699       return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3700    }
3701    assert(bld->type.floating);
3702    assert(lp_check_value(bld->type, x));
3703    assert(bld->type.width == 32);
3704 
3705    intx = LLVMBuildAnd(builder, intx, infornan32, "");
3706    return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3707                            intx, infornan32);
3708 }
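
/*
 * Scalar reference (assuming 32-bit floats): a value is finite exactly
 * when its exponent field is not all ones:
 *
 *    #include <stdint.h>
 *    #include <string.h>
 *    #include <stdbool.h>
 *
 *    static bool isfinite_ref(float x)
 *    {
 *       int32_t i;
 *       memcpy(&i, &x, sizeof i);
 *       return (i & 0x7f800000) != 0x7f800000;
 *    }
 */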
3709 
3710 
3711 /*
3712  * Returns true if the number is nan or inf and false otherwise.
3713  * The input has to be a floating point vector.
3714  */
3715 LLVMValueRef
3716 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3717                        const struct lp_type type,
3718                        LLVMValueRef x)
3719 {
3720    LLVMBuilderRef builder = gallivm->builder;
3721    struct lp_type int_type = lp_int_type(type);
3722    LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3723                                                 0x7f800000);
3724    LLVMValueRef ret;
3725 
3726    assert(type.floating);
3727 
3728    ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3729    ret = LLVMBuildAnd(builder, ret, const0, "");
3730    ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3731                           ret, const0);
3732 
3733    return ret;
3734 }
3735 
3736 
3737 LLVMValueRef
3738 lp_build_fpstate_get(struct gallivm_state *gallivm)
3739 {
3740    if (util_get_cpu_caps()->has_sse) {
3741       LLVMBuilderRef builder = gallivm->builder;
3742       LLVMValueRef mxcsr_ptr = lp_build_alloca(
3743          gallivm,
3744          LLVMInt32TypeInContext(gallivm->context),
3745          "mxcsr_ptr");
3746       LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3747           LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3748       lp_build_intrinsic(builder,
3749                          "llvm.x86.sse.stmxcsr",
3750                          LLVMVoidTypeInContext(gallivm->context),
3751                          &mxcsr_ptr8, 1, 0);
3752       return mxcsr_ptr;
3753    }
3754    return 0;
3755 }
3756 
3757 void
3758 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3759                                   bool zero)
3760 {
3761    if (util_get_cpu_caps()->has_sse) {
3762       /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3763       int daz_ftz = _MM_FLUSH_ZERO_MASK;
3764 
3765       LLVMBuilderRef builder = gallivm->builder;
3766       LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3767       LLVMValueRef mxcsr =
3768          LLVMBuildLoad2(builder, LLVMInt32TypeInContext(gallivm->context), mxcsr_ptr, "mxcsr");
3769 
3770       if (util_get_cpu_caps()->has_daz) {
3771          /* Enable denormals are zero mode */
3772          daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3773       }
3774       if (zero) {
3775          mxcsr = LLVMBuildOr(builder, mxcsr,
3776                              LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3777       } else {
3778          mxcsr = LLVMBuildAnd(builder, mxcsr,
3779                               LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3780       }
3781 
3782       LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3783       lp_build_fpstate_set(gallivm, mxcsr_ptr);
3784    }
3785 }
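
/*
 * For reference, the equivalent host-side (non-JIT) manipulation of the
 * same MXCSR bits, a sketch assuming an x86 build with SSE intrinsics and
 * the masks defined near the top of this file:
 *
 *    #include <stdbool.h>
 *    #include <xmmintrin.h>
 *
 *    static void set_denorms_zero_host(bool zero)
 *    {
 *       unsigned mxcsr = _mm_getcsr();
 *       unsigned daz_ftz = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
 *       if (zero)
 *          mxcsr |= daz_ftz;
 *       else
 *          mxcsr &= ~daz_ftz;
 *       _mm_setcsr(mxcsr);
 *    }
 */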
3786 
3787 
3788 void
3789 lp_build_fpstate_set(struct gallivm_state *gallivm,
3790                      LLVMValueRef mxcsr_ptr)
3791 {
3792    if (util_get_cpu_caps()->has_sse) {
3793       LLVMBuilderRef builder = gallivm->builder;
3794       mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3795                      LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3796       lp_build_intrinsic(builder,
3797                          "llvm.x86.sse.ldmxcsr",
3798                          LLVMVoidTypeInContext(gallivm->context),
3799                          &mxcsr_ptr, 1, 0);
3800    }
3801 }
3802