• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Double-precision vector erfc(x) function.
3  *
4  * Copyright (c) 2019-2023, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 
8 #include "v_math.h"
9 #include "horner.h"
10 #include "math_config.h"
11 #include "pl_sig.h"
12 #include "pl_test.h"
13 
14 #if V_SUPPORTED
15 
16 /* Accurate exponential (vector variant of exp_dd).  */
17 v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);
18 
19 #define One v_f64 (1.0)
20 #define AbsMask v_u64 (0x7fffffffffffffff)
21 #define Scale v_f64 (0x1.0000002p27)
22 
23 /* Coeffs for polynomial approximation on [0x1.0p-28., 31.].  */
24 #define PX __v_erfc_data.poly
25 #define xint __v_erfc_data.interval_bounds
26 
27 /* Special cases (fall back to scalar calls).  */
28 VPCS_ATTR
29 NOINLINE static v_f64_t
specialcase(v_f64_t x,v_f64_t y,v_u64_t cmp)30 specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
31 {
32   return v_call_f64 (erfc, x, y, cmp);
33 }
34 
35 /* A structure to perform look-up in coeffs and other parameter
36    tables.  */
37 struct entry
38 {
39   v_f64_t P[ERFC_POLY_ORDER + 1];
40   v_f64_t xi;
41 };
42 
43 static inline struct entry
lookup(v_u64_t i)44 lookup (v_u64_t i)
45 {
46   struct entry e;
47 #ifdef SCALAR
48   for (int j = 0; j <= ERFC_POLY_ORDER; ++j)
49     e.P[j] = PX[i][j];
50   e.xi = xint[i];
51 #else
52   for (int j = 0; j <= ERFC_POLY_ORDER; ++j)
53     {
54       e.P[j][0] = PX[i[0]][j];
55       e.P[j][1] = PX[i[1]][j];
56     }
57   e.xi[0] = xint[i[0]];
58   e.xi[1] = xint[i[1]];
59 #endif
60   return e;
61 }
62 
63 /* Accurate evaluation of exp(x^2) using compensated product
64    (x^2 ~ x*x + e2) and custom exp(y+d) routine for small
65    corrections d<<y.  */
66 static inline v_f64_t
v_eval_gauss(v_f64_t a)67 v_eval_gauss (v_f64_t a)
68 {
69   v_f64_t e2;
70   v_f64_t a2 = a * a;
71 
72   /* TwoProduct (Dekker) applied to a * a.  */
73   v_f64_t a_hi = -v_fma_f64 (Scale, a, -a);
74   a_hi = v_fma_f64 (Scale, a, a_hi);
75   v_f64_t a_lo = a - a_hi;
76 
77   /* Now assemble error term.  */
78   e2 = v_fma_f64 (-a_hi, a_hi, a2);
79   e2 = v_fma_f64 (-a_hi, a_lo, e2);
80   e2 = v_fma_f64 (-a_lo, a_hi, e2);
81   e2 = v_fma_f64 (-a_lo, a_lo, e2);
82 
83   /* Fast and accurate evaluation of exp(-a2 + e2) where e2 << a2.  */
84   return V_NAME (exp_tail) (-a2, e2);
85 }
86 
87 /* Optimized double precision vector complementary error function erfc.
88    Maximum measured error is 3.64 ULP:
89    __v_erfc(0x1.4792573ee6cc7p+2) got 0x1.ff3f4c8e200d5p-42
90 				 want 0x1.ff3f4c8e200d9p-42.  */
91 VPCS_ATTR
V_NAME(erfc)92 v_f64_t V_NAME (erfc) (v_f64_t x)
93 {
94   v_f64_t z, p, y;
95   v_u64_t ix, atop, sign, i, cmp;
96 
97   ix = v_as_u64_f64 (x);
98   /* Compute fac as early as possible in order to get best performance.  */
99   v_f64_t fac = v_as_f64_u64 ((ix >> 63) << 62);
100   /* Use 12-bit for small, nan and inf case detection.  */
101   atop = (ix >> 52) & 0x7ff;
102   cmp = v_cond_u64 (atop - v_u64 (0x3cd) >= v_u64 (0x7ff - 0x3cd));
103 
104   struct entry dat;
105 
106   /* All entries of the vector are out of bounds, take a short path.
107      Use smallest possible number above 28 representable in 12 bits.  */
108   v_u64_t out_of_bounds = v_cond_u64 (atop >= v_u64 (0x404));
109 
110   /* Use sign to produce either 0 if x > 0, 2 otherwise.  */
111   if (v_all_u64 (out_of_bounds) && likely (v_any_u64 (~cmp)))
112     return fac;
113 
114   /* erfc(|x|) = P(|x|-x_i)*exp(-x^2).  */
115 
116   v_f64_t a = v_abs_f64 (x);
117 
118   /* Interval bounds are a logarithmic scale, i.e. interval n has
119      lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain
120      the interval index.  */
121   v_f64_t xp1 = a + v_f64 (1.0);
122   xp1 = xp1 * xp1;
123   xp1 = xp1 * xp1;
124   v_u64_t ixp1 = v_as_u64_f64 (xp1);
125   i = (ixp1 >> 52) - v_u64 (1023);
126 
127   /* Index cannot exceed number of polynomials.  */
128 #ifdef SCALAR
129   i = i <= (ERFC_NUM_INTERVALS) ? i : ERFC_NUM_INTERVALS;
130 #else
131   i = (v_u64_t){i[0] <= ERFC_NUM_INTERVALS ? i[0] : ERFC_NUM_INTERVALS,
132 		i[1] <= ERFC_NUM_INTERVALS ? i[1] : ERFC_NUM_INTERVALS};
133 #endif
134   /* Get coeffs of i-th polynomial.  */
135   dat = lookup (i);
136 
137   /* Evaluate Polynomial: P(|x|-x_i).  */
138   z = a - dat.xi;
139 #define C(i) dat.P[i]
140   p = HORNER_12 (z, C);
141 
142   /* Evaluate Gaussian: exp(-x^2).  */
143   v_f64_t e = v_eval_gauss (a);
144 
145   /* Copy sign.  */
146   sign = v_as_u64_f64 (x) & ~AbsMask;
147   p = v_as_f64_u64 (v_as_u64_f64 (p) ^ sign);
148 
149   /* Assemble result as 2.0 - p * e if x < 0, p * e otherwise.  */
150   y = v_fma_f64 (p, e, fac);
151 
152   /* No need to fix value of y if x is out of bound, as
153      P[ERFC_NUM_INTERVALS]=0.  */
154   if (unlikely (v_any_u64 (cmp)))
155     return specialcase (x, y, cmp);
156   return y;
157 }
158 VPCS_ALIAS
159 
160 PL_SIG (V, D, 1, erfc, -6.0, 28.0)
161 PL_TEST_ULP (V_NAME (erfc), 3.15)
162 PL_TEST_INTERVAL (V_NAME (erfc), 0, 0xffff0000, 10000)
163 PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-1022, 0x1p-26, 40000)
164 PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-1022, -0x1p-26, 40000)
165 PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-26, 0x1p5, 40000)
166 PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-26, -0x1p3, 40000)
167 PL_TEST_INTERVAL (V_NAME (erfc), 0, inf, 40000)
168 #endif
169