/*
 * Double-precision SVE erfc(x) function.
 *
 * Copyright (c) 2021-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#include "sv_math.h"
#include "pl_sig.h"
#include "pl_test.h"

#if SV_SUPPORTED
#include "sv_exp_tail.h"

sv_f64_t __sv_exp_x (sv_f64_t, svbool_t);

static NOINLINE sv_f64_t
specialcase (sv_f64_t x, sv_f64_t y, svbool_t special)
{
  return sv_call_f64 (erfc, x, y, special);
}

static inline sv_u64_t
lookup_interval_idx (const svbool_t pg, sv_f64_t abs_x)
{
  /* Interval index is calculated by (((abs(x) + 1)^4) >> 52) - 1023, bounded
     by the number of polynomials.  */
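  /* For a positive double v, the biased exponent (bits 52..62) is
     1023 + floor(log2(v)), so the shift-and-subtract below leaves
     idx = floor(4 * log2(|x| + 1)), up to rounding in the two squarings.
     For example, |x| = 1 gives (1 + 1)^4 = 16 = 2^4, hence idx = 4.  */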
  sv_f64_t xp1 = svadd_n_f64_x (pg, abs_x, 1);
  xp1 = svmul_f64_x (pg, xp1, xp1);
  xp1 = svmul_f64_x (pg, xp1, xp1);
  sv_u64_t interval_idx
    = svsub_n_u64_x (pg, svlsr_n_u64_x (pg, sv_as_u64_f64 (xp1), 52), 1023);
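  /* Lanes whose index exceeds ERFC_NUM_INTERVALS are clamped to it; per the
     comments in the main routine below, that entry of the polynomial table
     is all zeros, so those lanes evaluate to 0.  */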
  return svsel_u64 (svcmple_n_u64 (pg, interval_idx, ERFC_NUM_INTERVALS),
		    interval_idx, sv_u64 (ERFC_NUM_INTERVALS));
}

static inline sv_f64_t
sv_eval_poly (const svbool_t pg, sv_f64_t z, sv_u64_t idx)
{
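  /* offset selects row idx of the coefficient table (each row holds
     ERFC_POLY_ORDER + 1 doubles). base starts at the highest-order
     coefficient of row 0 - the [0][12] initializer implies
     ERFC_POLY_ORDER == 12 - and the loop gathers each lower coefficient in
     turn, accumulating Horner's scheme:
       r = c_0 + z * (c_1 + z * (... + z * c_12)).  */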
  sv_u64_t offset = svmul_n_u64_x (pg, idx, ERFC_POLY_ORDER + 1);
  const double *base = &__v_erfc_data.poly[0][12];
  sv_f64_t r = sv_lookup_f64_x (pg, base, offset);
  for (int i = 0; i < ERFC_POLY_ORDER; i++)
    {
      base--;
      sv_f64_t c = sv_lookup_f64_x (pg, base, offset);
      r = sv_fma_f64_x (pg, z, r, c);
    }
  return r;
}

static inline sv_f64_t
sv_eval_gauss (const svbool_t pg, sv_f64_t abs_x)
{
  /* Accurate evaluation of exp(-x^2). This operation is sensitive to rounding
     errors in x^2, so we compute an estimate for the error and use a custom exp
     helper which corrects for the calculated error estimate.  */
  sv_f64_t a2 = svmul_f64_x (pg, abs_x, abs_x);

  /* Split abs_x into (a_hi + a_lo), where a_hi is the 'large' component and
     a_lo is the 'small' component.  */
  const sv_f64_t scale = sv_f64 (0x1.0000002p27);
  sv_f64_t a_hi = svneg_f64_x (pg, sv_fma_f64_x (pg, scale, abs_x,
						 svneg_f64_x (pg, abs_x)));
  a_hi = sv_fma_f64_x (pg, scale, abs_x, a_hi);
  sv_f64_t a_lo = svsub_f64_x (pg, abs_x, a_hi);
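  /* scale is 2^27 + 1, the usual Veltkamp splitting constant for doubles.
     The split guarantees a_hi + a_lo == abs_x exactly and is intended to
     make the partial products below exact, so that e2 captures the rounding
     error committed in a2.  */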

  sv_f64_t a_hi_neg = svneg_f64_x (pg, a_hi);
  sv_f64_t a_lo_neg = svneg_f64_x (pg, a_lo);

  /* We can then estimate the error in abs_x^2 by computing (abs_x * abs_x) -
     (a_hi + a_lo) * (a_hi + a_lo).  */
  sv_f64_t e2 = sv_fma_f64_x (pg, a_hi_neg, a_hi, a2);
  e2 = sv_fma_f64_x (pg, a_hi_neg, a_lo, e2);
  e2 = sv_fma_f64_x (pg, a_lo_neg, a_hi, e2);
  e2 = sv_fma_f64_x (pg, a_lo_neg, a_lo, e2);
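  /* e2 now approximates a2 - x^2, i.e. the rounding error in a2, so the
     exact -x^2 is close to -a2 + e2; sv_exp_tail is presumably expected to
     fold the e2 correction into its evaluation of exp(-a2).  */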

  return sv_exp_tail (pg, svneg_f64_x (pg, a2), e2);
}

/* Optimized double precision vector complementary error function erfc.
   Maximum measured error is 3.64 ULP:
   __sv_erfc(0x1.4792573ee6cc7p+2) got 0x1.ff3f4c8e200d5p-42
				  want 0x1.ff3f4c8e200d9p-42.  */
sv_f64_t
__sv_erfc_x (sv_f64_t x, const svbool_t pg)
{
  sv_u64_t ix = sv_as_u64_f64 (x);
  sv_f64_t abs_x = svabs_f64_x (pg, x);
  sv_u64_t atop = svlsr_n_u64_x (pg, sv_as_u64_f64 (abs_x), 52);
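  /* atop holds the biased exponent of |x|: with the sign bit cleared, the
     shift by 52 leaves only the 11 exponent bits, so the range checks below
     reduce to cheap integer compares.  */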

  /* Outside of the 'interesting' bounds, [-6, 28], +ve goes to 0, -ve goes
     to 2. As long as the polynomial is 0 in the boring zone, we can assemble
     the result correctly. This is dealt with in two ways:

     The 'coarse approach' is that the approximation algorithm is
     zero-predicated on in_bounds = |x| < 32, which saves the need to do
     coefficient lookup etc for |x| >= 32.

     The coarse approach misses [-32, -6] and [28, 32], which are dealt with in
     the polynomial and index calculation, such that the polynomial evaluates to
     0 in these regions.  */
  /* in_bounds is true for lanes where |x| < 32.  */
  svbool_t in_bounds = svcmplt_n_u64 (pg, atop, 0x404);
  /* boring_zone = 2 for x < 0, 0 otherwise.  */
  sv_f64_t boring_zone
    = sv_as_f64_u64 (svlsl_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 63), 62));
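  /* The shifts move the sign bit of x into the exponent field: negative x
     yields the bit pattern 0x4000000000000000, which is the double 2.0.  */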
  /* Very small, nan and inf.  */
  svbool_t special_cases
    = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3cd), 0x432);
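  /* 0x3cd is the biased exponent of 2^-50. The unsigned subtract-and-compare
     catches both atop < 0x3cd (|x| < 2^-50, via wrap-around) and
     atop >= 0x3cd + 0x432 = 0x7ff (the inf/nan exponent) in one test.  */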

  /* erfc(|x|) ~= P_i(|x|-x_i)*exp(-x^2)

     Where P_i is a polynomial and x_i is an offset, both defined in
     v_erfc_data.c. i is chosen based on which interval x falls in.  */
  sv_u64_t i = lookup_interval_idx (in_bounds, abs_x);
  sv_f64_t x_i = sv_lookup_f64_x (in_bounds, __v_erfc_data.interval_bounds, i);
  sv_f64_t p = sv_eval_poly (in_bounds, svsub_f64_x (pg, abs_x, x_i), i);
  /* 'copy' sign of x to p, i.e. negate p if x is negative.  */
  sv_u64_t sign = svbic_n_u64_z (in_bounds, ix, 0x7fffffffffffffff);
  p = sv_as_f64_u64 (sveor_u64_z (in_bounds, sv_as_u64_f64 (p), sign));

  sv_f64_t e = sv_eval_gauss (in_bounds, abs_x);

  /* Assemble result: 2-p*e if x<0, p*e otherwise. No need to conditionally
     select boring_zone because P[V_ERFC_NINTS-1]=0.  */
  sv_f64_t y = sv_fma_f64_x (pg, p, e, boring_zone);

  if (unlikely (svptest_any (pg, special_cases)))
    {
      return specialcase (x, y, special_cases);
    }
  return y;
}

PL_ALIAS (__sv_erfc_x, _ZGVsMxv_erfc)

PL_SIG (SV, D, 1, erfc, -4.0, 10.0)
PL_TEST_ULP (__sv_erfc, 3.15)
PL_TEST_INTERVAL (__sv_erfc, 0, 0xffff0000, 10000)
PL_TEST_INTERVAL (__sv_erfc, 0x1p-127, 0x1p-26, 40000)
PL_TEST_INTERVAL (__sv_erfc, -0x1p-127, -0x1p-26, 40000)
PL_TEST_INTERVAL (__sv_erfc, 0x1p-26, 0x1p5, 40000)
PL_TEST_INTERVAL (__sv_erfc, -0x1p-26, -0x1p3, 40000)
PL_TEST_INTERVAL (__sv_erfc, 0, inf, 40000)
#endif