• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Single-precision vector erfc(x) function.
3  *
4  * Copyright (c) 2021-2023, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 
8 #include "v_math.h"
9 #include "erfcf.h"
10 #include "estrin.h"
11 #include "pl_sig.h"
12 #include "pl_test.h"
13 
14 #if V_SUPPORTED
15 
16 #define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)]
17 
18 VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);
19 
20 static VPCS_ATTR NOINLINE v_f32_t
specialcase(v_f32_t x,v_f32_t y,v_u32_t special)21 specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
22 {
23   return v_call_f32 (erfcf, x, y, special);
24 }
25 
26 static inline uint32_t
interval_index(uint32_t ia12)27 interval_index (uint32_t ia12)
28 {
29   // clang-format off
30   return (ia12 < 0x400 ? 0 :
31          (ia12 < 0x408 ? 1 :
32          (ia12 < 0x410 ? 2 :
33                          3)));
34   // clang-format on
35 }
36 
37 /* The C macro wraps the coeffs argument in order to make the
38    poynomial evaluation more readable. In the scalarised variant the
39    second pointer is ignored.  */
40 #ifdef SCALAR
41 #define C(i) coeff1[i]
42 #else
43 #define C(i) ((v_f64_t){coeff1[i], coeff2[i]})
44 #endif
45 
46 static inline v_f64_t
v_approx_erfcf_poly_gauss(v_f64_t x,const double * coeff1,const double * coeff2)47 v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
48 			   const double *coeff2)
49 {
50   v_f64_t x2 = x * x;
51   v_f64_t x4 = x2 * x2;
52   v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C);
53   v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0));
54   return poly * gauss;
55 }
56 
57 static inline float
approx_poly_gauss(float abs_x,const double * coeff)58 approx_poly_gauss (float abs_x, const double *coeff)
59 {
60   return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x));
61 }
62 
63 static v_f32_t
v_approx_erfcf(v_f32_t abs_x,v_u32_t sign,v_u32_t ia12,v_u32_t lanes)64 v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes)
65 {
66 #ifdef SCALAR
67   float y = approx_poly_gauss (abs_x, P (ia12));
68   return sign ? 2 - y : y;
69 #else
70   float32x2_t lo32 = {0, 0};
71   float32x2_t hi32 = {0, 0};
72   /* The polynomial and Gaussian components must be calculated in
73      double precision in order to meet the required ULP error. This
74      means we have to promote low and high halves of the
75      single-precision input vector to two separate double-precision
76      input vectors. This incurs some overhead, and there is also
77      overhead to loading the polynomial coefficients as this cannot be
78      done in a vector fashion. This would be wasted effort for
79      elements which lie in the 'boring' zone, as they will be
80      overwritten later. Hence we use the lanes parameter to only do
81      the promotion on a pair of lanes if both of those lanes are
82      interesting and not special cases. If one lane is inactive, we
83      use a scalar routine which is shared with the scalar variant.  */
84   if (lanes[0] & lanes[1])
85     {
86       lo32 = vcvt_f32_f64 (
87 	v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)),
88 				   P (ia12[0]), P (ia12[1])));
89     }
90   else if (lanes[0])
91     {
92       lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0]));
93     }
94   else if (lanes[1])
95     {
96       lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1]));
97     }
98 
99   if (lanes[2] & lanes[3])
100     {
101       hi32
102 	= vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x),
103 						   P (ia12[2]), P (ia12[3])));
104     }
105   else if (lanes[2])
106     {
107       hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2]));
108     }
109   else if (lanes[3])
110     {
111       hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3]));
112     }
113 
114   v_f32_t y = vcombine_f32 (lo32, hi32);
115 
116   if (v_any_u32 (sign))
117     {
118       y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y);
119     }
120 
121   return y;
122 #endif
123 }
124 
125 /* Optimized single-precision vector complementary error function
126    erfcf. Max measured error: 0.750092 at various values between
127    -0x1.06521p-20 and -0x1.add1dap-17. For example:
128    __v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0
129    +0.249908 ulp err 0.250092.  */
130 VPCS_ATTR
V_NAME(erfcf)131 v_f32_t V_NAME (erfcf) (v_f32_t x)
132 {
133   v_u32_t ix = v_as_u32_f32 (x);
134   v_u32_t ia = ix & 0x7fffffff;
135   v_u32_t ia12 = ia >> 20;
136   v_u32_t sign = ix >> 31;
137   v_u32_t inf_ia12 = v_u32 (0x7f8);
138 
139   v_u32_t special_cases
140     = v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328));
141   v_u32_t in_bounds
142     = v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3)));
143   v_f32_t boring_zone = v_as_f32_u32 (sign << 30);
144 
145 #ifdef SCALAR
146   if (unlikely (special_cases))
147     {
148       if (ia12 >= 0x7f8)
149 	return (float) (sign << 1) + 1.0f / x; /* Special cases.  */
150       else
151 	return 1.0f - x; /* Small case.  */
152     }
153   else if (likely (!in_bounds))
154     {
155       return sign ? boring_zone : __math_uflowf (boring_zone);
156     }
157 #endif
158 
159   v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12,
160 			      in_bounds & ~special_cases);
161 
162 #ifndef SCALAR
163   y = vbslq_f32 (~in_bounds, boring_zone, y);
164 
165   if (unlikely (v_any_u32 (special_cases)))
166     {
167       return specialcase (x, y, special_cases);
168     }
169 #endif
170 
171   return y;
172 }
173 VPCS_ALIAS
174 
175 PL_SIG (V, F, 1, erfc, -6.0, 28.0)
176 PL_TEST_ULP (V_NAME (erfcf), 0.26)
177 PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000)
178 PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000)
179 PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000)
180 PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000)
181 PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000)
182 PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000)
183 #endif
184