/*
 * Single-precision vector erfc(x) function.
 *
 * Copyright (c) 2021-2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#include "v_math.h"
#include "erfcf.h"
#include "estrin.h"
#include "pl_sig.h"
#include "pl_test.h"

#if V_SUPPORTED

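/* Select the coefficient table for the interval containing |x|, using the top
   bits of its representation (see interval_index below).  */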
#define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)]

VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);

static VPCS_ATTR NOINLINE v_f32_t
specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
{
  return v_call_f32 (erfcf, x, y, special);
}

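/* ia12 holds the top 12 bits of |x| (zero sign bit, 8 exponent bits and 3 high
   mantissa bits), so the thresholds 0x400, 0x408 and 0x410 correspond to
   |x| = 2.0, 4.0 and 8.0, the boundaries between coefficient intervals.  */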
static inline uint32_t
interval_index (uint32_t ia12)
{
  // clang-format off
  return (ia12 < 0x400 ? 0 :
         (ia12 < 0x408 ? 1 :
         (ia12 < 0x410 ? 2 :
                         3)));
  // clang-format on
}

/* The C macro wraps the coeffs argument in order to make the
   polynomial evaluation more readable. In the scalarised variant the
   second pointer is ignored.  */
#ifdef SCALAR
#define C(i) coeff1[i]
#else
#define C(i) ((v_f64_t){coeff1[i], coeff2[i]})
#endif

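/* Evaluate the polynomial with the Estrin scheme and multiply by the Gaussian
   term exp (-x^2), computed in double precision by the vector exp_tail
   helper.  */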
static inline v_f64_t
v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
                           const double *coeff2)
{
  v_f64_t x2 = x * x;
  v_f64_t x4 = x2 * x2;
  v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C);
  v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0));
  return poly * gauss;
}

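/* Scalar helper: evaluate poly (|x|) * exp (-x^2) in double precision and
   narrow the result to single precision.  */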
static inline float
approx_poly_gauss (float abs_x, const double *coeff)
{
  return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x));
}

static v_f32_t
v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes)
{
#ifdef SCALAR
  float y = approx_poly_gauss (abs_x, P (ia12));
  return sign ? 2 - y : y;
#else
  float32x2_t lo32 = {0, 0};
  float32x2_t hi32 = {0, 0};
  /* The polynomial and Gaussian components must be calculated in
     double precision in order to meet the required ULP error. This
     means we have to promote low and high halves of the
     single-precision input vector to two separate double-precision
     input vectors. This incurs some overhead, and there is also
     overhead to loading the polynomial coefficients as this cannot be
     done in a vector fashion. This would be wasted effort for
     elements which lie in the 'boring' zone, as they will be
     overwritten later. Hence we use the lanes parameter to only do
     the promotion on a pair of lanes if both of those lanes are
     interesting and not special cases. If one lane is inactive, we
     use a scalar routine which is shared with the scalar variant.  */
  if (lanes[0] & lanes[1])
    {
      lo32 = vcvt_f32_f64 (
          v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)),
                                     P (ia12[0]), P (ia12[1])));
    }
  else if (lanes[0])
    {
      lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0]));
    }
  else if (lanes[1])
    {
      lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1]));
    }

  if (lanes[2] & lanes[3])
    {
      hi32
          = vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x),
                                                     P (ia12[2]), P (ia12[3])));
    }
  else if (lanes[2])
    {
      hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2]));
    }
  else if (lanes[3])
    {
      hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3]));
    }

  v_f32_t y = vcombine_f32 (lo32, hi32);

  if (v_any_u32 (sign))
    {
      y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y);
    }

  return y;
#endif
}

/* Optimized single-precision vector complementary error function
   erfcf. Max measured error: 0.750092 at various values between
   -0x1.06521p-20 and -0x1.add1dap-17. For example:
   __v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0
   +0.249908 ulp err 0.250092.  */
VPCS_ATTR
v_f32_t V_NAME (erfcf) (v_f32_t x)
{
  v_u32_t ix = v_as_u32_f32 (x);
  v_u32_t ia = ix & 0x7fffffff;
  v_u32_t ia12 = ia >> 20;
  v_u32_t sign = ix >> 31;
  v_u32_t inf_ia12 = v_u32 (0x7f8);

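  /* Lanes needing the fallback path: tiny |x| < 0x1p-26, and non-finite
     inputs with ia12 >= 0x7f8, folded into a single unsigned comparison.  */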
  v_u32_t special_cases
      = v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328));
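  /* Lanes are in bounds if |x| is below about 4.4 (0x408ccccd), or if x is
     non-negative and below about 10.06 (0x4120f5c3), beyond which the result
     underflows.  */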
  v_u32_t in_bounds
      = v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3)));
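  /* Value returned for out-of-bounds lanes: reinterpreting sign << 30 as a
     float gives exactly 2.0f for negative x (where erfc saturates at 2) and
     0.0f for large positive x.  */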
  v_f32_t boring_zone = v_as_f32_u32 (sign << 30);

#ifdef SCALAR
  if (unlikely (special_cases))
    {
      if (ia12 >= 0x7f8)
        return (float) (sign << 1) + 1.0f / x; /* Special cases.  */
      else
        return 1.0f - x; /* Small case.  */
    }
  else if (likely (!in_bounds))
    {
      return sign ? boring_zone : __math_uflowf (boring_zone);
    }
#endif

  v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12,
                              in_bounds & ~special_cases);

#ifndef SCALAR
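  /* Select the boring value for lanes that were out of bounds; special lanes
     are corrected by the scalar fallback below.  */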
  y = vbslq_f32 (~in_bounds, boring_zone, y);

  if (unlikely (v_any_u32 (special_cases)))
    {
      return specialcase (x, y, special_cases);
    }
#endif

  return y;
}
VPCS_ALIAS

PL_SIG (V, F, 1, erfc, -6.0, 28.0)
PL_TEST_ULP (V_NAME (erfcf), 0.26)
PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000)
PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000)
PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000)
PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000)
PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000)
PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000)
#endif