1 /*
2 * Single-precision erf(x) function.
3 *
4 * Copyright (c) 2020, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8 #include <stdint.h>
9 #include <math.h>
10 #include "math_config.h"
11
12 #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
13 #define A __erff_data.erff_poly_A
14 #define B __erff_data.erff_poly_B
15
16 /* Top 12 bits of a float. */
17 static inline uint32_t
top12(float x)18 top12 (float x)
19 {
20 return asuint (x) >> 20;
21 }
22
23 /* Efficient implementation of erff
24 using either a pure polynomial approximation or
25 the exponential of a polynomial.
26 Worst-case error is 1.09ulps at 0x1.c111acp-1. */
27 float
erff(float x)28 erff (float x)
29 {
30 float r, x2, u;
31
32 /* Get top word. */
33 uint32_t ix = asuint (x);
34 uint32_t sign = ix >> 31;
35 uint32_t ia12 = top12 (x) & 0x7ff;
36
37 /* Limit of both intervals is 0.875 for performance reasons but coefficients
38 computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
39 from 0.94 to 1.1ulps. */
40 if (ia12 < 0x3f6)
41 { /* a = |x| < 0.875. */
42
43 /* Tiny and subnormal cases. */
44 if (unlikely (ia12 < 0x318))
45 { /* |x| < 2^(-28). */
46 if (unlikely (ia12 < 0x040))
47 { /* |x| < 2^(-119). */
48 float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
49 return check_uflowf (y);
50 }
51 return x + TwoOverSqrtPiMinusOne * x;
52 }
53
54 x2 = x * x;
55
56 /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2). */
57 r = A[5];
58 r = fmaf (r, x2, A[4]);
59 r = fmaf (r, x2, A[3]);
60 r = fmaf (r, x2, A[2]);
61 r = fmaf (r, x2, A[1]);
62 r = fmaf (r, x2, A[0]);
63 r = fmaf (r, x, x);
64 }
65 else if (ia12 < 0x408)
66 { /* |x| < 4.0 - Use a custom Estrin scheme. */
67
68 float a = fabsf (x);
69 /* Start with Estrin scheme on high order (small magnitude) coefficients. */
70 r = fmaf (B[6], a, B[5]);
71 u = fmaf (B[4], a, B[3]);
72 x2 = x * x;
73 r = fmaf (r, x2, u);
74 /* Then switch to pure Horner scheme. */
75 r = fmaf (r, a, B[2]);
76 r = fmaf (r, a, B[1]);
77 r = fmaf (r, a, B[0]);
78 r = fmaf (r, a, a);
79 /* Single precision exponential with ~0.5ulps,
80 ensures erff has max. rel. error
81 < 1ulp on [0.921875, 4.0],
82 < 1.1ulps on [0.875, 4.0]. */
83 r = expf (-r);
84 /* Explicit copysign (calling copysignf increases latency). */
85 if (sign)
86 r = -1.0f + r;
87 else
88 r = 1.0f - r;
89 }
90 else
91 { /* |x| >= 4.0. */
92
93 /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */
94 if (unlikely (ia12 >= 0x7f8))
95 return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
96
97 /* Explicit copysign (calling copysignf increases latency). */
98 if (sign)
99 r = -1.0f;
100 else
101 r = 1.0f;
102 }
103 return r;
104 }
105