• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-sigmoid/psimd-p5-div.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2019 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <psimd.h>
13 
14 #include <xnnpack/common.h>
15 #include <xnnpack/vunary.h>
16 
17 
xnn_f32_sigmoid_ukernel__psimd_p5_div_x16(size_t n,const float * x,float * y,const void * params)18 void xnn_f32_sigmoid_ukernel__psimd_p5_div_x16(
19     size_t n,
20     const float* x,
21     float* y,
22     const void* params)
23 {
24   assert(n % sizeof(float) == 0);
25 
26   const psimd_f32 vmagic_bias = psimd_splat_f32(0x1.8000FEp23f);
27   // The largest z for which sigmoidf(-z) is normalized.
28   // This number is also the largest z for which expf(-z) is normalized.
29   const psimd_f32 vdenorm_cutoff = psimd_splat_f32(0x1.5D589Ep+6f);
30   const psimd_f32 vminus_log2e = psimd_splat_f32(-0x1.715476p+0f);
31   // Last 7 bits are zeroes
32   const psimd_f32 vln2_hi = psimd_splat_f32(0x1.62E400p-1f);
33   const psimd_f32 vln2_lo = psimd_splat_f32(0x1.7F7D1Cp-20f);
34   const psimd_f32 vone = psimd_splat_f32(1.0f);
35 
36   const psimd_f32 vc1 = psimd_splat_f32(-0x1.FFFFF6p-1f);
37   const psimd_f32 vc2 = psimd_splat_f32( 0x1.FFFDC6p-2f);
38   const psimd_f32 vc3 = psimd_splat_f32(-0x1.555A80p-3f);
39   const psimd_f32 vc4 = psimd_splat_f32( 0x1.573A1Ap-5f);
40   const psimd_f32 vc5 = psimd_splat_f32(-0x1.0F9F9Cp-7f);
41 
42   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
43     const psimd_f32 vx0123 = psimd_load_f32(x);
44     const psimd_f32 vx4567 = psimd_load_f32(x + 4);
45     const psimd_f32 vx89AB = psimd_load_f32(x + 8);
46     const psimd_f32 vxCDEF = psimd_load_f32(x + 12);
47     x += 16;
48 
49     // General structure of the algorithm:
50     //           / exp(x) / (1 + exp(x)) if x <= 0
51     //   f[x] :=
52     //           \ 1 - f[-x] if x >= 0
53     //
54     // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
55     // then replace result with 1 - f[-z] if x >= 0.
56     const psimd_f32 vz0123 = psimd_abs_f32(vx0123);
57     const psimd_f32 vz4567 = psimd_abs_f32(vx4567);
58     const psimd_f32 vz89AB = psimd_abs_f32(vx89AB);
59     const psimd_f32 vzCDEF = psimd_abs_f32(vxCDEF);
60 
61     // Compute reduced argument n := round(-z / log(2)).
62     // We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
63     // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
64     // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
65     // inputs x outside of [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x)
66     // anyway. We fixup the result for such inputs at the very end of the algorithm.
67     psimd_f32 vn0123 = psimd_qfma_f32(vmagic_bias, vz0123, vminus_log2e);
68     psimd_f32 vn4567 = psimd_qfma_f32(vmagic_bias, vz4567, vminus_log2e);
69     psimd_f32 vn89AB = psimd_qfma_f32(vmagic_bias, vz89AB, vminus_log2e);
70     psimd_f32 vnCDEF = psimd_qfma_f32(vmagic_bias, vzCDEF, vminus_log2e);
71 
72     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
73     // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
74     const psimd_f32 vs0123 = (psimd_f32) ((psimd_u32) vn0123 << 23);
75     const psimd_f32 vs4567 = (psimd_f32) ((psimd_u32) vn4567 << 23);
76     const psimd_f32 vs89AB = (psimd_f32) ((psimd_u32) vn89AB << 23);
77     const psimd_f32 vsCDEF = (psimd_f32) ((psimd_u32) vnCDEF << 23);
78 
79     // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
80     vn0123 = psimd_sub_f32(vn0123, vmagic_bias);
81     vn4567 = psimd_sub_f32(vn4567, vmagic_bias);
82     vn89AB = psimd_sub_f32(vn89AB, vmagic_bias);
83     vnCDEF = psimd_sub_f32(vnCDEF, vmagic_bias);
84 
85     // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
86     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
87     psimd_f32 vt0123 = psimd_qfma_f32(vz0123, vn0123, vln2_hi);
88     psimd_f32 vt4567 = psimd_qfma_f32(vz4567, vn4567, vln2_hi);
89     psimd_f32 vt89AB = psimd_qfma_f32(vz89AB, vn89AB, vln2_hi);
90     psimd_f32 vtCDEF = psimd_qfma_f32(vzCDEF, vnCDEF, vln2_hi);
91 
92     vt0123 = psimd_qfma_f32(vt0123, vn0123, vln2_lo);
93     vt4567 = psimd_qfma_f32(vt4567, vn4567, vln2_lo);
94     vt89AB = psimd_qfma_f32(vt89AB, vn89AB, vln2_lo);
95     vtCDEF = psimd_qfma_f32(vtCDEF, vnCDEF, vln2_lo);
96 
97     // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
98     //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
99     psimd_f32 vp0123 = psimd_qfma_f32(vc4, vt0123, vc5);
100     psimd_f32 vp4567 = psimd_qfma_f32(vc4, vt4567, vc5);
101     psimd_f32 vp89AB = psimd_qfma_f32(vc4, vt89AB, vc5);
102     psimd_f32 vpCDEF = psimd_qfma_f32(vc4, vtCDEF, vc5);
103 
104     vp0123 = psimd_qfma_f32(vc3, vt0123, vp0123);
105     vp4567 = psimd_qfma_f32(vc3, vt4567, vp4567);
106     vp89AB = psimd_qfma_f32(vc3, vt89AB, vp89AB);
107     vpCDEF = psimd_qfma_f32(vc3, vtCDEF, vpCDEF);
108 
109     vp0123 = psimd_qfma_f32(vc2, vt0123, vp0123);
110     vp4567 = psimd_qfma_f32(vc2, vt4567, vp4567);
111     vp89AB = psimd_qfma_f32(vc2, vt89AB, vp89AB);
112     vpCDEF = psimd_qfma_f32(vc2, vtCDEF, vpCDEF);
113 
114     vp0123 = psimd_qfma_f32(vc1, vt0123, vp0123);
115     vp4567 = psimd_qfma_f32(vc1, vt4567, vp4567);
116     vp89AB = psimd_qfma_f32(vc1, vt89AB, vp89AB);
117     vpCDEF = psimd_qfma_f32(vc1, vtCDEF, vpCDEF);
118 
119     // Reconstruct the exp(-z) value:
120     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
121     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
122     //     = s + (t * s) * p
123     vt0123 = psimd_mul_f32(vt0123, vs0123);
124     vt4567 = psimd_mul_f32(vt4567, vs4567);
125     vt89AB = psimd_mul_f32(vt89AB, vs89AB);
126     vtCDEF = psimd_mul_f32(vtCDEF, vsCDEF);
127 
128     const psimd_f32 ve0123 = psimd_qfma_f32(vs0123, vt0123, vp0123);
129     const psimd_f32 ve4567 = psimd_qfma_f32(vs4567, vt4567, vp4567);
130     const psimd_f32 ve89AB = psimd_qfma_f32(vs89AB, vt89AB, vp89AB);
131     const psimd_f32 veCDEF = psimd_qfma_f32(vsCDEF, vtCDEF, vpCDEF);
132 
133     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
134     psimd_f32 vf0123 = psimd_div_f32(ve0123, psimd_add_f32(ve0123, vone));
135     psimd_f32 vf4567 = psimd_div_f32(ve4567, psimd_add_f32(ve4567, vone));
136     psimd_f32 vf89AB = psimd_div_f32(ve89AB, psimd_add_f32(ve89AB, vone));
137     psimd_f32 vfCDEF = psimd_div_f32(veCDEF, psimd_add_f32(veCDEF, vone));
138 
139     // For inputs above denormal cutoff, replace output with +0.0f.
140     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
141     vf0123 = psimd_andnotmask_f32(vz0123 > vdenorm_cutoff, vf0123);
142     vf4567 = psimd_andnotmask_f32(vz4567 > vdenorm_cutoff, vf4567);
143     vf89AB = psimd_andnotmask_f32(vz89AB > vdenorm_cutoff, vf89AB);
144     vfCDEF = psimd_andnotmask_f32(vzCDEF > vdenorm_cutoff, vfCDEF);
145 
146     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
147     vf0123 = psimd_signblend_f32(vx0123, vf0123, psimd_sub_f32(vone, vf0123));
148     vf4567 = psimd_signblend_f32(vx4567, vf4567, psimd_sub_f32(vone, vf4567));
149     vf89AB = psimd_signblend_f32(vx89AB, vf89AB, psimd_sub_f32(vone, vf89AB));
150     vfCDEF = psimd_signblend_f32(vxCDEF, vfCDEF, psimd_sub_f32(vone, vfCDEF));
151 
152     psimd_store_f32(y, vf0123);
153     psimd_store_f32(y + 4, vf4567);
154     psimd_store_f32(y + 8, vf89AB);
155     psimd_store_f32(y + 12, vfCDEF);
156     y += 16;
157   }
158   for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
159     const psimd_f32 vx = psimd_load_f32(x);
160     x += 4;
161 
162     // General structure of the algorithm:
163     //           / exp(x) / (1 + exp(x)) if x <= 0
164     //   f[x] :=
165     //           \ 1 - f[-x] if x >= 0
166     //
167     // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
168     // then replace result with 1 - f[-z] if x >= 0.
169     const psimd_f32 vz = psimd_abs_f32(vx);
170 
171     // Compute reduced argument n := round(-z / log(2)).
172     // We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
173     // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
174     // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
175     // inputs x outside of [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x)
176     // anyway. We fixup the result for such inputs at the very end of the algorithm.
177     psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
178 
179     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
180     // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
181     const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
182 
183     // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
184     vn = psimd_sub_f32(vn, vmagic_bias);
185 
186     // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
187     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
188     psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
189     vt = psimd_qfma_f32(vt, vn, vln2_lo);
190 
191     // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
192     //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
193     psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
194     vp = psimd_qfma_f32(vc3, vt, vp);
195     vp = psimd_qfma_f32(vc2, vt, vp);
196     vp = psimd_qfma_f32(vc1, vt, vp);
197 
198     // Reconstruct the exp(-z) value:
199     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
200     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
201     //     = s + (t * s) * p
202     vt = psimd_mul_f32(vt, vs);
203     const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
204 
205     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
206     psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
207 
208     // For inputs above denormal cutoff, replace output with +0.0f.
209     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
210     vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
211 
212     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
213     vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
214 
215     psimd_store_f32(y, vf);
216     y += 4;
217   }
218   if XNN_UNLIKELY(n != 0) {
219     const psimd_f32 vx = psimd_load_f32(x);
220 
221     // General structure of the algorithm:
222     //           / exp(x) / (1 + exp(x)) if x <= 0
223     //   f[x] :=
224     //           \ 1 - f[-x] if x >= 0
225     //
226     // First we compute f[-z] := exp(-z) / (1 + exp(-z)) where z = abs(x),
227     // then replace result with 1 - f[-z] if x >= 0.
228     const psimd_f32 vz = psimd_abs_f32(vx);
229 
230     // Compute reduced argument n := round(-z / log(2)).
231     // We do it by adding a large number (magic bias), which cause rounding of result to an integer, then subtracing the
232     // large number back. The first addition is combined with multiplication by log2e into a single FMA instruction.
233     // The trick with adding large number is valid only within certain bounds (|x| <= 2**22), but thats ok, because
234     // inputs x outside of [-87.336544, 17.328678] (i.e. z outsize [0, 87.336544]) underflow or saturate sigmoidf(x)
235     // anyway. We fixup the result for such inputs at the very end of the algorithm.
236     psimd_f32 vn = psimd_qfma_f32(vmagic_bias, vz, vminus_log2e);
237 
238     // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
239     // -87.336544 <= -z <= 0.0, and -126 <= n <= 0 accordingly.
240     const psimd_f32 vs = (psimd_f32) ((psimd_u32) vn << 23);
241 
242     // Subtract the large number back to get the final n := round(-z / log(2)) as a floating-point number.
243     vn = psimd_sub_f32(vn, vmagic_bias);
244 
245     // Compute reduced argument t := z + n * log(2). Note that -t = -z - n * log(2).
246     // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
247     psimd_f32 vt = psimd_qfma_f32(vz, vn, vln2_hi);
248     vt = psimd_qfma_f32(vt, vn, vln2_lo);
249 
250     // Compute degree-5 polynomial approximation for exp(-t) on [-log(2)/2, log(2)/2]:
251     //   P5(t) = 1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
252     psimd_f32 vp = psimd_qfma_f32(vc4, vt, vc5);
253     vp = psimd_qfma_f32(vc3, vt, vp);
254     vp = psimd_qfma_f32(vc2, vt, vp);
255     vp = psimd_qfma_f32(vc1, vt, vp);
256 
257     // Reconstruct the exp(-z) value:
258     //   e = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
259     //     = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
260     //     = s + (t * s) * p
261     vt = psimd_mul_f32(vt, vs);
262     const psimd_f32 ve = psimd_qfma_f32(vs, vt, vp);
263 
264     // Reconstruct sigmoid(-z) = exp(-z) / (1.0 + exp(-z))
265     psimd_f32 vf = psimd_div_f32(ve, psimd_add_f32(ve, vone));
266 
267     // For inputs above denormal cutoff, replace output with +0.0f.
268     // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
269     vf = psimd_andnotmask_f32(vz > vdenorm_cutoff, vf);
270 
271     // Reconstruct sigmoid(x) = x < 0 ? sigmoid(-z) : 1.0 - sigmoid(-z)
272     vf = psimd_signblend_f32(vx, vf, psimd_sub_f32(vone, vf));
273 
274     if (n & (2 * sizeof(float))) {
275       psimd_store2_f32(y, vf);
276       vf = psimd_concat_hi_f32(vf, vf);
277       y += 2;
278     }
279     if (n & (1 * sizeof(float))) {
280       psimd_store1_f32(y, vf);
281     }
282   }
283 }
284