1 /******************************************************************************
2 * @file arm_vec_math_f16.h
3 * @brief Public header file for CMSIS DSP Library
4 * @version V1.10.0
5 * @date 08 July 2021
6 * Target Processor: Cortex-M and Cortex-A cores
7 ******************************************************************************/
8 /*
9 * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
10 *
11 * SPDX-License-Identifier: Apache-2.0
12 *
13 * Licensed under the Apache License, Version 2.0 (the License); you may
14 * not use this file except in compliance with the License.
15 * You may obtain a copy of the License at
16 *
17 * www.apache.org/licenses/LICENSE-2.0
18 *
19 * Unless required by applicable law or agreed to in writing, software
20 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
21 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 * See the License for the specific language governing permissions and
23 * limitations under the License.
24 */
25
26 #ifndef _ARM_VEC_MATH_F16_H
27 #define _ARM_VEC_MATH_F16_H
28
29 #include "arm_math_types_f16.h"
30 #include "arm_common_tables_f16.h"
31 #include "arm_helium_utils.h"
32
33 #ifdef __cplusplus
34 extern "C"
35 {
36 #endif
37
38 #if defined(ARM_FLOAT16_SUPPORTED)
39
40
41 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
42
43
44 static const float16_t __logf_rng_f16=0.693147180f16;
45
46 /* fast inverse approximation (3x newton) */
vrecip_medprec_f16(f16x8_t x)47 __STATIC_INLINE f16x8_t vrecip_medprec_f16(
48 f16x8_t x)
49 {
50 q15x8_t m;
51 f16x8_t b;
52 any16x8_t xinv;
53 f16x8_t ax = vabsq(x);
54
55 xinv.f = ax;
56
57 m = 0x03c00 - (xinv.i & 0x07c00);
58 xinv.i = xinv.i + m;
59 xinv.f = 1.41176471f16 - 0.47058824f16 * xinv.f;
60 xinv.i = xinv.i + m;
61
62 b = 2.0f16 - xinv.f * ax;
63 xinv.f = xinv.f * b;
64
65 b = 2.0f16 - xinv.f * ax;
66 xinv.f = xinv.f * b;
67
68 b = 2.0f16 - xinv.f * ax;
69 xinv.f = xinv.f * b;
70
71 xinv.f = vdupq_m_n_f16(xinv.f, F16INFINITY, vcmpeqq_n_f16(x, 0.0f));
72 /*
73 * restore sign
74 */
75 xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq_n_f16(x, 0.0f));
76
77 return xinv.f;
78 }
79
80 /* fast inverse approximation (4x newton) */
vrecip_hiprec_f16(f16x8_t x)81 __STATIC_INLINE f16x8_t vrecip_hiprec_f16(
82 f16x8_t x)
83 {
84 q15x8_t m;
85 f16x8_t b;
86 any16x8_t xinv;
87 f16x8_t ax = vabsq(x);
88
89 xinv.f = ax;
90
91 m = 0x03c00 - (xinv.i & 0x07c00);
92 xinv.i = xinv.i + m;
93 xinv.f = 1.41176471f16 - 0.47058824f16 * xinv.f;
94 xinv.i = xinv.i + m;
95
96 b = 2.0f16 - xinv.f * ax;
97 xinv.f = xinv.f * b;
98
99 b = 2.0f16 - xinv.f * ax;
100 xinv.f = xinv.f * b;
101
102 b = 2.0f16 - xinv.f * ax;
103 xinv.f = xinv.f * b;
104
105 b = 2.0f16 - xinv.f * ax;
106 xinv.f = xinv.f * b;
107
108 xinv.f = vdupq_m_n_f16(xinv.f, F16INFINITY, vcmpeqq_n_f16(x, 0.0f));
109 /*
110 * restore sign
111 */
112 xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq_n_f16(x, 0.0f));
113
114 return xinv.f;
115 }
116
vdiv_f16(f16x8_t num,f16x8_t den)117 __STATIC_INLINE f16x8_t vdiv_f16(
118 f16x8_t num, f16x8_t den)
119 {
120 return vmulq(num, vrecip_hiprec_f16(den));
121 }
122
123
124 /**
125 @brief Single-precision taylor dev.
126 @param[in] x f16 vector input
127 @param[in] coeffs f16 vector coeffs
128 @return destination f16 vector
129 */
130
vtaylor_polyq_f16(float16x8_t x,const float16_t * coeffs)131 __STATIC_INLINE float16x8_t vtaylor_polyq_f16(
132 float16x8_t x,
133 const float16_t * coeffs)
134 {
135 float16x8_t A = vfmasq(vdupq_n_f16(coeffs[4]), x, coeffs[0]);
136 float16x8_t B = vfmasq(vdupq_n_f16(coeffs[6]), x, coeffs[2]);
137 float16x8_t C = vfmasq(vdupq_n_f16(coeffs[5]), x, coeffs[1]);
138 float16x8_t D = vfmasq(vdupq_n_f16(coeffs[7]), x, coeffs[3]);
139 float16x8_t x2 = vmulq(x, x);
140 float16x8_t x4 = vmulq(x2, x2);
141 float16x8_t res = vfmaq(vfmaq_f16(A, B, x2), vfmaq_f16(C, D, x2), x4);
142
143 return res;
144 }
145
146 #define VMANT_EXP_F16(x) \
147 any16x8_t r; \
148 int16x8_t n; \
149 \
150 r.f = x; \
151 n = r.i >> 10; \
152 n = n - 15; \
153 r.i = r.i - (n << 10);\
154 \
155 vecExpUnBiased = n; \
156 vecTmpFlt1 = r.f;
157
vlogq_f16(float16x8_t vecIn)158 __STATIC_INLINE float16x8_t vlogq_f16(float16x8_t vecIn)
159 {
160 q15x8_t vecExpUnBiased;
161 float16x8_t vecTmpFlt0, vecTmpFlt1;
162 float16x8_t vecAcc0, vecAcc1, vecAcc2, vecAcc3;
163 float16x8_t vecExpUnBiasedFlt;
164
165 /*
166 * extract exponent
167 */
168 VMANT_EXP_F16(vecIn);
169
170 vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
171 /*
172 * a = (__logf_lut_f16[4] * r.f) + (__logf_lut_f16[0]);
173 */
174 vecAcc0 = vdupq_n_f16(__logf_lut_f16[0]);
175 vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f16[4]);
176 /*
177 * b = (__logf_lut_f16[6] * r.f) + (__logf_lut_f16[2]);
178 */
179 vecAcc1 = vdupq_n_f16(__logf_lut_f16[2]);
180 vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f16[6]);
181 /*
182 * c = (__logf_lut_f16[5] * r.f) + (__logf_lut_f16[1]);
183 */
184 vecAcc2 = vdupq_n_f16(__logf_lut_f16[1]);
185 vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f16[5]);
186 /*
187 * d = (__logf_lut_f16[7] * r.f) + (__logf_lut_f16[3]);
188 */
189 vecAcc3 = vdupq_n_f16(__logf_lut_f16[3]);
190 vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f16[7]);
191 /*
192 * a = a + b * xx;
193 */
194 vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
195 /*
196 * c = c + d * xx;
197 */
198 vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
199 /*
200 * xx = xx * xx;
201 */
202 vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
203 vecExpUnBiasedFlt = vcvtq_f16_s16(vecExpUnBiased);
204 /*
205 * r.f = a + c * xx;
206 */
207 vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
208 /*
209 * add exponent
210 * r.f = r.f + ((float32_t) m) * __logf_rng_f16;
211 */
212 vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f16);
213 // set log0 down to -inf
214 vecAcc0 = vdupq_m_n_f16(vecAcc0, -(_Float16)F16INFINITY, vcmpeqq_n_f16(vecIn, 0.0f));
215 return vecAcc0;
216 }
217
vexpq_f16(float16x8_t x)218 __STATIC_INLINE float16x8_t vexpq_f16(
219 float16x8_t x)
220 {
221 // Perform range reduction [-log(2),log(2)]
222 int16x8_t m = vcvtq_s16_f16(vmulq_n_f16(x, 1.4426950408f16));
223 float16x8_t val = vfmsq_f16(x, vcvtq_f16_s16(m), vdupq_n_f16(0.6931471805f16));
224
225 // Polynomial Approximation
226 float16x8_t poly = vtaylor_polyq_f16(val, exp_tab_f16);
227
228 // Reconstruct
229 poly = (float16x8_t) (vqaddq_s16((int16x8_t) (poly), vqshlq_n_s16(m, 10)));
230
231 poly = vdupq_m_n_f16(poly, 0.0f16, vcmpltq_n_s16(m, -14));
232 return poly;
233 }
234
arm_vec_exponent_f16(float16x8_t x,int16_t nb)235 __STATIC_INLINE float16x8_t arm_vec_exponent_f16(float16x8_t x, int16_t nb)
236 {
237 float16x8_t r = x;
238 nb--;
239 while (nb > 0) {
240 r = vmulq(r, x);
241 nb--;
242 }
243 return (r);
244 }
245
vpowq_f16(f16x8_t val,f16x8_t n)246 __STATIC_INLINE f16x8_t vpowq_f16(
247 f16x8_t val,
248 f16x8_t n)
249 {
250 return vexpq_f16(vmulq_f16(n, vlogq_f16(val)));
251 }
252
253 #define INV_NEWTON_INIT_F16 0x7773
254
vrecip_f16(f16x8_t vecIn)255 __STATIC_INLINE f16x8_t vrecip_f16(f16x8_t vecIn)
256 {
257 f16x8_t vecSx, vecW, vecTmp;
258 any16x8_t v;
259
260 vecSx = vabsq(vecIn);
261
262 v.f = vecIn;
263 v.i = vsubq(vdupq_n_s16(INV_NEWTON_INIT_F16), v.i);
264
265 vecW = vmulq(vecSx, v.f);
266
267 // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w *(56 + w * (-28 + w * (8 - w)))))));
268 vecTmp = vsubq(vdupq_n_f16(8.0f16), vecW);
269 vecTmp = vfmasq_n_f16(vecW, vecTmp, -28.0f16);
270 vecTmp = vfmasq_n_f16(vecW, vecTmp, 56.0f16);
271 vecTmp = vfmasq_n_f16(vecW, vecTmp, -70.0f16);
272 vecTmp = vfmasq_n_f16(vecW, vecTmp, 56.0f16);
273 vecTmp = vfmasq_n_f16(vecW, vecTmp, -28.0f16);
274 vecTmp = vfmasq_n_f16(vecW, vecTmp, 8.0f16);
275 v.f = vmulq(v.f, vecTmp);
276
277 v.f = vdupq_m_n_f16(v.f, F16INFINITY, vcmpeqq_n_f16(vecIn, 0.0f));
278 /*
279 * restore sign
280 */
281 v.f = vnegq_m(v.f, v.f, vcmpltq_n_f16(vecIn, 0.0f));
282 return v.f;
283 }
284
vtanhq_f16(f16x8_t val)285 __STATIC_INLINE f16x8_t vtanhq_f16(
286 f16x8_t val)
287 {
288 f16x8_t x =
289 vminnmq_f16(vmaxnmq_f16(val, vdupq_n_f16(-10.f16)), vdupq_n_f16(10.0f16));
290 f16x8_t exp2x = vexpq_f16(vmulq_n_f16(x, 2.f16));
291 f16x8_t num = vsubq_n_f16(exp2x, 1.f16);
292 f16x8_t den = vaddq_n_f16(exp2x, 1.f16);
293 f16x8_t tanh = vmulq_f16(num, vrecip_f16(den));
294 return tanh;
295 }
296
297 #endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)*/
298
299
300
301 #ifdef __cplusplus
302 }
303 #endif
304
305 #endif /* ARM FLOAT16 SUPPORTED */
306
307 #endif /* _ARM_VEC_MATH_F16_H */
308
309 /**
310 *
311 * End of file.
312 */
313