/******************************************************************************
 * @file     arm_vec_math.h
 * @brief    Public header file for CMSIS DSP Library
 * @version  V1.7.0
 * @date     15. October 2019
 ******************************************************************************/
/*
 * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _ARM_VEC_MATH_H
#define _ARM_VEC_MATH_H

#include "arm_math.h"
#include "arm_common_tables.h"
#include "arm_helium_utils.h"

#ifdef   __cplusplus
extern "C"
{
#endif

#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)

#define INV_NEWTON_INIT_F32  0x7EF127EA

static const float32_t __logf_rng_f32 = 0.693147180f;

/* fast inverse approximation (3x newton) */
__STATIC_INLINE f32x4_t vrecip_medprec_f32(
    f32x4_t x)
{
    q31x4_t   m;
    f32x4_t   b;
    any32x4_t xinv;
    f32x4_t   ax = vabsq(x);

    xinv.f = ax;
    /* rescale |x| into [1,2); m keeps the negated exponent as a correction term */
    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    /* linear seed for the reciprocal of the mantissa: 24/17 - 8/17 * m */
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    /* reapply the exponent correction to scale by 2^-exponent */
    xinv.i = xinv.i + m;

    /* Newton-Raphson refinement, x <- x * (2 - a*x), 3 iterations */
    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* 1/0 = +INF */
    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}
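
/*
  Usage sketch (illustrative only, not part of the API); the `in` and `out`
  buffers are hypothetical:

      float32_t in[4] = { 2.0f, -4.0f, 0.5f, 8.0f };
      float32_t out[4];
      f32x4_t   v = vld1q(in);
      vst1q(out, vrecip_medprec_f32(v));   // out ~ { 0.5f, -0.25f, 2.0f, 0.125f }
*/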

/* fast inverse approximation (4x newton) */
__STATIC_INLINE f32x4_t vrecip_hiprec_f32(
    f32x4_t x)
{
    q31x4_t   m;
    f32x4_t   b;
    any32x4_t xinv;
    f32x4_t   ax = vabsq(x);

    xinv.f = ax;

    /* same exponent trick and linear seed as vrecip_medprec_f32() */
    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    xinv.i = xinv.i + m;

    /* Newton-Raphson refinement, 4 iterations */
    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* 1/0 = +INF */
    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}

__STATIC_INLINE f32x4_t vdiv_f32(
    f32x4_t num, f32x4_t den)
{
    return vmulq(num, vrecip_hiprec_f32(den));
}
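
/*
  vdiv_f32() approximates num / den by multiplying with the Newton-refined
  reciprocal of den; it is fast but not correctly rounded IEEE 754 division.
  A minimal sketch (result is approximate):

      f32x4_t q = vdiv_f32(vdupq_n_f32(1.0f), vdupq_n_f32(3.0f));  // ~0.3333333f per lane
*/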

/**
  @brief         Single-precision Taylor polynomial evaluation.
  @param[in]     x          f32 quad vector input
  @param[in]     coeffs     pointer to the 8 polynomial coefficients
  @return        destination f32 quad vector
 */

__STATIC_INLINE f32x4_t vtaylor_polyq_f32(
    f32x4_t           x,
    const float32_t * coeffs)
{
    f32x4_t A   = vfmasq(vdupq_n_f32(coeffs[4]), x, coeffs[0]);
    f32x4_t B   = vfmasq(vdupq_n_f32(coeffs[6]), x, coeffs[2]);
    f32x4_t C   = vfmasq(vdupq_n_f32(coeffs[5]), x, coeffs[1]);
    f32x4_t D   = vfmasq(vdupq_n_f32(coeffs[7]), x, coeffs[3]);
    f32x4_t x2  = vmulq(x, x);
    f32x4_t x4  = vmulq(x2, x2);
    f32x4_t res = vfmaq(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4);

    return res;
}
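
/*
  For reference, a scalar sketch of the evaluation order used above: the
  degree-7 polynomial is split Estrin-style so the four leading FMAs are
  independent of each other. Note the permuted coefficient indexing:

      A   = coeffs[0] + coeffs[4] * x;
      B   = coeffs[2] + coeffs[6] * x;
      C   = coeffs[1] + coeffs[5] * x;
      D   = coeffs[3] + coeffs[7] * x;
      x2  = x * x;
      x4  = x2 * x2;
      res = (A + B * x2) + (C + D * x2) * x4;
*/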

__STATIC_INLINE f32x4_t vmant_exp_f32(
    f32x4_t     x,
    int32x4_t * e)
{
    any32x4_t r;
    int32x4_t n;

    r.f = x;
    /* extract the unbiased exponent and strip it from the bit pattern */
    n   = r.i >> 23;
    n   = n - 127;
    r.i = r.i - (n << 23);

    *e = n;
    return r.f;
}
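
/*
  vmant_exp_f32() is a vectorized frexp-style decomposition: for a positive
  normalized input it returns m and writes e such that x = m * 2^e with
  m in [1, 2). For example, x = 6.0f yields m = 1.5f and e = 2.
*/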


__STATIC_INLINE f32x4_t vlogq_f32(f32x4_t vecIn)
{
    q31x4_t vecExpUnBiased;
    f32x4_t vecTmpFlt0, vecTmpFlt1;
    f32x4_t vecAcc0, vecAcc1, vecAcc2, vecAcc3;
    f32x4_t vecExpUnBiasedFlt;

    /*
     * extract exponent
     */
    vecTmpFlt1 = vmant_exp_f32(vecIn, &vecExpUnBiased);

    vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
    /*
     * a = (__logf_lut_f32[4] * r.f) + (__logf_lut_f32[0]);
     */
    vecAcc0 = vdupq_n_f32(__logf_lut_f32[0]);
    vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f32[4]);
    /*
     * b = (__logf_lut_f32[6] * r.f) + (__logf_lut_f32[2]);
     */
    vecAcc1 = vdupq_n_f32(__logf_lut_f32[2]);
    vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f32[6]);
    /*
     * c = (__logf_lut_f32[5] * r.f) + (__logf_lut_f32[1]);
     */
    vecAcc2 = vdupq_n_f32(__logf_lut_f32[1]);
    vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f32[5]);
    /*
     * d = (__logf_lut_f32[7] * r.f) + (__logf_lut_f32[3]);
     */
    vecAcc3 = vdupq_n_f32(__logf_lut_f32[3]);
    vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f32[7]);
    /*
     * a = a + b * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
    /*
     * c = c + d * xx;
     */
    vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
    /*
     * xx = xx * xx;
     */
    vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
    vecExpUnBiasedFlt = vcvtq_f32_s32(vecExpUnBiased);
    /*
     * r.f = a + c * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
    /*
     * add exponent
     * r.f = r.f + ((float32_t) m) * __logf_rng_f32;
     */
    vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f32);
    // force log(0) down to -INF
    vecAcc0 = vdupq_m(vecAcc0, -INFINITY, vcmpeqq(vecIn, 0.0f));
    return vecAcc0;
}
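
/*
  Summary of vlogq_f32(): with vecIn = m * 2^e (m in [1, 2)), the code
  evaluates a degree-7 polynomial in m from __logf_lut_f32 (Estrin-style,
  as in vtaylor_polyq_f32) and reconstructs

      log(vecIn) ~ poly(m) + e * ln(2)

  using __logf_rng_f32 = ln(2). Only an input of exactly 0.0f is handled
  specially; other non-positive inputs are not.
*/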

__STATIC_INLINE f32x4_t vexpq_f32(
    f32x4_t x)
{
    // Perform range reduction: x = m * ln(2) + val, val in [-ln(2), ln(2)]
    int32x4_t m   = vcvtq_s32_f32(vmulq_n_f32(x, 1.4426950408f));
    f32x4_t   val = vfmsq_f32(x, vcvtq_f32_s32(m), vdupq_n_f32(0.6931471805f));

    // Polynomial approximation of e^val
    f32x4_t poly = vtaylor_polyq_f32(val, exp_tab);

    // Reconstruct: scale by 2^m through the exponent bits
    poly = (f32x4_t) (vqaddq_s32((q31x4_t) (poly), vqshlq_n_s32(m, 23)));

    // Flush to zero where 2^m underflows single precision
    poly = vdupq_m(poly, 0.0f, vcmpltq_n_s32(m, -126));
    return poly;
}
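
/*
  The reconstruction step above relies on the float bit layout: adding
  (m << 23) to the bit pattern of poly multiplies it by 2^m, with a
  saturating integer add (vqaddq_s32) guarding the exponent field. A
  minimal sketch (result is approximate):

      f32x4_t y = vexpq_f32(vdupq_n_f32(1.0f));   // each lane ~ 2.7182818f
*/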

__STATIC_INLINE f32x4_t arm_vec_exponent_f32(f32x4_t x, int32_t nb)
{
    f32x4_t r = x;
    nb--;
    while (nb > 0) {
        r = vmulq(r, x);
        nb--;
    }
    return (r);
}

__STATIC_INLINE f32x4_t vrecip_f32(f32x4_t vecIn)
{
    f32x4_t   vecSx, vecW, vecTmp;
    any32x4_t v;

    vecSx = vabsq(vecIn);

    v.f = vecIn;
    /* magic-constant seed for the reciprocal */
    v.i = vsubq(vdupq_n_s32(INV_NEWTON_INIT_F32), v.i);

    vecW = vmulq(vecSx, v.f);

    // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w * (56 + w * (-28 + w * (8 - w)))))));
    vecTmp = vsubq(vdupq_n_f32(8.0f), vecW);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -70.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 8.0f);
    v.f = vmulq(v.f, vecTmp);

    /* 1/0 = +INF */
    v.f = vdupq_m(v.f, INFINITY, vcmpeqq(vecIn, 0.0f));
    /*
     * restore sign
     */
    v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));
    return v.f;
}
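
/*
  The coefficients above come from a truncated geometric series: with the
  seed v ~ 1/|x|, w = |x| * v, and error e = 1 - w,

      1/|x| = v / w = v * (1 + e + e^2 + ...)
            ~ v * (1 + e + ... + e^7)
            = v * (8 - 28w + 56w^2 - 70w^3 + 56w^4 - 28w^5 + 8w^6 - w^7)

  so one polynomial pass leaves a residual of order e^8, versus e^2 for a
  single Newton step.
*/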

__STATIC_INLINE f32x4_t vtanhq_f32(
    f32x4_t val)
{
    f32x4_t x =
        vminnmq_f32(vmaxnmq_f32(val, vdupq_n_f32(-10.f)), vdupq_n_f32(10.0f));
    f32x4_t exp2x = vexpq_f32(vmulq_n_f32(x, 2.f));
    f32x4_t num   = vsubq_n_f32(exp2x, 1.f);
    f32x4_t den   = vaddq_n_f32(exp2x, 1.f);
    f32x4_t tanh  = vmulq_f32(num, vrecip_f32(den));
    return tanh;
}
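
/*
  vtanhq_f32() uses the identity tanh(x) = (e^(2x) - 1) / (e^(2x) + 1).
  The argument is clamped to [-10, 10] first: tanh already saturates to
  +/-1 within single precision there, and the clamp bounds the
  intermediate e^(2x).
*/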

__STATIC_INLINE f32x4_t vpowq_f32(
    f32x4_t val,
    f32x4_t n)
{
    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
}
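
/*
  vpowq_f32() computes val^n as e^(n * log(val)) and therefore relies on
  val > 0. For example, vpowq_f32(vdupq_n_f32(2.0f), vdupq_n_f32(0.5f))
  gives ~1.4142135f (sqrt(2)) in each lane.
*/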

#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */

#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */

#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "NEMath.h"
/**
 * @brief Vectorized integer exponentiation
 * @param[in]    x     value
 * @param[in]    nb    integer exponent >= 1
 * @return x^nb
 *
 */
__STATIC_INLINE float32x4_t arm_vec_exponent_f32(float32x4_t x, int32_t nb)
{
    float32x4_t r = x;
    nb--;
    while (nb > 0)
    {
        r = vmulq_f32(r, x);
        nb--;
    }
    return (r);
}


__STATIC_INLINE float32x4_t __arm_vec_sqrt_f32_neon(float32x4_t x)
{
    float32x4_t x1 = vmaxq_f32(x, vdupq_n_f32(FLT_MIN));
    float32x4_t e  = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return vmulq_f32(x, e);
}
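
/*
  __arm_vec_sqrt_f32_neon() starts from the hardware reciprocal square
  root estimate (vrsqrteq_f32) and refines it with two Newton-Raphson
  steps built from vrsqrtsq_f32(a, b) = (3 - a * b) / 2, then recovers
  sqrt(x) = x * (1 / sqrt(x)). Clamping the input to FLT_MIN avoids an
  infinite estimate at zero, so a zero input still yields zero after the
  final multiply by x.
*/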

__STATIC_INLINE int16x8_t __arm_vec_sqrt_q15_neon(int16x8_t vec)
{
    float32x4_t tempF;
    int32x4_t   tempHI, tempLO;

    tempLO = vmovl_s16(vget_low_s16(vec));
    tempF  = vcvtq_n_f32_s32(tempLO, 15);
    tempF  = __arm_vec_sqrt_f32_neon(tempF);
    tempLO = vcvtq_n_s32_f32(tempF, 15);

    tempHI = vmovl_s16(vget_high_s16(vec));
    tempF  = vcvtq_n_f32_s32(tempHI, 15);
    tempF  = __arm_vec_sqrt_f32_neon(tempF);
    tempHI = vcvtq_n_s32_f32(tempF, 15);

    return (vcombine_s16(vqmovn_s32(tempLO), vqmovn_s32(tempHI)));
}

__STATIC_INLINE int32x4_t __arm_vec_sqrt_q31_neon(int32x4_t vec)
{
    float32x4_t temp;

    temp = vcvtq_n_f32_s32(vec, 31);
    temp = __arm_vec_sqrt_f32_neon(temp);
    return (vcvtq_n_s32_f32(temp, 31));
}
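
/*
  The fixed-point variants reuse the float path: vcvtq_n_f32_s32(vec, 15)
  (resp. 31) maps Q15 (resp. Q31) onto [-1, 1), the square root is taken
  in single precision, and the result is converted back with the same
  fractional scaling. The Q15 version widens the low and high halves to
  32-bit lanes and narrows back with saturation (vqmovn_s32).
*/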

#endif /* (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) */

#ifdef   __cplusplus
}
#endif


#endif /* _ARM_VEC_MATH_H */

/**
 *
 * End of file.
 */