/******************************************************************************
 * @file     arm_vec_math.h
 * @brief    Public header file for CMSIS DSP Library
 * @version  V1.7.0
 * @date     15. October 2019
 ******************************************************************************/
/*
 * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _ARM_VEC_MATH_H
#define _ARM_VEC_MATH_H

#include "arm_math.h"
#include "arm_common_tables.h"
#include "arm_helium_utils.h"

#ifdef   __cplusplus
extern "C"
{
#endif

#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)

#define INV_NEWTON_INIT_F32         0x7EF127EA

static const float32_t __logf_rng_f32 = 0.693147180f;


/* fast inverse approximation (3x newton) */
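/*
 * The estimate below scales |x| into [1, 2) by subtracting its biased
 * exponent (m = 0x3F800000 - (bits & 0x7F800000)), seeds the reciprocal with
 * the linear fit 1.41176471 - 0.47058824 * x, and applies the same exponent
 * offset again so the seed carries the reciprocal's exponent.  Each
 * Newton-Raphson step
 *     x_{n+1} = x_n * (2 - |x| * x_n)
 * roughly doubles the number of accurate bits: three steps here, four in
 * vrecip_hiprec_f32().  Zero lanes are mapped to +INFINITY and the sign of x
 * is restored at the end.
 */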
__STATIC_INLINE f32x4_t vrecip_medprec_f32(
    f32x4_t x)
{
    q31x4_t         m;
    f32x4_t         b;
    any32x4_t       xinv;
    f32x4_t         ax = vabsq(x);

    xinv.f = ax;
    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    xinv.i = xinv.i + m;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}

/* fast inverse approximation (4x newton) */
__STATIC_INLINE f32x4_t vrecip_hiprec_f32(
    f32x4_t x)
{
    q31x4_t         m;
    f32x4_t         b;
    any32x4_t       xinv;
    f32x4_t         ax = vabsq(x);

    xinv.f = ax;

    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    xinv.i = xinv.i + m;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}

__STATIC_INLINE f32x4_t vdiv_f32(
    f32x4_t num, f32x4_t den)
{
    return vmulq(num, vrecip_hiprec_f32(den));
}
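
/*
 * Illustrative sketch (not part of the CMSIS-DSP API): how vdiv_f32() could
 * be used for the elementwise division of two float32 buffers.  The function
 * name, the buffer arguments and the assumption that blockSize is a multiple
 * of 4 are hypothetical; a production kernel would predicate the tail lanes
 * with vctp32q().
 */
__STATIC_INLINE void vdiv_buffer_sketch_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
          float32_t * pDst,
          uint32_t    blockSize)
{
    uint32_t blkCnt = blockSize >> 2U;          /* 4 float32 lanes per pass */

    while (blkCnt > 0U)
    {
        f32x4_t vecA = vld1q_f32(pSrcA);        /* load 4 numerators        */
        f32x4_t vecB = vld1q_f32(pSrcB);        /* load 4 denominators      */

        vst1q_f32(pDst, vdiv_f32(vecA, vecB));  /* approximate quotient     */

        pSrcA += 4; pSrcB += 4; pDst += 4;
        blkCnt--;
    }
}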

/**
  @brief         Single-precision Taylor series evaluation.
  @param[in]     x              f32 quad vector input
  @param[in]     coeffs         f32 quad vector coeffs
  @return        destination    f32 quad vector
 */

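/*
 * The polynomial is evaluated with an Estrin-style scheme:
 *     A = coeffs[0] + coeffs[4]*x        B = coeffs[2] + coeffs[6]*x
 *     C = coeffs[1] + coeffs[5]*x        D = coeffs[3] + coeffs[7]*x
 *     res = (A + B*x^2) + (C + D*x^2)*x^4
 * i.e. a degree-7 polynomial in x (with the coefficient ordering used by
 * exp_tab) computed as four short, independent FMA chains.
 */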
__STATIC_INLINE f32x4_t vtaylor_polyq_f32(
        f32x4_t           x,
        const float32_t * coeffs)
{
    f32x4_t         A = vfmasq(vdupq_n_f32(coeffs[4]), x, coeffs[0]);
    f32x4_t         B = vfmasq(vdupq_n_f32(coeffs[6]), x, coeffs[2]);
    f32x4_t         C = vfmasq(vdupq_n_f32(coeffs[5]), x, coeffs[1]);
    f32x4_t         D = vfmasq(vdupq_n_f32(coeffs[7]), x, coeffs[3]);
    f32x4_t         x2 = vmulq(x, x);
    f32x4_t         x4 = vmulq(x2, x2);
    f32x4_t         res = vfmaq(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4);

    return res;
}

/* split positive x into mantissa m in [1, 2) and unbiased exponent e, so that x = m * 2^e */
__STATIC_INLINE f32x4_t vmant_exp_f32(
    f32x4_t     x,
    int32x4_t * e)
{
    any32x4_t       r;
    int32x4_t       n;

    r.f = x;
    n = r.i >> 23;
    n = n - 127;
    r.i = r.i - (n << 23);

    *e = n;
    return r.f;
}


__STATIC_INLINE f32x4_t vlogq_f32(f32x4_t vecIn)
{
    q31x4_t         vecExpUnBiased;
    f32x4_t         vecTmpFlt0, vecTmpFlt1;
    f32x4_t         vecAcc0, vecAcc1, vecAcc2, vecAcc3;
    f32x4_t         vecExpUnBiasedFlt;

    /*
     * extract exponent
     */
    vecTmpFlt1 = vmant_exp_f32(vecIn, &vecExpUnBiased);

    vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
    /*
     * a = (__logf_lut_f32[4] * r.f) + (__logf_lut_f32[0]);
     */
    vecAcc0 = vdupq_n_f32(__logf_lut_f32[0]);
    vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f32[4]);
    /*
     * b = (__logf_lut_f32[6] * r.f) + (__logf_lut_f32[2]);
     */
    vecAcc1 = vdupq_n_f32(__logf_lut_f32[2]);
    vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f32[6]);
    /*
     * c = (__logf_lut_f32[5] * r.f) + (__logf_lut_f32[1]);
     */
    vecAcc2 = vdupq_n_f32(__logf_lut_f32[1]);
    vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f32[5]);
    /*
     * d = (__logf_lut_f32[7] * r.f) + (__logf_lut_f32[3]);
     */
    vecAcc3 = vdupq_n_f32(__logf_lut_f32[3]);
    vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f32[7]);
    /*
     * a = a + b * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
    /*
     * c = c + d * xx;
     */
    vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
    /*
     * xx = xx * xx;
     */
    vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
    vecExpUnBiasedFlt = vcvtq_f32_s32(vecExpUnBiased);
    /*
     * r.f = a + c * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
    /*
     * add exponent
     * r.f = r.f + ((float32_t) m) * __logf_rng_f32;
     */
    vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f32);
    /* set log(0) down to -inf */
    vecAcc0 = vdupq_m(vecAcc0, -INFINITY, vcmpeqq(vecIn, 0.0f));
    return vecAcc0;
}

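/*
 * vexpq_f32(): write x = m * ln(2) + r with m = (int)(x / ln(2)), so that
 * exp(x) = 2^m * exp(r) and r stays in (-ln 2, ln 2).  exp(r) is approximated
 * by the degree-7 polynomial in exp_tab, and the 2^m factor is applied by
 * adding (m << 23) to the IEEE-754 bit pattern with saturating arithmetic.
 * Lanes with m < -126 would underflow and are forced to 0.
 */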
__STATIC_INLINE f32x4_t vexpq_f32(
    f32x4_t x)
{
    /* perform range reduction to [-log(2), log(2)] */
    int32x4_t       m = vcvtq_s32_f32(vmulq_n_f32(x, 1.4426950408f));
    f32x4_t         val = vfmsq_f32(x, vcvtq_f32_s32(m), vdupq_n_f32(0.6931471805f));

    /* polynomial approximation of exp(val) */
    f32x4_t         poly = vtaylor_polyq_f32(val, exp_tab);

    /* reconstruct: exp(x) = 2^m * exp(val) */
    poly = (f32x4_t) (vqaddq_s32((q31x4_t) (poly), vqshlq_n_s32(m, 23)));

    /* flush underflowing lanes (m < -126) to zero */
    poly = vdupq_m(poly, 0.0f, vcmpltq_n_s32(m, -126));
    return poly;
}

/* integer power x^nb by repeated multiplication (nb >= 1) */
__STATIC_INLINE f32x4_t arm_vec_exponent_f32(f32x4_t x, int32_t nb)
{
    f32x4_t         r = x;
    nb--;
    while (nb > 0) {
        r = vmulq(r, x);
        nb--;
    }
    return (r);
}

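/*
 * vrecip_f32() seeds the reciprocal with a "magic constant": subtracting the
 * bit pattern of x from INV_NEWTON_INIT_F32 (0x7EF127EA) gives a rough
 * estimate v of 1/|x|.  With w = |x| * v, the correction polynomial below
 * uses the binomial coefficients of (1 - w)^8, so that
 *     v * p(w) = (1 - (1 - |x| * v)^8) / |x|
 * i.e. the relative error of the seed is raised to the 8th power, matching
 * the accuracy of three fused Newton-Raphson steps in a single polynomial.
 */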
__STATIC_INLINE f32x4_t vrecip_f32(f32x4_t vecIn)
{
    f32x4_t     vecSx, vecW, vecTmp;
    any32x4_t   v;

    vecSx = vabsq(vecIn);

    v.f = vecIn;
    v.i = vsubq(vdupq_n_s32(INV_NEWTON_INIT_F32), v.i);

    vecW = vmulq(vecSx, v.f);

    /* v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w * (56 + w * (-28 + w * (8 - w))))))); */
    vecTmp = vsubq(vdupq_n_f32(8.0f), vecW);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -70.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 8.0f);
    v.f = vmulq(v.f, vecTmp);

    v.f = vdupq_m(v.f, INFINITY, vcmpeqq(vecIn, 0.0f));
    /*
     * restore sign
     */
    v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));
    return v.f;
}

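/*
 * vtanhq_f32(): tanh(x) = (e^(2x) - 1) / (e^(2x) + 1).  The input is first
 * clamped to [-10, 10], where tanh() is already saturated to within float
 * precision, so vexpq_f32() cannot overflow; the division uses the fast
 * reciprocal vrecip_f32().
 */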
__STATIC_INLINE f32x4_t vtanhq_f32(
    f32x4_t val)
{
    f32x4_t         x =
        vminnmq_f32(vmaxnmq_f32(val, vdupq_n_f32(-10.f)), vdupq_n_f32(10.0f));
    f32x4_t         exp2x = vexpq_f32(vmulq_n_f32(x, 2.f));
    f32x4_t         num = vsubq_n_f32(exp2x, 1.f);
    f32x4_t         den = vaddq_n_f32(exp2x, 1.f);
    f32x4_t         tanh = vmulq_f32(num, vrecip_f32(den));
    return tanh;
}

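/*
 * vpowq_f32(): pow(val, n) = exp(n * log(val)).  This is only meaningful for
 * strictly positive val, since vlogq_f32() does not handle negative inputs.
 */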
__STATIC_INLINE f32x4_t vpowq_f32(
    f32x4_t val,
    f32x4_t n)
{
    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
}

#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */

#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */

#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "NEMath.h"
/**
 * @brief Vectorized integer exponentiation
 * @param[in]    x           value
 * @param[in]    nb          integer exponent >= 1
 * @return x^nb
 *
 */
__STATIC_INLINE float32x4_t arm_vec_exponent_f32(float32x4_t x, int32_t nb)
{
    float32x4_t r = x;
    nb--;
    while (nb > 0)
    {
        r = vmulq_f32(r, x);
        nb--;
    }
    return (r);
}


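/*
 * __arm_vec_sqrt_f32_neon(): start from the hardware reciprocal square-root
 * estimate (vrsqrteq_f32) and refine it with two Newton-Raphson steps via
 * vrsqrtsq_f32, which returns (3 - a*b) / 2; the result is then
 * sqrt(x) = x * (1/sqrt(x)).  Clamping the input to FLT_MIN keeps the
 * estimate finite for x == 0 (the final multiply by x still returns 0).
 */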
__STATIC_INLINE float32x4_t __arm_vec_sqrt_f32_neon(float32x4_t x)
{
    float32x4_t x1 = vmaxq_f32(x, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return vmulq_f32(x, e);
}

__STATIC_INLINE int16x8_t __arm_vec_sqrt_q15_neon(int16x8_t vec)
{
    float32x4_t tempF;
    int32x4_t tempHI, tempLO;

    /* low half: widen to 32-bit, convert from Q15 to float, take the square root */
    tempLO = vmovl_s16(vget_low_s16(vec));
    tempF = vcvtq_n_f32_s32(tempLO, 15);
    tempF = __arm_vec_sqrt_f32_neon(tempF);
    tempLO = vcvtq_n_s32_f32(tempF, 15);

    /* high half: same processing */
    tempHI = vmovl_s16(vget_high_s16(vec));
    tempF = vcvtq_n_f32_s32(tempHI, 15);
    tempF = __arm_vec_sqrt_f32_neon(tempF);
    tempHI = vcvtq_n_s32_f32(tempF, 15);

    /* narrow back to Q15 with saturation */
    return (vcombine_s16(vqmovn_s32(tempLO), vqmovn_s32(tempHI)));
}

__STATIC_INLINE int32x4_t __arm_vec_sqrt_q31_neon(int32x4_t vec)
{
  float32x4_t temp;

  /* convert from Q31 to float, take the square root, convert back */
  temp = vcvtq_n_f32_s32(vec, 31);
  temp = __arm_vec_sqrt_f32_neon(temp);
  return (vcvtq_n_s32_f32(temp, 31));
}

#endif /* (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) */

#ifdef   __cplusplus
}
#endif


#endif /* _ARM_VEC_MATH_H */

/**
 *
 * End of file.
 */