/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_helium_utils.h
 * Description:  Utility functions for Helium development
 *
 * $Date:        09. September 2019
 * $Revision:    V.1.5.1
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _ARM_UTILS_HELIUM_H_
#define _ARM_UTILS_HELIUM_H_

/***************************************

Definitions available for MVEF and MVEI

***************************************/
#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)

#define INACTIVELANE            0 /* inactive lane content */


#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) */

/***************************************

Definitions available for MVEF only

***************************************/
#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF)

/* Horizontal add across the four float32 lanes of a vector. */
__STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)
{
    float32_t acc;

    acc = vgetq_lane(in, 0) + vgetq_lane(in, 1) +
          vgetq_lane(in, 2) + vgetq_lane(in, 3);

    return acc;
}
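
/*
 * Usage sketch (illustrative, not part of the original header): reducing a
 * vector accumulator to a scalar, e.g. at the end of a vectorized dot
 * product. The helper name dot4 and the multiple-of-4 length assumption are
 * hypothetical.
 *
 *   static float32_t dot4(const float32_t *pA, const float32_t *pB, uint32_t n)
 *   {
 *       float32x4_t vAcc = vdupq_n_f32(0.0f);
 *       for (uint32_t i = 0; i < n; i += 4)
 *       {
 *           vAcc = vfmaq(vAcc, vld1q(&pA[i]), vld1q(&pB[i]));  // acc += a * b per lane
 *       }
 *       return vecAddAcrossF32Mve(vAcc);   // fold the 4 lanes into one scalar
 *   }
 */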

/* newton initial guess */
#define INVSQRT_MAGIC_F32           0x5f3759df

#define INVSQRT_NEWTON_MVE_F32(invSqrt, xHalf, xStart)\
{                                                     \
    float32x4_t tmp;                                  \
                                                      \
    /* tmp = xhalf * x * x */                         \
    tmp = vmulq(xStart, xStart);                      \
    tmp = vmulq(tmp, xHalf);                          \
    /* (1.5f - xhalf * x * x) */                      \
    tmp = vsubq(vdupq_n_f32(1.5f), tmp);              \
    /* x = x*(1.5f-xhalf*x*x); */                     \
    invSqrt = vmulq(tmp, xStart);                     \
}
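
/*
 * Usage sketch (illustrative): combining the magic constant with one
 * Newton-Raphson step to approximate 1/sqrt() per lane. The local names
 * below are hypothetical, and inputs are assumed to be positive.
 *
 *   float32x4_t vecIn    = vdupq_n_f32(2.0f);                    // example input x
 *   float32x4_t vecHalf  = vmulq(vecIn, 0.5f);                   // xhalf = 0.5f * x
 *   int32x4_t   vecBits  = vsubq(vdupq_n_s32(INVSQRT_MAGIC_F32),
 *                                vshrq(vreinterpretq_s32_f32(vecIn), 1));
 *   float32x4_t vecStart = vreinterpretq_f32_s32(vecBits);       // bit-level initial guess
 *   float32x4_t vecInvSqrt;
 *   INVSQRT_NEWTON_MVE_F32(vecInvSqrt, vecHalf, vecStart);       // one refinement
 *   INVSQRT_NEWTON_MVE_F32(vecInvSqrt, vecHalf, vecInvSqrt);     // optional second pass
 */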
#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) */

/***************************************

Definitions available for MVEI only

***************************************/
#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)


#include "arm_common_tables.h"

/* The following functions are used to transpose matrices in the f32 and q31 cases */
__STATIC_INLINE arm_status arm_mat_trans_32bit_2x2_mve(
    uint32_t * pDataSrc,
    uint32_t * pDataDest)
{
    static const uint32x4_t vecOffs = { 0, 2, 1, 3 };
    /*
     *
     * | 0   1 |   =>  |  0   2 |
     * | 2   3 |       |  1   3 |
     *
     */
    uint32x4_t vecIn = vldrwq_u32((uint32_t const *)pDataSrc);
    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs, vecIn);

    return (ARM_MATH_SUCCESS);
}

__STATIC_INLINE arm_status arm_mat_trans_32bit_3x3_mve(
    uint32_t * pDataSrc,
    uint32_t * pDataDest)
{
    const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
    const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
    /*
     *
     *  | 0   1   2 |       | 0   3   6 |  4 x 32 flattened version | 0   3   6   1 |
     *  | 3   4   5 |   =>  | 1   4   7 |            =>             | 4   7   2   5 |
     *  | 6   7   8 |       | 2   5   8 |       (row major)         | 8   .   .   . |
     *
     */
    uint32x4_t vecIn1 = vldrwq_u32((uint32_t const *) pDataSrc);
    uint32x4_t vecIn2 = vldrwq_u32((uint32_t const *) &pDataSrc[4]);

    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs1, vecIn1);
    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs2, vecIn2);

    pDataDest[8] = pDataSrc[8];

    return (ARM_MATH_SUCCESS);
}

__STATIC_INLINE arm_status arm_mat_trans_32bit_4x4_mve(uint32_t * pDataSrc, uint32_t * pDataDest)
{
    /*
     * 4x4 Matrix transposition
     * is 4 x de-interleave operation
     *
     * 0   1   2   3       0   4   8   12
     * 4   5   6   7       1   5   9   13
     * 8   9   10  11      2   6   10  14
     * 12  13  14  15      3   7   11  15
     */

    uint32x4x4_t vecIn;

    vecIn = vld4q((uint32_t const *) pDataSrc);
    vstrwq(pDataDest, vecIn.val[0]);
    pDataDest += 4;
    vstrwq(pDataDest, vecIn.val[1]);
    pDataDest += 4;
    vstrwq(pDataDest, vecIn.val[2]);
    pDataDest += 4;
    vstrwq(pDataDest, vecIn.val[3]);

    return (ARM_MATH_SUCCESS);
}


__STATIC_INLINE arm_status arm_mat_trans_32bit_generic_mve(
    uint16_t    srcRows,
    uint16_t    srcCols,
    uint32_t  * pDataSrc,
    uint32_t  * pDataDest)
{
    uint32x4_t vecOffs;
    uint32_t  i;
    uint32_t  blkCnt;
    uint32_t const *pDataC;
    uint32_t *pDataDestR;
    uint32x4_t vecIn;

    vecOffs = vidupq_u32((uint32_t)0, 1);
    vecOffs = vecOffs * srcCols;

    i = srcCols;
    do
    {
        pDataC = (uint32_t const *) pDataSrc;
        pDataDestR = pDataDest;

        blkCnt = srcRows >> 2;
        while (blkCnt > 0U)
        {
            vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs);
            vstrwq(pDataDestR, vecIn);
            pDataDestR += 4;
            pDataC = pDataC + srcCols * 4;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /*
         * tail
         */
        blkCnt = srcRows & 3;
        if (blkCnt > 0U)
        {
            mve_pred16_t p0 = vctp32q(blkCnt);
            vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs);
            vstrwq_p(pDataDestR, vecIn, p0);
        }

        pDataSrc += 1;
        pDataDest += srcRows;
    }
    while (--i);

    return (ARM_MATH_SUCCESS);
}
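
/*
 * Usage sketch (illustrative): transposing a row-major 3x4 float32 matrix
 * with the generic helper. pSrc/pDst are hypothetical caller buffers; f32
 * (or q31) data is passed through uint32_t pointers because the helpers
 * simply move raw 32-bit words.
 *
 *   float32_t pSrc[3 * 4];   // filled by the caller with the 3x4 source (row major)
 *   float32_t pDst[4 * 3];   // receives the 4x3 transposed result
 *   arm_status status = arm_mat_trans_32bit_generic_mve(3, 4,
 *                           (uint32_t *) pSrc, (uint32_t *) pDst);
 *   // for the fixed 2x2 / 3x3 / 4x4 shapes the dedicated helpers above
 *   // can be used instead, e.g. arm_mat_trans_32bit_4x4_mve(pS, pD);
 */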


#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
__STATIC_INLINE q31x4_t FAST_VSQRT_Q31(q31x4_t vecIn)
{
    q63x2_t         vecTmpLL;
    q31x4_t         vecTmp0, vecTmp1;
    q31_t           scale;
    q63_t           tmp64;
    q31x4_t         vecNrm, vecDst, vecIdx, vecSignBits;


    vecSignBits = vclsq(vecIn);
    vecSignBits = vbicq(vecSignBits, 1);
    /*
     * in = in << no_of_sign_bits;
     */
    vecNrm = vshlq(vecIn, vecSignBits);
    /*
     * index = in >> 24;
     */
    vecIdx = vecNrm >> 24;
    vecIdx = vecIdx << 1;

    vecTmp0 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, vecIdx);

    vecIdx = vecIdx + 1;

    vecTmp1 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, vecIdx);

    vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
    vecTmp0 = vecTmp0 - vecTmp1;
    vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0);
    vecTmp1 = vqrdmulhq(vecNrm, vecTmp1);
    vecTmp1 = vdupq_n_s32(0x18000000) - vecTmp1;
    vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1);
    vecTmpLL = vmullbq_int(vecNrm, vecTmp0);

    /*
     * scale elements 0, 2
     */
    scale = 26 + (vecSignBits[0] >> 1);
    tmp64 = asrl(vecTmpLL[0], scale);
    vecDst[0] = (q31_t) tmp64;

    scale = 26 + (vecSignBits[2] >> 1);
    tmp64 = asrl(vecTmpLL[1], scale);
    vecDst[2] = (q31_t) tmp64;

    vecTmpLL = vmulltq_int(vecNrm, vecTmp0);

    /*
     * scale elements 1, 3
     */
    scale = 26 + (vecSignBits[1] >> 1);
    tmp64 = asrl(vecTmpLL[0], scale);
    vecDst[1] = (q31_t) tmp64;

    scale = 26 + (vecSignBits[3] >> 1);
    tmp64 = asrl(vecTmpLL[1], scale);
    vecDst[3] = (q31_t) tmp64;
    /*
     * set negative values to 0
     */
    vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s32(vecIn, 0));

    return vecDst;
}
#endif
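
/*
 * Usage sketch (illustrative): square root of four Q31 values at once.
 * 0x20000000 is 0.25 in Q31, so each result lane is expected to be close
 * to 0x40000000 (0.5); negative input lanes are forced to 0 by the final
 * predicated vdupq_m.
 *
 *   q31x4_t vecVal = vdupq_n_s32(0x20000000);   // four lanes of 0.25 (Q31)
 *   q31x4_t vecRes = FAST_VSQRT_Q31(vecVal);    // each lane ~ 0x40000000
 */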

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
__STATIC_INLINE q15x8_t FAST_VSQRT_Q15(q15x8_t vecIn)
{
    q31x4_t         vecTmpLev, vecTmpLodd, vecSignL;
    q15x8_t         vecTmp0, vecTmp1;
    q15x8_t         vecNrm, vecDst, vecIdx, vecSignBits;

    vecDst = vuninitializedq_s16();

    vecSignBits = vclsq(vecIn);
    vecSignBits = vbicq(vecSignBits, 1);
    /*
     * in = in << no_of_sign_bits;
     */
    vecNrm = vshlq(vecIn, vecSignBits);

    vecIdx = vecNrm >> 8;
    vecIdx = vecIdx << 1;

    vecTmp0 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, vecIdx);

    vecIdx = vecIdx + 1;

    vecTmp1 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, vecIdx);

    vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
    vecTmp0 = vecTmp0 - vecTmp1;
    vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0);
    vecTmp1 = vqrdmulhq(vecNrm, vecTmp1);
    vecTmp1 = vdupq_n_s16(0x1800) - vecTmp1;
    vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1);

    vecSignBits = vecSignBits >> 1;

    vecTmpLev = vmullbq_int(vecNrm, vecTmp0);
    vecTmpLodd = vmulltq_int(vecNrm, vecTmp0);

    vecTmp0 = vecSignBits + 10;
    /*
     * negate sign to apply register based vshl
     */
    vecTmp0 = -vecTmp0;

    /*
     * shift even elements
     */
    vecSignL = vmovlbq(vecTmp0);
    vecTmpLev = vshlq(vecTmpLev, vecSignL);
    /*
     * shift odd elements
     */
    vecSignL = vmovltq(vecTmp0);
    vecTmpLodd = vshlq(vecTmpLodd, vecSignL);
    /*
     * merge and narrow odd and even parts
     */
    vecDst = vmovnbq_s32(vecDst, vecTmpLev);
    vecDst = vmovntq_s32(vecDst, vecTmpLodd);
    /*
     * set negative values to 0
     */
    vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s16(vecIn, 0));

    return vecDst;
}
#endif

#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI) */

#endif