/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_helium_utils.h
 * Description:  Utility functions for Helium development
 *
 * $Date:        09. September 2019
 * $Revision:    V.1.5.1
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _ARM_UTILS_HELIUM_H_
#define _ARM_UTILS_HELIUM_H_

/***************************************

Definitions available for MVEF and MVEI

***************************************/
#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI)

#define INACTIVELANE            0 /* inactive lane content */


#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) */

/***************************************

Definitions available for MVEF only

***************************************/
#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF)

__STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)
{
    float32_t acc;

    acc = vgetq_lane(in, 0) + vgetq_lane(in, 1) +
          vgetq_lane(in, 2) + vgetq_lane(in, 3);

    return acc;
}
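
/*
 * Usage sketch (illustrative only, not part of the CMSIS-DSP API): summing
 * the four lanes of a vector loaded from a hypothetical pointer pSrc
 * referencing at least 4 float32_t values.
 *
 *     float32x4_t vecIn = vld1q(pSrc);
 *     float32_t   sum   = vecAddAcrossF32Mve(vecIn);
 */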

/* Newton-Raphson initial guess (fast inverse square root magic constant) */
#define INVSQRT_MAGIC_F32           0x5f3759df

#define INVSQRT_NEWTON_MVE_F32(invSqrt, xHalf, xStart)\
{                                                     \
    float32x4_t tmp;                                  \
                                                      \
    /* tmp = xhalf * x * x */                         \
    tmp = vmulq(xStart, xStart);                      \
    tmp = vmulq(tmp, xHalf);                          \
    /* (1.5f - xhalf * x * x) */                      \
    tmp = vsubq(vdupq_n_f32(1.5f), tmp);              \
    /* x = x*(1.5f-xhalf*x*x); */                     \
    invSqrt = vmulq(tmp, xStart);                     \
}
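
/*
 * Usage sketch (illustrative only, not part of the CMSIS-DSP API): a fast
 * vector 1/sqrt(x) approximation that seeds the estimate with the magic
 * constant and refines it with INVSQRT_NEWTON_MVE_F32. vecIn is a
 * hypothetical float32x4_t of positive inputs; repeating the Newton-Raphson
 * step further improves accuracy.
 *
 *     int32x4_t   vecCast  = vreinterpretq_s32_f32(vecIn);
 *     float32x4_t vecHalf  = vmulq(vecIn, 0.5f);
 *     float32x4_t vecGuess =
 *         vreinterpretq_f32_s32(vsubq(vdupq_n_s32(INVSQRT_MAGIC_F32),
 *                                     vshrq(vecCast, 1)));
 *     INVSQRT_NEWTON_MVE_F32(vecGuess, vecHalf, vecGuess);
 */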
#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEF) */

/***************************************

Definitions available for MVEI only

***************************************/
#if defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)


#include "arm_common_tables.h"

/* The following functions are used to transpose matrices in the f32 and q31 cases */
__STATIC_INLINE arm_status arm_mat_trans_32bit_2x2_mve(
    uint32_t * pDataSrc,
    uint32_t * pDataDest)
{
    static const uint32x4_t vecOffs = { 0, 2, 1, 3 };
    /*
     *
     *  | 0   1 |   =>   | 0   2 |
     *  | 2   3 |        | 1   3 |
     *
     */
    uint32x4_t vecIn = vldrwq_u32((uint32_t const *)pDataSrc);
    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs, vecIn);

    return (ARM_MATH_SUCCESS);
}
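
/*
 * Usage sketch (illustrative only, not part of the CMSIS-DSP API): the f32
 * and q31 transpose kernels both reinterpret their data as uint32_t words,
 * so a hypothetical 2x2 float32_t matrix matA can be transposed into matAT
 * as follows.
 *
 *     (void) arm_mat_trans_32bit_2x2_mve((uint32_t *) matA, (uint32_t *) matAT);
 */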

__STATIC_INLINE arm_status arm_mat_trans_32bit_3x3_mve(
    uint32_t * pDataSrc,
    uint32_t * pDataDest)
{
    const uint32x4_t vecOffs1 = { 0, 3, 6, 1 };
    const uint32x4_t vecOffs2 = { 4, 7, 2, 5 };
    /*
     *
     *  | 0   1   2 |      | 0   3   6 |  4 x 32-bit flattened version  | 0  3  6  1 |
     *  | 3   4   5 |  =>  | 1   4   7 |  =>                            | 4  7  2  5 |
     *  | 6   7   8 |      | 2   5   8 |  (row major)                   | 8  .  .  . |
     *
     */
    uint32x4_t vecIn1 = vldrwq_u32((uint32_t const *) pDataSrc);
    uint32x4_t vecIn2 = vldrwq_u32((uint32_t const *) &pDataSrc[4]);

    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs1, vecIn1);
    vstrwq_scatter_shifted_offset_u32(pDataDest, vecOffs2, vecIn2);

    pDataDest[8] = pDataSrc[8];

    return (ARM_MATH_SUCCESS);
}

__STATIC_INLINE arm_status arm_mat_trans_32bit_4x4_mve(uint32_t * pDataSrc, uint32_t * pDataDest)
{
    /*
     * 4x4 matrix transposition, performed as a 4-way de-interleave operation
     *
     *   0   1   2   3        0   4   8  12
     *   4   5   6   7   =>   1   5   9  13
     *   8   9  10  11        2   6  10  14
     *  12  13  14  15        3   7  11  15
     */

    uint32x4x4_t vecIn;

    vecIn = vld4q((uint32_t const *) pDataSrc);
    vstrwq(pDataDest, vecIn.val[0]);
    pDataDest += 4;
    vstrwq(pDataDest, vecIn.val[1]);
    pDataDest += 4;
    vstrwq(pDataDest, vecIn.val[2]);
    pDataDest += 4;
    vstrwq(pDataDest, vecIn.val[3]);

    return (ARM_MATH_SUCCESS);
}
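
/*
 * Usage sketch (illustrative only, not part of the CMSIS-DSP API): a full
 * 4x4 32-bit transpose in a single call; pSrc and pDst are hypothetical,
 * non-overlapping arrays of 16 uint32_t words.
 *
 *     (void) arm_mat_trans_32bit_4x4_mve(pSrc, pDst);
 */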


__STATIC_INLINE arm_status arm_mat_trans_32bit_generic_mve(
    uint16_t    srcRows,
    uint16_t    srcCols,
    uint32_t  * pDataSrc,
    uint32_t  * pDataDest)
{
    uint32x4_t      vecOffs;
    uint32_t        i;
    uint32_t        blkCnt;
    uint32_t const *pDataC;
    uint32_t       *pDataDestR;
    uint32x4_t      vecIn;

    vecOffs = vidupq_u32((uint32_t)0, 1);
    vecOffs = vecOffs * srcCols;

    i = srcCols;
    do
    {
        pDataC = (uint32_t const *) pDataSrc;
        pDataDestR = pDataDest;

        blkCnt = srcRows >> 2;
        while (blkCnt > 0U)
        {
            vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs);
            vstrwq(pDataDestR, vecIn);
            pDataDestR += 4;
            pDataC = pDataC + srcCols * 4;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /*
         * tail
         */
        blkCnt = srcRows & 3;
        if (blkCnt > 0U)
        {
            mve_pred16_t p0 = vctp32q(blkCnt);
            vecIn = vldrwq_gather_shifted_offset_u32(pDataC, vecOffs);
            vstrwq_p(pDataDestR, vecIn, p0);
        }

        pDataSrc += 1;
        pDataDest += srcRows;
    }
    while (--i);

    return (ARM_MATH_SUCCESS);
}
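
/*
 * Usage sketch (illustrative only, not part of the CMSIS-DSP API): transposing
 * an arbitrary numRows x numCols 32-bit matrix; numRows, numCols, pSrc and
 * pDst are hypothetical, and pDst is assumed to hold numRows * numCols words
 * not overlapping pSrc.
 *
 *     (void) arm_mat_trans_32bit_generic_mve(numRows, numCols, pSrc, pDst);
 */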

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
__STATIC_INLINE q31x4_t FAST_VSQRT_Q31(q31x4_t vecIn)
{
    q63x2_t vecTmpLL;
    q31x4_t vecTmp0, vecTmp1;
    q31_t   scale;
    q63_t   tmp64;
    q31x4_t vecNrm, vecDst, vecIdx, vecSignBits;


    vecSignBits = vclsq(vecIn);
    vecSignBits = vbicq(vecSignBits, 1);
    /*
     * in = in << no_of_sign_bits;
     */
    vecNrm = vshlq(vecIn, vecSignBits);
    /*
     * index = in >> 24;
     */
    vecIdx = vecNrm >> 24;
    vecIdx = vecIdx << 1;

    vecTmp0 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, vecIdx);

    vecIdx = vecIdx + 1;

    vecTmp1 = vldrwq_gather_shifted_offset_s32(sqrtTable_Q31, vecIdx);

    vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
    vecTmp0 = vecTmp0 - vecTmp1;
    vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0);
    vecTmp1 = vqrdmulhq(vecNrm, vecTmp1);
    vecTmp1 = vdupq_n_s32(0x18000000) - vecTmp1;
    vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1);
    vecTmpLL = vmullbq_int(vecNrm, vecTmp0);

    /*
     * scale elements 0, 2
     */
    scale = 26 + (vecSignBits[0] >> 1);
    tmp64 = asrl(vecTmpLL[0], scale);
    vecDst[0] = (q31_t) tmp64;

    scale = 26 + (vecSignBits[2] >> 1);
    tmp64 = asrl(vecTmpLL[1], scale);
    vecDst[2] = (q31_t) tmp64;

    vecTmpLL = vmulltq_int(vecNrm, vecTmp0);

    /*
     * scale elements 1, 3
     */
    scale = 26 + (vecSignBits[1] >> 1);
    tmp64 = asrl(vecTmpLL[0], scale);
    vecDst[1] = (q31_t) tmp64;

    scale = 26 + (vecSignBits[3] >> 1);
    tmp64 = asrl(vecTmpLL[1], scale);
    vecDst[3] = (q31_t) tmp64;
    /*
     * set negative values to 0
     */
    vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s32(vecIn, 0));

    return vecDst;
}
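
/*
 * Usage sketch (illustrative only, not part of the CMSIS-DSP API): vector
 * square root of four Q31 values; pIn and pOut are hypothetical pointers to
 * q31_t data. Lanes holding negative inputs are forced to 0.
 *
 *     q31x4_t vecOut = FAST_VSQRT_Q31(vld1q(pIn));
 *     vst1q(pOut, vecOut);
 */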
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
__STATIC_INLINE q15x8_t FAST_VSQRT_Q15(q15x8_t vecIn)
{
    q31x4_t vecTmpLev, vecTmpLodd, vecSignL;
    q15x8_t vecTmp0, vecTmp1;
    q15x8_t vecNrm, vecDst, vecIdx, vecSignBits;

    vecDst = vuninitializedq_s16();

    vecSignBits = vclsq(vecIn);
    vecSignBits = vbicq(vecSignBits, 1);
    /*
     * in = in << no_of_sign_bits;
     */
    vecNrm = vshlq(vecIn, vecSignBits);

    vecIdx = vecNrm >> 8;
    vecIdx = vecIdx << 1;

    vecTmp0 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, vecIdx);

    vecIdx = vecIdx + 1;

    vecTmp1 = vldrhq_gather_shifted_offset_s16(sqrtTable_Q15, vecIdx);

    vecTmp1 = vqrdmulhq(vecTmp1, vecNrm);
    vecTmp0 = vecTmp0 - vecTmp1;
    vecTmp1 = vqrdmulhq(vecTmp0, vecTmp0);
    vecTmp1 = vqrdmulhq(vecNrm, vecTmp1);
    vecTmp1 = vdupq_n_s16(0x1800) - vecTmp1;
    vecTmp0 = vqrdmulhq(vecTmp0, vecTmp1);

    vecSignBits = vecSignBits >> 1;

    vecTmpLev  = vmullbq_int(vecNrm, vecTmp0);
    vecTmpLodd = vmulltq_int(vecNrm, vecTmp0);

    vecTmp0 = vecSignBits + 10;
    /*
     * negate sign to apply register based vshl
     */
    vecTmp0 = -vecTmp0;

    /*
     * shift even elements
     */
    vecSignL = vmovlbq(vecTmp0);
    vecTmpLev = vshlq(vecTmpLev, vecSignL);
    /*
     * shift odd elements
     */
    vecSignL = vmovltq(vecTmp0);
    vecTmpLodd = vshlq(vecTmpLodd, vecSignL);
    /*
     * merge and narrow odd and even parts
     */
    vecDst = vmovnbq_s32(vecDst, vecTmpLev);
    vecDst = vmovntq_s32(vecDst, vecTmpLodd);
    /*
     * set negative values to 0
     */
    vecDst = vdupq_m(vecDst, 0, vcmpltq_n_s16(vecIn, 0));

    return vecDst;
}
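
/*
 * Usage sketch (illustrative only, not part of the CMSIS-DSP API): the same
 * pattern for eight Q15 values; pIn and pOut are hypothetical pointers to
 * q15_t data.
 *
 *     q15x8_t vecOut = FAST_VSQRT_Q15(vld1q(pIn));
 *     vst1q(pOut, vecOut);
 */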
#endif

#endif /* defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI) */

#endif /* _ARM_UTILS_HELIUM_H_ */