1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mat_trans_q15.c
4 * Description: Q15 matrix transpose
5 *
6 * $Date: 18. March 2019
7 * $Revision: V1.6.0
8 *
9 * Target Processor: Cortex-M cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "arm_math.h"
30
31 /**
32 @ingroup groupMatrix
33 */
34
35 /**
36 @addtogroup MatrixTrans
37 @{
38 */
39
40 /**
41 @brief Q15 matrix transpose.
42 @param[in] pSrc points to input matrix
43 @param[out] pDst points to output matrix
44 @return execution status
45 - \ref ARM_MATH_SUCCESS : Operation successful
46 - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
47 */
48
49 #if defined(ARM_MATH_MVEI)
50
arm_mat_trans_16bit_2x2(uint16_t * pDataSrc,uint16_t * pDataDest)51 __STATIC_INLINE arm_status arm_mat_trans_16bit_2x2(uint16_t * pDataSrc, uint16_t * pDataDest)
52 {
53 pDataDest[0] = pDataSrc[0];
54 pDataDest[3] = pDataSrc[3];
55 pDataDest[2] = pDataSrc[1];
56 pDataDest[1] = pDataSrc[2];
57
58 return (ARM_MATH_SUCCESS);
59 }
60
arm_mat_trans_16bit_3x3_mve(uint16_t * pDataSrc,uint16_t * pDataDest)61 static arm_status arm_mat_trans_16bit_3x3_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
62 {
63 static const uint16_t stridesTr33[8] = { 0, 3, 6, 1, 4, 7, 2, 5 };
64 uint16x8_t vecOffs1;
65 uint16x8_t vecIn1;
66 /*
67 *
68 * | 0 1 2 | | 0 3 6 | 8 x 16 flattened version | 0 3 6 1 4 7 2 5 |
69 * | 3 4 5 | => | 1 4 7 | => | 8 . . . . . . . |
70 * | 6 7 8 | | 2 5 8 | (row major)
71 *
72 */
73 vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr33);
74 vecIn1 = vldrhq_u16((uint16_t const *) pDataSrc);
75
76 vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
77
78 pDataDest[8] = pDataSrc[8];
79
80 return (ARM_MATH_SUCCESS);
81 }
82
83
arm_mat_trans_16bit_4x4_mve(uint16_t * pDataSrc,uint16_t * pDataDest)84 static arm_status arm_mat_trans_16bit_4x4_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
85 {
86 static const uint16_t stridesTr44_1[8] = { 0, 4, 8, 12, 1, 5, 9, 13 };
87 static const uint16_t stridesTr44_2[8] = { 2, 6, 10, 14, 3, 7, 11, 15 };
88 uint16x8_t vecOffs1, vecOffs2;
89 uint16x8_t vecIn1, vecIn2;
90 uint16_t const * pDataSrcVec = (uint16_t const *) pDataSrc;
91
92 /*
93 * 4x4 Matrix transposition
94 *
95 * | 0 1 2 3 | | 0 4 8 12 | 8 x 16 flattened version
96 * | 4 5 6 7 | => | 1 5 9 13 | => [0 4 8 12 1 5 9 13]
97 * | 8 9 10 11 | | 2 6 10 14 | [2 6 10 14 3 7 11 15]
98 * | 12 13 14 15 | | 3 7 11 15 |
99 */
100
101 vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr44_1);
102 vecOffs2 = vldrhq_u16((uint16_t const *) stridesTr44_2);
103 vecIn1 = vldrhq_u16(pDataSrcVec);
104 pDataSrcVec += 8;
105 vecIn2 = vldrhq_u16(pDataSrcVec);
106
107 vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
108 vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs2, vecIn2);
109
110
111 return (ARM_MATH_SUCCESS);
112 }
113
114
115
arm_mat_trans_16bit_generic(uint16_t srcRows,uint16_t srcCols,uint16_t * pDataSrc,uint16_t * pDataDest)116 static arm_status arm_mat_trans_16bit_generic(
117 uint16_t srcRows,
118 uint16_t srcCols,
119 uint16_t * pDataSrc,
120 uint16_t * pDataDest)
121 {
122 uint16x8_t vecOffs;
123 uint32_t i;
124 uint32_t blkCnt;
125 uint16_t const *pDataC;
126 uint16_t *pDataDestR;
127 uint16x8_t vecIn;
128
129 vecOffs = vidupq_u16((uint32_t)0, 1);
130 vecOffs = vecOffs * srcCols;
131
132 i = srcCols;
133 while(i > 0U)
134 {
135 pDataC = (uint16_t const *) pDataSrc;
136 pDataDestR = pDataDest;
137
138 blkCnt = srcRows >> 3;
139 while (blkCnt > 0U)
140 {
141 vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
142 vstrhq_u16(pDataDestR, vecIn);
143 pDataDestR += 8;
144 pDataC = pDataC + srcCols * 8;
145 /*
146 * Decrement the blockSize loop counter
147 */
148 blkCnt--;
149 }
150
151 /*
152 * tail
153 */
154 blkCnt = srcRows & 7;
155 if (blkCnt > 0U)
156 {
157 mve_pred16_t p0 = vctp16q(blkCnt);
158 vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
159 vstrhq_p_u16(pDataDestR, vecIn, p0);
160 }
161 pDataSrc += 1;
162 pDataDest += srcRows;
163 i--;
164 }
165
166 return (ARM_MATH_SUCCESS);
167 }
168
169
arm_mat_trans_q15(const arm_matrix_instance_q15 * pSrc,arm_matrix_instance_q15 * pDst)170 arm_status arm_mat_trans_q15(
171 const arm_matrix_instance_q15 * pSrc,
172 arm_matrix_instance_q15 * pDst)
173 {
174 arm_status status; /* status of matrix transpose */
175
176 #ifdef ARM_MATH_MATRIX_CHECK
177
178 /* Check for matrix mismatch condition */
179 if ((pSrc->numRows != pDst->numCols) ||
180 (pSrc->numCols != pDst->numRows) )
181 {
182 /* Set status as ARM_MATH_SIZE_MISMATCH */
183 status = ARM_MATH_SIZE_MISMATCH;
184 }
185 else
186
187 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
188
189 {
190 if (pDst->numRows == pDst->numCols)
191 {
192 if (pDst->numCols == 1)
193 {
194 pDst->pData[0] = pSrc->pData[0];
195 return(ARM_MATH_SUCCESS);
196 }
197 if (pDst->numCols == 2)
198 return arm_mat_trans_16bit_2x2((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
199 if (pDst->numCols == 3)
200 return arm_mat_trans_16bit_3x3_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
201 if (pDst->numCols == 4)
202 return arm_mat_trans_16bit_4x4_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
203 }
204
205 arm_mat_trans_16bit_generic(pSrc->numRows, pSrc->numCols, (uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
206 /* Set status as ARM_MATH_SUCCESS */
207 status = ARM_MATH_SUCCESS;
208 }
209
210 /* Return to application */
211 return (status);
212 }
213 #else
arm_mat_trans_q15(const arm_matrix_instance_q15 * pSrc,arm_matrix_instance_q15 * pDst)214 arm_status arm_mat_trans_q15(
215 const arm_matrix_instance_q15 * pSrc,
216 arm_matrix_instance_q15 * pDst)
217 {
218 q15_t *pIn = pSrc->pData; /* input data matrix pointer */
219 q15_t *pOut = pDst->pData; /* output data matrix pointer */
220 uint16_t nRows = pSrc->numRows; /* number of rows */
221 uint16_t nCols = pSrc->numCols; /* number of columns */
222 uint32_t col, row = nRows, i = 0U; /* Loop counters */
223 arm_status status; /* status of matrix transpose */
224
225 #if defined (ARM_MATH_LOOPUNROLL)
226 q31_t in; /* variable to hold temporary output */
227 #endif
228
229 #ifdef ARM_MATH_MATRIX_CHECK
230
231 /* Check for matrix mismatch condition */
232 if ((pSrc->numRows != pDst->numCols) ||
233 (pSrc->numCols != pDst->numRows) )
234 {
235 /* Set status as ARM_MATH_SIZE_MISMATCH */
236 status = ARM_MATH_SIZE_MISMATCH;
237 }
238 else
239
240 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
241
242 {
243 /* Matrix transpose by exchanging the rows with columns */
244 /* row loop */
245 do
246 {
247 /* Pointer pOut is set to starting address of column being processed */
248 pOut = pDst->pData + i;
249
250 #if defined (ARM_MATH_LOOPUNROLL)
251
252 /* Loop unrolling: Compute 4 outputs at a time */
253 col = nCols >> 2U;
254
255 while (col > 0U) /* column loop */
256 {
257 /* Read two elements from row */
258 in = read_q15x2_ia ((q15_t **) &pIn);
259
260 /* Unpack and store one element in destination */
261 #ifndef ARM_MATH_BIG_ENDIAN
262 *pOut = (q15_t) in;
263 #else
264 *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
265 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
266
267 /* Update pointer pOut to point to next row of transposed matrix */
268 pOut += nRows;
269
270 /* Unpack and store second element in destination */
271 #ifndef ARM_MATH_BIG_ENDIAN
272 *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
273 #else
274 *pOut = (q15_t) in;
275 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
276
277 /* Update pointer pOut to point to next row of transposed matrix */
278 pOut += nRows;
279
280 /* Read two elements from row */
281 in = read_q15x2_ia ((q15_t **) &pIn);
282
283 /* Unpack and store one element in destination */
284 #ifndef ARM_MATH_BIG_ENDIAN
285 *pOut = (q15_t) in;
286 #else
287 *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
288
289 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
290
291 /* Update pointer pOut to point to next row of transposed matrix */
292 pOut += nRows;
293
294 /* Unpack and store second element in destination */
295 #ifndef ARM_MATH_BIG_ENDIAN
296 *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
297 #else
298 *pOut = (q15_t) in;
299 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
300
301 /* Update pointer pOut to point to next row of transposed matrix */
302 pOut += nRows;
303
304 /* Decrement column loop counter */
305 col--;
306 }
307
308 /* Loop unrolling: Compute remaining outputs */
309 col = nCols % 0x4U;
310
311 #else
312
313 /* Initialize col with number of samples */
314 col = nCols;
315
316 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
317
318 while (col > 0U)
319 {
320 /* Read and store input element in destination */
321 *pOut = *pIn++;
322
323 /* Update pointer pOut to point to next row of transposed matrix */
324 pOut += nRows;
325
326 /* Decrement column loop counter */
327 col--;
328 }
329
330 i++;
331
332 /* Decrement row loop counter */
333 row--;
334
335 } while (row > 0U); /* row loop end */
336
337 /* Set status as ARM_MATH_SUCCESS */
338 status = ARM_MATH_SUCCESS;
339 }
340
341 /* Return to application */
342 return (status);
343 }
344 #endif /* defined(ARM_MATH_MVEI) */
345
346 /**
347 @} end of MatrixTrans group
348 */
349