• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mat_trans_q15.c
4  * Description:  Q15 matrix transpose
5  *
6  * $Date:        18. March 2019
7  * $Revision:    V1.6.0
8  *
9  * Target Processor: Cortex-M cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "arm_math.h"
30 
31 /**
32   @ingroup groupMatrix
33  */
34 
35 /**
36   @addtogroup MatrixTrans
37   @{
38  */
39 
40 /**
41   @brief         Q15 matrix transpose.
42   @param[in]     pSrc      points to input matrix
43   @param[out]    pDst      points to output matrix
44   @return        execution status
45                    - \ref ARM_MATH_SUCCESS       : Operation successful
46                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
47  */
48 
49 #if defined(ARM_MATH_MVEI)
50 
arm_mat_trans_16bit_2x2(uint16_t * pDataSrc,uint16_t * pDataDest)51 __STATIC_INLINE arm_status arm_mat_trans_16bit_2x2(uint16_t * pDataSrc, uint16_t * pDataDest)
52 {
53     pDataDest[0] = pDataSrc[0];
54     pDataDest[3] = pDataSrc[3];
55     pDataDest[2] = pDataSrc[1];
56     pDataDest[1] = pDataSrc[2];
57 
58     return (ARM_MATH_SUCCESS);
59 }
60 
arm_mat_trans_16bit_3x3_mve(uint16_t * pDataSrc,uint16_t * pDataDest)61 static arm_status arm_mat_trans_16bit_3x3_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
62 {
63     static const uint16_t stridesTr33[8] = { 0, 3, 6, 1, 4, 7, 2, 5 };
64     uint16x8_t    vecOffs1;
65     uint16x8_t    vecIn1;
66     /*
67      *
68      *  | 0   1   2 |       | 0   3   6 |  8 x 16 flattened version | 0   3   6   1   4   7   2   5 |
69      *  | 3   4   5 | =>    | 1   4   7 |            =>             | 8   .   .   .   .   .   .   . |
70      *  | 6   7   8 |       | 2   5   8 |       (row major)
71      *
72      */
73     vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr33);
74     vecIn1 = vldrhq_u16((uint16_t const *) pDataSrc);
75 
76     vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
77 
78     pDataDest[8] = pDataSrc[8];
79 
80     return (ARM_MATH_SUCCESS);
81 }
82 
83 
arm_mat_trans_16bit_4x4_mve(uint16_t * pDataSrc,uint16_t * pDataDest)84 static arm_status arm_mat_trans_16bit_4x4_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
85 {
86     static const uint16_t stridesTr44_1[8] = { 0, 4, 8, 12, 1, 5, 9, 13 };
87     static const uint16_t stridesTr44_2[8] = { 2, 6, 10, 14, 3, 7, 11, 15 };
88     uint16x8_t    vecOffs1, vecOffs2;
89     uint16x8_t    vecIn1, vecIn2;
90     uint16_t const * pDataSrcVec = (uint16_t const *) pDataSrc;
91 
92     /*
93      * 4x4 Matrix transposition
94      *
95      * | 0   1   2   3  |       | 0   4   8   12 |   8 x 16 flattened version
96      * | 4   5   6   7  |  =>   | 1   5   9   13 |   =>      [0   4   8   12  1   5   9   13]
97      * | 8   9   10  11 |       | 2   6   10  14 |           [2   6   10  14  3   7   11  15]
98      * | 12  13  14  15 |       | 3   7   11  15 |
99      */
100 
101     vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr44_1);
102     vecOffs2 = vldrhq_u16((uint16_t const *) stridesTr44_2);
103     vecIn1 = vldrhq_u16(pDataSrcVec);
104     pDataSrcVec += 8;
105     vecIn2 = vldrhq_u16(pDataSrcVec);
106 
107     vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
108     vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs2, vecIn2);
109 
110 
111     return (ARM_MATH_SUCCESS);
112 }
113 
114 
115 
arm_mat_trans_16bit_generic(uint16_t srcRows,uint16_t srcCols,uint16_t * pDataSrc,uint16_t * pDataDest)116 static arm_status arm_mat_trans_16bit_generic(
117     uint16_t    srcRows,
118     uint16_t    srcCols,
119     uint16_t  * pDataSrc,
120     uint16_t  * pDataDest)
121 {
122     uint16x8_t    vecOffs;
123     uint32_t        i;
124     uint32_t        blkCnt;
125     uint16_t const *pDataC;
126     uint16_t       *pDataDestR;
127     uint16x8_t    vecIn;
128 
129     vecOffs = vidupq_u16((uint32_t)0, 1);
130     vecOffs = vecOffs * srcCols;
131 
132     i = srcCols;
133     while(i > 0U)
134     {
135         pDataC = (uint16_t const *) pDataSrc;
136         pDataDestR = pDataDest;
137 
138         blkCnt = srcRows >> 3;
139         while (blkCnt > 0U)
140         {
141             vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
142             vstrhq_u16(pDataDestR, vecIn);
143             pDataDestR += 8;
144             pDataC = pDataC + srcCols * 8;
145             /*
146              * Decrement the blockSize loop counter
147              */
148             blkCnt--;
149         }
150 
151         /*
152          * tail
153          */
154         blkCnt = srcRows & 7;
155         if (blkCnt > 0U)
156         {
157             mve_pred16_t p0 = vctp16q(blkCnt);
158             vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
159             vstrhq_p_u16(pDataDestR, vecIn, p0);
160         }
161         pDataSrc += 1;
162         pDataDest += srcRows;
163         i--;
164     }
165 
166     return (ARM_MATH_SUCCESS);
167 }
168 
169 
arm_mat_trans_q15(const arm_matrix_instance_q15 * pSrc,arm_matrix_instance_q15 * pDst)170 arm_status arm_mat_trans_q15(
171   const arm_matrix_instance_q15 * pSrc,
172         arm_matrix_instance_q15 * pDst)
173 {
174   arm_status status;                             /* status of matrix transpose */
175 
176 #ifdef ARM_MATH_MATRIX_CHECK
177 
178   /* Check for matrix mismatch condition */
179   if ((pSrc->numRows != pDst->numCols) ||
180       (pSrc->numCols != pDst->numRows)   )
181   {
182     /* Set status as ARM_MATH_SIZE_MISMATCH */
183     status = ARM_MATH_SIZE_MISMATCH;
184   }
185   else
186 
187 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
188 
189   {
190     if (pDst->numRows == pDst->numCols)
191     {
192         if (pDst->numCols == 1)
193         {
194           pDst->pData[0] = pSrc->pData[0];
195           return(ARM_MATH_SUCCESS);
196         }
197         if (pDst->numCols == 2)
198             return arm_mat_trans_16bit_2x2((uint16_t  *)pSrc->pData, (uint16_t  *)pDst->pData);
199         if (pDst->numCols == 3)
200             return arm_mat_trans_16bit_3x3_mve((uint16_t  *)pSrc->pData, (uint16_t  *)pDst->pData);
201         if (pDst->numCols == 4)
202             return arm_mat_trans_16bit_4x4_mve((uint16_t  *)pSrc->pData, (uint16_t  *)pDst->pData);
203     }
204 
205     arm_mat_trans_16bit_generic(pSrc->numRows, pSrc->numCols, (uint16_t  *)pSrc->pData, (uint16_t  *)pDst->pData);
206       /* Set status as ARM_MATH_SUCCESS */
207     status = ARM_MATH_SUCCESS;
208   }
209 
210   /* Return to application */
211   return (status);
212 }
213 #else
arm_mat_trans_q15(const arm_matrix_instance_q15 * pSrc,arm_matrix_instance_q15 * pDst)214 arm_status arm_mat_trans_q15(
215   const arm_matrix_instance_q15 * pSrc,
216         arm_matrix_instance_q15 * pDst)
217 {
218         q15_t *pIn = pSrc->pData;                      /* input data matrix pointer */
219         q15_t *pOut = pDst->pData;                     /* output data matrix pointer */
220         uint16_t nRows = pSrc->numRows;                /* number of rows */
221         uint16_t nCols = pSrc->numCols;                /* number of columns */
222         uint32_t col, row = nRows, i = 0U;             /* Loop counters */
223         arm_status status;                             /* status of matrix transpose */
224 
225 #if defined (ARM_MATH_LOOPUNROLL)
226         q31_t in;                                      /* variable to hold temporary output  */
227 #endif
228 
229 #ifdef ARM_MATH_MATRIX_CHECK
230 
231   /* Check for matrix mismatch condition */
232   if ((pSrc->numRows != pDst->numCols) ||
233       (pSrc->numCols != pDst->numRows)   )
234   {
235     /* Set status as ARM_MATH_SIZE_MISMATCH */
236     status = ARM_MATH_SIZE_MISMATCH;
237   }
238   else
239 
240 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
241 
242   {
243     /* Matrix transpose by exchanging the rows with columns */
244     /* row loop */
245     do
246     {
247       /* Pointer pOut is set to starting address of column being processed */
248       pOut = pDst->pData + i;
249 
250 #if defined (ARM_MATH_LOOPUNROLL)
251 
252       /* Loop unrolling: Compute 4 outputs at a time */
253       col = nCols >> 2U;
254 
255       while (col > 0U)        /* column loop */
256       {
257         /* Read two elements from row */
258         in = read_q15x2_ia ((q15_t **) &pIn);
259 
260         /* Unpack and store one element in  destination */
261 #ifndef ARM_MATH_BIG_ENDIAN
262         *pOut = (q15_t) in;
263 #else
264         *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
265 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
266 
267         /* Update pointer pOut to point to next row of transposed matrix */
268         pOut += nRows;
269 
270         /* Unpack and store second element in destination */
271 #ifndef ARM_MATH_BIG_ENDIAN
272         *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
273 #else
274         *pOut = (q15_t) in;
275 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
276 
277         /* Update  pointer pOut to point to next row of transposed matrix */
278         pOut += nRows;
279 
280         /* Read two elements from row */
281         in = read_q15x2_ia ((q15_t **) &pIn);
282 
283         /* Unpack and store one element in destination */
284 #ifndef ARM_MATH_BIG_ENDIAN
285         *pOut = (q15_t) in;
286 #else
287         *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
288 
289 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
290 
291         /* Update pointer pOut to point to next row of transposed matrix */
292         pOut += nRows;
293 
294         /* Unpack and store second element in destination */
295 #ifndef ARM_MATH_BIG_ENDIAN
296         *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
297 #else
298         *pOut = (q15_t) in;
299 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
300 
301         /* Update pointer pOut to point to next row of transposed matrix */
302         pOut += nRows;
303 
304         /* Decrement column loop counter */
305         col--;
306       }
307 
308       /* Loop unrolling: Compute remaining outputs */
309       col = nCols % 0x4U;
310 
311 #else
312 
313       /* Initialize col with number of samples */
314       col = nCols;
315 
316 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
317 
318       while (col > 0U)
319       {
320         /* Read and store input element in destination */
321         *pOut = *pIn++;
322 
323         /* Update pointer pOut to point to next row of transposed matrix */
324         pOut += nRows;
325 
326         /* Decrement column loop counter */
327         col--;
328       }
329 
330       i++;
331 
332       /* Decrement row loop counter */
333       row--;
334 
335     } while (row > 0U);          /* row loop end */
336 
337     /* Set status as ARM_MATH_SUCCESS */
338     status = ARM_MATH_SUCCESS;
339   }
340 
341   /* Return to application */
342   return (status);
343 }
344 #endif /* defined(ARM_MATH_MVEI) */
345 
346 /**
347   @} end of MatrixTrans group
348  */
349