• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mat_add_f32.c
4  * Description:  Floating-point matrix addition
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/matrix_functions.h"
30 
31 /**
32   @ingroup groupMatrix
33  */
34 
35 /**
36   @defgroup MatrixAdd Matrix Addition
37 
38   Adds two matrices.
39   @par Addition of two 3 x 3 matrices
40 
41   \f[
42   \begin{pmatrix}
43    a_{1,1} & a_{1,2} & a_{1,3} \\
44    a_{2,1} & a_{2,2} & a_{2,3} \\
45    a_{3,1} & a_{3,2} & a_{3,3} \\
46   \end{pmatrix}
47   +
48   \begin{pmatrix}
49    b_{1,1} & b_{1,2} & b_{1,3} \\
50    b_{2,1} & b_{2,2} & b_{2,3} \\
51    b_{3,1} & b_{3,2} & b_{3,3} \\
52   \end{pmatrix}
53   =
54   \begin{pmatrix}
55    a_{1,1}+b_{1,1} & a_{1,2}+b_{1,2} & a_{1,3}+b_{1,3} \\
56    a_{2,1}+b_{2,1} & a_{2,2}+b_{2,2} & a_{2,3}+b_{2,3} \\
57    a_{3,1}+b_{3,1} & a_{3,2}+b_{3,2} & a_{3,3}+b_{3,3} \\
58   \end{pmatrix}
59   \f]
60 
61   When <code>ARM_MATH_MATRIX_CHECK</code> is defined, the functions check that
62   <code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
63   number of rows and columns, and return ARM_MATH_SIZE_MISMATCH if they do not.
64  */
65 
66 /**
67   @addtogroup MatrixAdd
68   @{
69  */
70 
71 
72 /**
73   @brief         Floating-point matrix addition.
74   @param[in]     pSrcA      points to first input matrix structure
75   @param[in]     pSrcB      points to second input matrix structure
76   @param[out]    pDst       points to output matrix structure
77   @return        execution status
78                    - \ref ARM_MATH_SUCCESS       : Operation successful
79                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
80  */
81 
82 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_mat_add_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)83 arm_status arm_mat_add_f32(
84   const arm_matrix_instance_f32 * pSrcA,
85   const arm_matrix_instance_f32 * pSrcB,
86   arm_matrix_instance_f32 * pDst)
87 {
88     arm_status status;
89     uint32_t  numSamples;       /* total number of elements in the matrix  */
90     float32_t *pDataA, *pDataB, *pDataDst;
91     f32x4_t vecA, vecB, vecDst = { 0 };
92     float32_t const *pSrcAVec;
93     float32_t const *pSrcBVec;
94     uint32_t  blkCnt;           /* loop counters */
95 
96     pDataA = pSrcA->pData;
97     pDataB = pSrcB->pData;
98     pDataDst = pDst->pData;
99     pSrcAVec = (float32_t const *) pDataA;
100     pSrcBVec = (float32_t const *) pDataB;
101 
102 #ifdef ARM_MATH_MATRIX_CHECK
103   /* Check for matrix mismatch condition */
104   if ((pSrcA->numRows != pSrcB->numRows) ||
105      (pSrcA->numCols != pSrcB->numCols) ||
106      (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
107   {
108     /* Set status as ARM_MATH_SIZE_MISMATCH */
109     status = ARM_MATH_SIZE_MISMATCH;
110   }
111   else
112 #endif
113  {
114     /*
115      * Total number of samples in the input matrix
116      */
117     numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
118     blkCnt = numSamples >> 2;
119     while (blkCnt > 0U)
120     {
121         /* C(m,n) = A(m,n) + B(m,n) */
122         /* Add and then store the results in the destination buffer. */
123         vecA = vld1q(pSrcAVec);
124         pSrcAVec += 4;
125         vecB = vld1q(pSrcBVec);
126         pSrcBVec += 4;
127         vecDst = vaddq(vecA, vecB);
128         vst1q(pDataDst, vecDst);
129         pDataDst += 4;
130         /*
131          * Decrement the blockSize loop counter
132          */
133         blkCnt--;
134     }
135     /*
136      * tail
137      */
138     blkCnt = numSamples & 3;
139     if (blkCnt > 0U)
140     {
141         mve_pred16_t p0 = vctp32q(blkCnt);
142         vecA = vld1q(pSrcAVec);
143         vecB = vld1q(pSrcBVec);
144         vecDst = vaddq_m(vecDst, vecA, vecB, p0);
145         vstrwq_p(pDataDst, vecDst, p0);
146     }
147     /* set status as ARM_MATH_SUCCESS */
148     status = ARM_MATH_SUCCESS;
149   }
150   return (status);
151 }
152 #else
153 #if defined(ARM_MATH_NEON)
154 /*
155 
156 The Neon version assumes the matrix is small enough,
157 so no blocking is used to account for cache effects.
158 For big matrices, better Neon libraries exist.
159 
160 */
arm_mat_add_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)161 arm_status arm_mat_add_f32(
162   const arm_matrix_instance_f32 * pSrcA,
163   const arm_matrix_instance_f32 * pSrcB,
164   arm_matrix_instance_f32 * pDst)
165 {
166   float32_t *pIn1 = pSrcA->pData;                /* input data matrix pointer A  */
167   float32_t *pIn2 = pSrcB->pData;                /* input data matrix pointer B  */
168   float32_t *pOut = pDst->pData;                 /* output data matrix pointer   */
169 
170 
171   uint32_t numSamples;                           /* total number of elements in the matrix  */
172   uint32_t blkCnt;                               /* loop counters */
173   arm_status status;                             /* status of matrix addition */
174 
175 #ifdef ARM_MATH_MATRIX_CHECK
176   /* Check for matrix mismatch condition */
177   if ((pSrcA->numRows != pSrcB->numRows) ||
178      (pSrcA->numCols != pSrcB->numCols) ||
179      (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
180   {
181     /* Set status as ARM_MATH_SIZE_MISMATCH */
182     status = ARM_MATH_SIZE_MISMATCH;
183   }
184   else
185 #endif
186   {
187     float32x4_t vec1;
188     float32x4_t vec2;
189     float32x4_t res;
190 
191     /* Total number of samples in the input matrix */
192     numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
193 
194     blkCnt = numSamples >> 2U;
195 
196     /* Compute 4 outputs at a time.
197      ** a second loop below computes the remaining 1 to 3 samples. */
198     while (blkCnt > 0U)
199     {
200       /* C(m,n) = A(m,n) + B(m,n) */
201       /* Add and then store the results in the destination buffer. */
202       vec1 = vld1q_f32(pIn1);
203       vec2 = vld1q_f32(pIn2);
204       res = vaddq_f32(vec1, vec2);
205       vst1q_f32(pOut, res);
206 
207       /* update pointers to process next samples */
208       pIn1 += 4U;
209       pIn2 += 4U;
210       pOut += 4U;
211       /* Decrement the loop counter */
212       blkCnt--;
213     }
214 
215     /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
216      ** No loop unrolling is used. */
217     blkCnt = numSamples % 0x4U;
218 
219     while (blkCnt > 0U)
220     {
221       /* C(m,n) = A(m,n) + B(m,n) */
222       /* Add and then store the results in the destination buffer. */
223       *pOut++ = (*pIn1++) + (*pIn2++);
224 
225       /* Decrement the loop counter */
226       blkCnt--;
227     }
228 
229     /* set status as ARM_MATH_SUCCESS */
230     status = ARM_MATH_SUCCESS;
231   }
232 
233   /* Return to application */
234   return (status);
235 }
236 #else
arm_mat_add_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)237 arm_status arm_mat_add_f32(
238   const arm_matrix_instance_f32 * pSrcA,
239   const arm_matrix_instance_f32 * pSrcB,
240         arm_matrix_instance_f32 * pDst)
241 {
242   float32_t *pInA = pSrcA->pData;                /* input data matrix pointer A */
243   float32_t *pInB = pSrcB->pData;                /* input data matrix pointer B */
244   float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
245 
246   uint32_t numSamples;                           /* total number of elements in the matrix */
247   uint32_t blkCnt;                               /* loop counters */
248   arm_status status;                             /* status of matrix addition */
249 
250 #ifdef ARM_MATH_MATRIX_CHECK
251 
252   /* Check for matrix mismatch condition */
253   if ((pSrcA->numRows != pSrcB->numRows) ||
254       (pSrcA->numCols != pSrcB->numCols) ||
255       (pSrcA->numRows != pDst->numRows)  ||
256       (pSrcA->numCols != pDst->numCols)    )
257   {
258     /* Set status as ARM_MATH_SIZE_MISMATCH */
259     status = ARM_MATH_SIZE_MISMATCH;
260   }
261   else
262 
263 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
264 
265   {
266     /* Total number of samples in input matrix */
267     numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
268 
269 #if defined (ARM_MATH_LOOPUNROLL)
270 
271     /* Loop unrolling: Compute 4 outputs at a time */
272     blkCnt = numSamples >> 2U;
273 
274     while (blkCnt > 0U)
275     {
276       /* C(m,n) = A(m,n) + B(m,n) */
277 
278       /* Add and store result in destination buffer. */
279       *pOut++ = *pInA++ + *pInB++;
280 
281       *pOut++ = *pInA++ + *pInB++;
282 
283       *pOut++ = *pInA++ + *pInB++;
284 
285       *pOut++ = *pInA++ + *pInB++;
286 
287       /* Decrement loop counter */
288       blkCnt--;
289     }
290 
291     /* Loop unrolling: Compute remaining outputs */
292     blkCnt = numSamples % 0x4U;
293 
294 #else
295 
296     /* Initialize blkCnt with number of samples */
297     blkCnt = numSamples;
298 
299 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
300 
301     while (blkCnt > 0U)
302     {
303       /* C(m,n) = A(m,n) + B(m,n) */
304 
305       /* Add and store result in destination buffer. */
306       *pOut++ = *pInA++ + *pInB++;
307 
308       /* Decrement loop counter */
309       blkCnt--;
310     }
311 
312     /* Set status as ARM_MATH_SUCCESS */
313     status = ARM_MATH_SUCCESS;
314   }
315 
316   /* Return to application */
317   return (status);
318 }
319 #endif /* #if defined(ARM_MATH_NEON) */
320 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
321 
322 /**
323   @} end of MatrixAdd group
324  */
325