1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mat_add_f32.c
4 * Description: Floating-point matrix addition
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/matrix_functions.h"
30
31 /**
32 @ingroup groupMatrix
33 */
34
35 /**
36 @defgroup MatrixAdd Matrix Addition
37
38 Adds two matrices.
39 @par Addition of two 3 x 3 matrices
40
41 \f[
42 \begin{pmatrix}
43 a_{1,1} & a_{1,2} & a_{1,3} \\
44 a_{2,1} & a_{2,2} & a_{2,3} \\
45 a_{3,1} & a_{3,2} & a_{3,3} \\
46 \end{pmatrix}
47 +
48 \begin{pmatrix}
49 b_{1,1} & b_{1,2} & b_{1,3} \\
50 b_{2,1} & b_{2,2} & b_{2,3} \\
51 b_{3,1} & b_{3,2} & b_{3,3} \\
52 \end{pmatrix}
53 =
54 \begin{pmatrix}
55 a_{1,1}+b_{1,1} & a_{1,2}+b_{1,2} & a_{1,3}+b_{1,3} \\
56 a_{2,1}+b_{2,1} & a_{2,2}+b_{2,2} & a_{2,3}+b_{2,3} \\
57 a_{3,1}+b_{3,1} & a_{3,2}+b_{3,2} & a_{3,3}+b_{3,3} \\
58 \end{pmatrix}
59 \f]
60
 When <code>ARM_MATH_MATRIX_CHECK</code> is defined, the functions check that
 <code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
 number of rows and columns, and return ARM_MATH_SIZE_MISMATCH if they do not.
64 */
65
66 /**
67 @addtogroup MatrixAdd
68 @{
69 */
70
71
72 /**
73 @brief Floating-point matrix addition.
74 @param[in] pSrcA points to first input matrix structure
75 @param[in] pSrcB points to second input matrix structure
76 @param[out] pDst points to output matrix structure
77 @return execution status
78 - \ref ARM_MATH_SUCCESS : Operation successful
79 - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
80 */
81
82 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_mat_add_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)83 arm_status arm_mat_add_f32(
84 const arm_matrix_instance_f32 * pSrcA,
85 const arm_matrix_instance_f32 * pSrcB,
86 arm_matrix_instance_f32 * pDst)
87 {
88 arm_status status;
89 uint32_t numSamples; /* total number of elements in the matrix */
90 float32_t *pDataA, *pDataB, *pDataDst;
91 f32x4_t vecA, vecB, vecDst = { 0 };
92 float32_t const *pSrcAVec;
93 float32_t const *pSrcBVec;
94 uint32_t blkCnt; /* loop counters */
95
96 pDataA = pSrcA->pData;
97 pDataB = pSrcB->pData;
98 pDataDst = pDst->pData;
99 pSrcAVec = (float32_t const *) pDataA;
100 pSrcBVec = (float32_t const *) pDataB;
101
102 #ifdef ARM_MATH_MATRIX_CHECK
103 /* Check for matrix mismatch condition */
104 if ((pSrcA->numRows != pSrcB->numRows) ||
105 (pSrcA->numCols != pSrcB->numCols) ||
106 (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
107 {
108 /* Set status as ARM_MATH_SIZE_MISMATCH */
109 status = ARM_MATH_SIZE_MISMATCH;
110 }
111 else
112 #endif
113 {
114 /*
115 * Total number of samples in the input matrix
116 */
117 numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
118 blkCnt = numSamples >> 2;
119 while (blkCnt > 0U)
120 {
121 /* C(m,n) = A(m,n) + B(m,n) */
122 /* Add and then store the results in the destination buffer. */
123 vecA = vld1q(pSrcAVec);
124 pSrcAVec += 4;
125 vecB = vld1q(pSrcBVec);
126 pSrcBVec += 4;
127 vecDst = vaddq(vecA, vecB);
128 vst1q(pDataDst, vecDst);
129 pDataDst += 4;
130 /*
131 * Decrement the blockSize loop counter
132 */
133 blkCnt--;
134 }
135 /*
136 * tail
137 */
138 blkCnt = numSamples & 3;
139 if (blkCnt > 0U)
140 {
141 mve_pred16_t p0 = vctp32q(blkCnt);
142 vecA = vld1q(pSrcAVec);
143 vecB = vld1q(pSrcBVec);
144 vecDst = vaddq_m(vecDst, vecA, vecB, p0);
145 vstrwq_p(pDataDst, vecDst, p0);
146 }
147 /* set status as ARM_MATH_SUCCESS */
148 status = ARM_MATH_SUCCESS;
149 }
150 return (status);
151 }
152 #else
153 #if defined(ARM_MATH_NEON)
154 /*
155
156 Neon version is assuming the matrix is small enough.
157 So no blocking is used for taking into account cache effects.
158 For big matrix, there exist better libraries for Neon.
159
160 */
arm_mat_add_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)161 arm_status arm_mat_add_f32(
162 const arm_matrix_instance_f32 * pSrcA,
163 const arm_matrix_instance_f32 * pSrcB,
164 arm_matrix_instance_f32 * pDst)
165 {
166 float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
167 float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
168 float32_t *pOut = pDst->pData; /* output data matrix pointer */
169
170
171 uint32_t numSamples; /* total number of elements in the matrix */
172 uint32_t blkCnt; /* loop counters */
173 arm_status status; /* status of matrix addition */
174
175 #ifdef ARM_MATH_MATRIX_CHECK
176 /* Check for matrix mismatch condition */
177 if ((pSrcA->numRows != pSrcB->numRows) ||
178 (pSrcA->numCols != pSrcB->numCols) ||
179 (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
180 {
181 /* Set status as ARM_MATH_SIZE_MISMATCH */
182 status = ARM_MATH_SIZE_MISMATCH;
183 }
184 else
185 #endif
186 {
187 float32x4_t vec1;
188 float32x4_t vec2;
189 float32x4_t res;
190
191 /* Total number of samples in the input matrix */
192 numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
193
194 blkCnt = numSamples >> 2U;
195
196 /* Compute 4 outputs at a time.
197 ** a second loop below computes the remaining 1 to 3 samples. */
198 while (blkCnt > 0U)
199 {
200 /* C(m,n) = A(m,n) + B(m,n) */
201 /* Add and then store the results in the destination buffer. */
202 vec1 = vld1q_f32(pIn1);
203 vec2 = vld1q_f32(pIn2);
204 res = vaddq_f32(vec1, vec2);
205 vst1q_f32(pOut, res);
206
207 /* update pointers to process next samples */
208 pIn1 += 4U;
209 pIn2 += 4U;
210 pOut += 4U;
211 /* Decrement the loop counter */
212 blkCnt--;
213 }
214
215 /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
216 ** No loop unrolling is used. */
217 blkCnt = numSamples % 0x4U;
218
219 while (blkCnt > 0U)
220 {
221 /* C(m,n) = A(m,n) + B(m,n) */
222 /* Add and then store the results in the destination buffer. */
223 *pOut++ = (*pIn1++) + (*pIn2++);
224
225 /* Decrement the loop counter */
226 blkCnt--;
227 }
228
229 /* set status as ARM_MATH_SUCCESS */
230 status = ARM_MATH_SUCCESS;
231 }
232
233 /* Return to application */
234 return (status);
235 }
236 #else
arm_mat_add_f32(const arm_matrix_instance_f32 * pSrcA,const arm_matrix_instance_f32 * pSrcB,arm_matrix_instance_f32 * pDst)237 arm_status arm_mat_add_f32(
238 const arm_matrix_instance_f32 * pSrcA,
239 const arm_matrix_instance_f32 * pSrcB,
240 arm_matrix_instance_f32 * pDst)
241 {
242 float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
243 float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
244 float32_t *pOut = pDst->pData; /* output data matrix pointer */
245
246 uint32_t numSamples; /* total number of elements in the matrix */
247 uint32_t blkCnt; /* loop counters */
248 arm_status status; /* status of matrix addition */
249
250 #ifdef ARM_MATH_MATRIX_CHECK
251
252 /* Check for matrix mismatch condition */
253 if ((pSrcA->numRows != pSrcB->numRows) ||
254 (pSrcA->numCols != pSrcB->numCols) ||
255 (pSrcA->numRows != pDst->numRows) ||
256 (pSrcA->numCols != pDst->numCols) )
257 {
258 /* Set status as ARM_MATH_SIZE_MISMATCH */
259 status = ARM_MATH_SIZE_MISMATCH;
260 }
261 else
262
263 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
264
265 {
266 /* Total number of samples in input matrix */
267 numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
268
269 #if defined (ARM_MATH_LOOPUNROLL)
270
271 /* Loop unrolling: Compute 4 outputs at a time */
272 blkCnt = numSamples >> 2U;
273
274 while (blkCnt > 0U)
275 {
276 /* C(m,n) = A(m,n) + B(m,n) */
277
278 /* Add and store result in destination buffer. */
279 *pOut++ = *pInA++ + *pInB++;
280
281 *pOut++ = *pInA++ + *pInB++;
282
283 *pOut++ = *pInA++ + *pInB++;
284
285 *pOut++ = *pInA++ + *pInB++;
286
287 /* Decrement loop counter */
288 blkCnt--;
289 }
290
291 /* Loop unrolling: Compute remaining outputs */
292 blkCnt = numSamples % 0x4U;
293
294 #else
295
296 /* Initialize blkCnt with number of samples */
297 blkCnt = numSamples;
298
299 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
300
301 while (blkCnt > 0U)
302 {
303 /* C(m,n) = A(m,n) + B(m,n) */
304
305 /* Add and store result in destination buffer. */
306 *pOut++ = *pInA++ + *pInB++;
307
308 /* Decrement loop counter */
309 blkCnt--;
310 }
311
312 /* Set status as ARM_MATH_SUCCESS */
313 status = ARM_MATH_SUCCESS;
314 }
315
316 /* Return to application */
317 return (status);
318 }
319 #endif /* #if defined(ARM_MATH_NEON) */
320 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
321
322 /**
323 @} end of MatrixAdd group
324 */
325