1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_mse_f32.c
4 * Description: Floating point mean square error
5 *
6 * $Date: 05 April 2022
7 * $Revision: V1.10.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2022 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/statistics_functions.h"
30
31 /**
32 @ingroup groupStats
33 */
34
35 /**
36 @addtogroup MSE
37 @{
38 */
39
40 /**
41 @brief Mean square error between two floating point vectors.
42 @param[in] pSrcA points to the first input vector
43 @param[in] pSrcB points to the second input vector
44 @param[in] blockSize number of samples in input vector
45 @param[out] pResult mean square error
46 @return none
47 */
48
49 #if !defined(ARM_MATH_AUTOVECTORIZE)
50
51 #if defined(ARM_MATH_MVEF)
52 #include "arm_helium_utils.h"
53
/*
 * Helium (MVE) variant of arm_mse_f32.
 *
 * Accumulates the squared differences (pSrcA[i] - pSrcB[i])^2 in a
 * four-lane vector accumulator, reduces the lanes to a scalar and divides
 * by blockSize.
 *
 * pSrcA, pSrcB : input vectors, blockSize samples each
 * blockSize    : number of samples in each input vector
 * pResult      : receives the mean square error
 */
void arm_mse_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t blockSize,
    float32_t * pResult)
{
    float32x4_t vecA, vecB;
    float32x4_t vecSum;
    uint32_t blkCnt;
    float32_t sum = 0.0f;

    vecSum = vdupq_n_f32(0.0f);

    /* Main loop: process 4 samples per iteration */
    blkCnt = (blockSize) >> 2;
    while (blkCnt > 0U)
    {
        vecA = vld1q(pSrcA);
        pSrcA += 4;

        vecB = vld1q(pSrcB);
        pSrcB += 4;

        /* Per-lane difference, then accumulate its square */
        vecA = vsubq(vecA, vecB);
        vecSum = vfmaq(vecSum, vecA, vecA);

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 3 samples left over after the 4-wide loop */
    blkCnt = (blockSize) & 3;
    if (blkCnt > 0U)
    {
        /* Predicate covering only the first blkCnt 32-bit lanes */
        mve_pred16_t p0 = vctp32q(blkCnt);

        /*
         * Use predicated (zeroing) loads so that lanes beyond the end of
         * the input buffers are never fetched. A plain vld1q here would
         * read up to 3 floats past the last valid sample of each buffer.
         */
        vecA = vld1q_z(pSrcA, p0);
        vecB = vld1q_z(pSrcB, p0);

        vecA = vsubq(vecA, vecB);
        /* Only the active lanes are accumulated */
        vecSum = vfmaq_m(vecSum, vecA, vecA, p0);
    }

    /* Collapse the lane accumulators into a scalar via the project helper */
    sum = vecAddAcrossF32Mve(vecSum);

    /* Store result in destination buffer */
    *pResult = sum / blockSize;
}
104
105 #endif
106
107 #if defined(ARM_MATH_NEON)
/*
 * Neon variant of arm_mse_f32.
 *
 * Squared differences are accumulated four at a time in a vector
 * accumulator, the four partial sums are added together, and any 1 to 3
 * leftover samples are then folded in with scalar arithmetic before the
 * total is divided by blockSize.
 *
 * pSrcA, pSrcB : input vectors, blockSize samples each
 * blockSize    : number of samples in each input vector
 * pResult      : receives the mean square error
 */
void arm_mse_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t blockSize,
    float32_t * pResult)
{
    float32x4_t acc4 = vdupq_n_f32(0.0f);   /* four running partial sums */
    float32x4_t diff4;                      /* per-lane differences */
    float32_t acc;                          /* scalar running total */
    float32_t diff;                         /* scalar difference for the tail */
    uint32_t count;                         /* loop counter */
#if !defined(__aarch64__)
    f32x2_t pairSums;                       /* pairwise sums of acc4 */
#endif

    /* Vector part: 4 samples per iteration */
    for (count = blockSize >> 2U; count != 0U; count--)
    {
        diff4 = vsubq_f32(vld1q_f32(pSrcA), vld1q_f32(pSrcB));
        acc4 = vfmaq_f32(acc4, diff4, diff4);

        pSrcA += 4;
        pSrcB += 4;
    }

    /* Horizontal add of the four partial sums */
#if defined(__aarch64__)
    acc = vpadds_f32(vpadd_f32(vget_low_f32(acc4), vget_high_f32(acc4)));
#else
    pairSums = vpadd_f32(vget_low_f32(acc4), vget_high_f32(acc4));
    acc = vget_lane_f32(pairSums, 0) + vget_lane_f32(pairSums, 1);
#endif

    /* Scalar part: remaining 1 to 3 samples */
    for (count = blockSize & 3U; count != 0U; count--)
    {
        diff = *pSrcA++ - *pSrcB++;
        acc += diff * diff;
    }

    /* Mean of the accumulated squared differences */
    *pResult = acc / blockSize;
}
169 #endif
170
171 #endif /*#if !defined(ARM_MATH_AUTOVECTORIZE)*/
172
173
174
175 #if (!defined(ARM_MATH_MVEF) && !defined(ARM_MATH_NEON)) || defined(ARM_MATH_AUTOVECTORIZE)
176
177
/*
 * Portable scalar variant of arm_mse_f32.
 *
 * Sums (pSrcA[i] - pSrcB[i])^2 over all samples, one sample after the
 * other, and divides the total by blockSize. When ARM_MATH_LOOPUNROLL is
 * defined the bulk of the samples is handled four per iteration; the
 * samples are still accumulated in the same order.
 *
 * pSrcA, pSrcB : input vectors, blockSize samples each
 * blockSize    : number of samples in each input vector
 * pResult      : receives the mean square error
 */
void arm_mse_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t blockSize,
    float32_t * pResult)
{
    float32_t acc = 0.0f;   /* running sum of squared differences */
    float32_t diff;         /* difference of the current sample pair */
    uint32_t remaining;     /* samples still to be handled one by one */

#if defined (ARM_MATH_LOOPUNROLL)
    uint32_t groups;        /* number of complete groups of 4 samples */

    /* Unrolled part: four samples per iteration */
    for (groups = blockSize >> 2U; groups != 0U; groups--)
    {
        diff = pSrcA[0] - pSrcB[0];
        acc += diff * diff;

        diff = pSrcA[1] - pSrcB[1];
        acc += diff * diff;

        diff = pSrcA[2] - pSrcB[2];
        acc += diff * diff;

        diff = pSrcA[3] - pSrcB[3];
        acc += diff * diff;

        pSrcA += 4;
        pSrcB += 4;
    }

    /* Whatever did not fit in a group of four */
    remaining = blockSize & 3U;
#else
    /* No unrolling: every sample goes through the loop below */
    remaining = blockSize;
#endif

    for (; remaining != 0U; remaining--)
    {
        diff = *pSrcA++ - *pSrcB++;
        acc += diff * diff;
    }

    /* Mean of the accumulated squared differences */
    *pResult = acc / blockSize;
}
242
243 #endif /* end of test for vector instruction availability */
244
245 /**
246 @} end of MSE group
247 */
248