1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cmplx_mag_f32.c
4 * Description: Floating-point complex magnitude
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/complex_math_functions.h"
30
31 /**
32 @ingroup groupCmplxMath
33 */
34
35 /**
36 @defgroup cmplx_mag Complex Magnitude
37
38 Computes the magnitude of the elements of a complex data vector.
39
40 The <code>pSrc</code> points to the source data and
41 <code>pDst</code> points to the where the result should be written.
42 <code>numSamples</code> specifies the number of complex samples
43 in the input array and the data is stored in an interleaved fashion
44 (real, imag, real, imag, ...).
45 The input array has a total of <code>2*numSamples</code> values;
46 the output array has a total of <code>numSamples</code> values.
47
48 The underlying algorithm is used:
49
50 <pre>
51 for (n = 0; n < numSamples; n++) {
52 pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
53 }
54 </pre>
55
56 There are separate functions for floating-point, Q15, and Q31 data types.
57 */
58
59 /**
60 @addtogroup cmplx_mag
61 @{
62 */
63
64 /**
65 @brief Floating-point complex magnitude.
66 @param[in] pSrc points to input vector
67 @param[out] pDst points to output vector
68 @param[in] numSamples number of samples in each vector
69 @return none
70 */
71
72 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
73 #include "arm_vec_math.h"
74 #endif
75
76 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
77
78 #include "arm_helium_utils.h"
79
80
arm_cmplx_mag_f32(const float32_t * pSrc,float32_t * pDst,uint32_t numSamples)81 void arm_cmplx_mag_f32(
82 const float32_t * pSrc,
83 float32_t * pDst,
84 uint32_t numSamples)
85 {
86 int32_t blockSize = numSamples; /* loop counters */
87 uint32_t blkCnt; /* loop counters */
88 f32x4x2_t vecSrc;
89 f32x4_t sum;
90 float32_t real, imag; /* Temporary variables to hold input values */
91
92 /* Compute 4 complex samples at a time */
93 blkCnt = blockSize >> 2;
94 while (blkCnt > 0U)
95 {
96 q31x4_t newtonStartVec;
97 f32x4_t sumHalf, invSqrt;
98
99 vecSrc = vld2q(pSrc);
100 pSrc += 8;
101 sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
102 sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
103
104 /*
105 * inlined Fast SQRT using inverse SQRT newton-raphson method
106 */
107
108 /* compute initial value */
109 newtonStartVec = vdupq_n_s32(INVSQRT_MAGIC_F32) - vshrq((q31x4_t) sum, 1);
110 sumHalf = sum * 0.5f;
111 /*
112 * compute 3 x iterations
113 *
114 * The more iterations, the more accuracy.
115 * If you need to trade a bit of accuracy for more performance,
116 * you can comment out the 3rd use of the macro.
117 */
118 INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, (f32x4_t) newtonStartVec);
119 INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
120 INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
121 /*
122 * set negative values to 0
123 */
124 invSqrt = vdupq_m(invSqrt, 0.0f, vcmpltq(invSqrt, 0.0f));
125 /*
126 * sqrt(x) = x * invSqrt(x)
127 */
128 sum = vmulq(sum, invSqrt);
129 vst1q(pDst, sum);
130 pDst += 4;
131 /*
132 * Decrement the blockSize loop counter
133 */
134 blkCnt--;
135 }
136 /*
137 * tail
138 */
139 blkCnt = blockSize & 3;
140 while (blkCnt > 0U)
141 {
142 /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
143
144 real = *pSrc++;
145 imag = *pSrc++;
146
147 /* store result in destination buffer. */
148 arm_sqrt_f32((real * real) + (imag * imag), pDst++);
149
150 /* Decrement loop counter */
151 blkCnt--;
152 }
153 }
154
155 #else
arm_cmplx_mag_f32(const float32_t * pSrc,float32_t * pDst,uint32_t numSamples)156 void arm_cmplx_mag_f32(
157 const float32_t * pSrc,
158 float32_t * pDst,
159 uint32_t numSamples)
160 {
161 uint32_t blkCnt; /* loop counter */
162 float32_t real, imag; /* Temporary variables to hold input values */
163
164 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
165
166 float32x4x2_t vecA;
167 float32x4_t vRealA;
168 float32x4_t vImagA;
169 float32x4_t vMagSqA;
170
171 float32x4x2_t vecB;
172 float32x4_t vRealB;
173 float32x4_t vImagB;
174 float32x4_t vMagSqB;
175
176 /* Loop unrolling: Compute 8 outputs at a time */
177 blkCnt = numSamples >> 3;
178
179 while (blkCnt > 0U)
180 {
181 /* out = sqrt((real * real) + (imag * imag)) */
182
183 vecA = vld2q_f32(pSrc);
184 pSrc += 8;
185
186 vecB = vld2q_f32(pSrc);
187 pSrc += 8;
188
189 vRealA = vmulq_f32(vecA.val[0], vecA.val[0]);
190 vImagA = vmulq_f32(vecA.val[1], vecA.val[1]);
191 vMagSqA = vaddq_f32(vRealA, vImagA);
192
193 vRealB = vmulq_f32(vecB.val[0], vecB.val[0]);
194 vImagB = vmulq_f32(vecB.val[1], vecB.val[1]);
195 vMagSqB = vaddq_f32(vRealB, vImagB);
196
197 /* Store the result in the destination buffer. */
198 vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqA));
199 pDst += 4;
200
201 vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqB));
202 pDst += 4;
203
204 /* Decrement the loop counter */
205 blkCnt--;
206 }
207
208 blkCnt = numSamples & 7;
209
210 #else
211
212 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
213
214 /* Loop unrolling: Compute 4 outputs at a time */
215 blkCnt = numSamples >> 2U;
216
217 while (blkCnt > 0U)
218 {
219 /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
220
221 real = *pSrc++;
222 imag = *pSrc++;
223
224 /* store result in destination buffer. */
225 arm_sqrt_f32((real * real) + (imag * imag), pDst++);
226
227 real = *pSrc++;
228 imag = *pSrc++;
229 arm_sqrt_f32((real * real) + (imag * imag), pDst++);
230
231 real = *pSrc++;
232 imag = *pSrc++;
233 arm_sqrt_f32((real * real) + (imag * imag), pDst++);
234
235 real = *pSrc++;
236 imag = *pSrc++;
237 arm_sqrt_f32((real * real) + (imag * imag), pDst++);
238
239 /* Decrement loop counter */
240 blkCnt--;
241 }
242
243 /* Loop unrolling: Compute remaining outputs */
244 blkCnt = numSamples % 0x4U;
245
246 #else
247
248 /* Initialize blkCnt with number of samples */
249 blkCnt = numSamples;
250
251 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
252 #endif /* #if defined(ARM_MATH_NEON) */
253
254 while (blkCnt > 0U)
255 {
256 /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
257
258 real = *pSrc++;
259 imag = *pSrc++;
260
261 /* store result in destination buffer. */
262 arm_sqrt_f32((real * real) + (imag * imag), pDst++);
263
264 /* Decrement loop counter */
265 blkCnt--;
266 }
267
268 }
269 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
270
271 /**
272 @} end of cmplx_mag group
273 */
274