1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cmplx_mult_cmplx_f32.c
4 * Description: Floating-point complex-by-complex multiplication
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/complex_math_functions.h"
30
31 /**
32 @ingroup groupCmplxMath
33 */
34
35 /**
36 @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
37
38 Multiplies a complex vector by another complex vector and generates a complex result.
39 The data in the complex arrays is stored in an interleaved fashion
40 (real, imag, real, imag, ...).
41 The parameter <code>numSamples</code> represents the number of complex
42 samples processed. The complex arrays have a total of <code>2*numSamples</code>
43 real values.
44
  The underlying algorithm is:
46
47 <pre>
48 for (n = 0; n < numSamples; n++) {
49 pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
50 pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
51 }
52 </pre>
53
54 There are separate functions for floating-point, Q15, and Q31 data types.
55 */
56
57 /**
58 @addtogroup CmplxByCmplxMult
59 @{
60 */
61
62 /**
63 @brief Floating-point complex-by-complex multiplication.
64 @param[in] pSrcA points to first input vector
65 @param[in] pSrcB points to second input vector
66 @param[out] pDst points to output vector
67 @param[in] numSamples number of samples in each vector
68 @return none
69 */
70
71 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
72
void arm_cmplx_mult_cmplx_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        float32_t * pDst,
        uint32_t numSamples)
{
    int32_t blkCnt;
    f32x4_t vecSrcA, vecSrcB;
    f32x4_t vecSrcC, vecSrcD;
    f32x4_t vec_acc;

    /* Each f32x4_t vector holds 2 interleaved complex samples (re, im, re, im).
     * One loop trip below consumes two vectors = 4 complex samples, so the main
     * loop count is numSamples / 4.  One group is peeled off into the epilogue
     * (blkCnt -= 1) to keep armclang from breaking the software pipeline. */
    blkCnt = numSamples >> 2;
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        /* Prologue: pre-load the first vector pair so every loop iteration can
         * overlap its loads with the previous iteration's arithmetic. */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        while (blkCnt > 0) {
            /* vcmulq (rotate 0) computes re=a.re*b.re, im=a.re*b.im;
             * vcmlaq_rot90 accumulates re-=a.im*b.im, im+=a.im*b.re,
             * together forming the full complex product per lane pair. */
            vec_acc = vcmulq(vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);          /* prefetch next A vector */
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);          /* prefetch next B vector */
            pSrcB += 4;
            vst1q(pDst, vec_acc);
            pDst += 4;

            /* Second half of the trip: multiply the freshly loaded C/D pair
             * while pre-loading A/B for the next iteration. */
            vec_acc = vcmulq(vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;
            vst1q(pDst, vec_acc);
            pDst += 4;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        /* NOTE: these two epilogue loads intentionally do NOT advance
         * pSrcA/pSrcB; the tail loop below compensates with pre-increments. */
        vec_acc = vcmulq(vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);
        vst1q(pDst, vec_acc);
        pDst += 4;

        vec_acc = vcmulq(vecSrcC, vecSrcD);
        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
        vst1q(pDst, vec_acc);
        pDst += 4;

        /*
         * tail
         */
        /* 0..3 complex samples remain = CMPLX_DIM*(numSamples&3) real values,
         * handled with tail predication (vctp32q masks off excess lanes). */
        blkCnt = CMPLX_DIM * (numSamples & 3);
        while (blkCnt > 0) {
            mve_pred16_t p = vctp32q(blkCnt);
            /* Pre-increment first: the epilogue left the pointers one vector
             * behind the data it consumed (see note above). */
            pSrcA += 4;
            pSrcB += 4;

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);
            vec_acc = vcmulq_m(vuninitializedq_f32(),vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);

            /* Predicated store: only the active (in-bounds) lanes are written. */
            vstrwq_p_f32(pDst, vec_acc, p);
            pDst += 4;

            blkCnt -= 4;
        }
    } else {
        /* small vector */
        /* numSamples < 8: process everything with predicated vectors only. */
        blkCnt = numSamples * CMPLX_DIM;
        vec_acc = vdupq_n_f32(0.0f);

        do {
            mve_pred16_t p = vctp32q(blkCnt);

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);

            vec_acc = vcmulq_m(vuninitializedq_f32(),vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
            vstrwq_p_f32(pDst, vec_acc, p);
            pDst += 4;

            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 4;
            pSrcB += 4;
            blkCnt -= 4;
        }
        while (blkCnt > 0);
    }

}
180
181 #else
void arm_cmplx_mult_cmplx_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        float32_t * pDst,
        uint32_t numSamples)
{
  uint32_t count;                         /* Remaining samples / groups to process */
  float32_t reA, imA, reB, imB;           /* Scalar staging for one complex pair */

#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
  float32x4x2_t vecA, vecB;
  float32x4x2_t vecOut;

  /* Vector pass: 4 complex products per iteration. */
  count = numSamples >> 2U;

  while (count > 0U)
  {
    /* De-interleaving loads: val[0] holds reals, val[1] holds imaginaries. */
    vecA = vld2q_f32(pSrcA);
    vecB = vld2q_f32(pSrcB);
    pSrcA += 8;
    pSrcB += 8;

    /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
    vecOut.val[0] = vmulq_f32(vecA.val[0], vecB.val[0]);
    vecOut.val[0] = vmlsq_f32(vecOut.val[0], vecA.val[1], vecB.val[1]);

    /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
    vecOut.val[1] = vmulq_f32(vecA.val[0], vecB.val[1]);
    vecOut.val[1] = vmlaq_f32(vecOut.val[1], vecA.val[1], vecB.val[0]);

    /* Re-interleaving store of 4 complex results. */
    vst2q_f32(pDst, vecOut);
    pDst += 8;

    count--;
  }

  /* 0..3 complex samples left for the scalar loop below. */
  count = numSamples & 3;

#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Unrolled pass: 4 complex products per trip.
   * C[2i]   = A[2i]*B[2i]   - A[2i+1]*B[2i+1]
   * C[2i+1] = A[2i]*B[2i+1] + A[2i+1]*B[2i]   */
  count = numSamples >> 2U;

  while (count > 0U)
  {
    for (uint32_t k = 0U; k < 4U; k++)
    {
      reA = *pSrcA++;
      imA = *pSrcA++;
      reB = *pSrcB++;
      imB = *pSrcB++;

      /* Store the complex product into the destination buffer. */
      *pDst++ = (reA * reB) - (imA * imB);
      *pDst++ = (reA * imB) + (imA * reB);
    }

    count--;
  }

  /* Remaining 0..3 samples fall through to the scalar loop. */
  count = numSamples % 0x4U;

#else

  /* No unrolling: handle every sample in the scalar loop. */
  count = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */

  /* Scalar loop: one complex product per iteration.
   * C[2i]   = A[2i]*B[2i]   - A[2i+1]*B[2i+1]
   * C[2i+1] = A[2i]*B[2i+1] + A[2i+1]*B[2i]   */
  while (count > 0U)
  {
    reA = *pSrcA++;
    imA = *pSrcA++;
    reB = *pSrcB++;
    imB = *pSrcB++;

    /* Store the complex product into the destination buffer. */
    *pDst++ = (reA * reB) - (imA * imB);
    *pDst++ = (reA * imB) + (imA * reB);

    count--;
  }

}
301 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
302
303 /**
304 @} end of CmplxByCmplxMult group
305 */
306