• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_mult_cmplx_f32.c
4  * Description:  Floating-point complex-by-complex multiplication
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions.h"
30 
31 /**
32   @ingroup groupCmplxMath
33  */
34 
35 /**
36   @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
37 
38   Multiplies a complex vector by another complex vector and generates a complex result.
39   The data in the complex arrays is stored in an interleaved fashion
40   (real, imag, real, imag, ...).
41   The parameter <code>numSamples</code> represents the number of complex
42   samples processed.  The complex arrays have a total of <code>2*numSamples</code>
43   real values.
44 
  The following algorithm is used:
46 
47   <pre>
48   for (n = 0; n < numSamples; n++) {
49       pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
50       pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
51   }
52   </pre>
53 
54   There are separate functions for floating-point, Q15, and Q31 data types.
55  */
56 
57 /**
58   @addtogroup CmplxByCmplxMult
59   @{
60  */
61 
62 /**
63   @brief         Floating-point complex-by-complex multiplication.
64   @param[in]     pSrcA       points to first input vector
65   @param[in]     pSrcB       points to second input vector
66   @param[out]    pDst        points to output vector
67   @param[in]     numSamples  number of samples in each vector
68   @return        none
69  */
70 
71 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
72 
/* Helium/MVE implementation.
 *
 * Each f32x4 vector holds two interleaved complex samples (re, im, re, im).
 * A full complex product is formed by the intrinsic pair:
 *   vcmulq(a, b)            -> (aRe*bRe,  aRe*bIm)
 *   vcmlaq_rot90(acc, a, b) -> acc + (-aIm*bIm, aIm*bRe)
 * The main loop is hand software-pipelined: loads for the next stage are
 * interleaved with the arithmetic of the current stage, so the statement
 * order below is intentional.
 */
void arm_cmplx_mult_cmplx_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        float32_t * pDst,
        uint32_t numSamples)
{
    int32_t         blkCnt;
    f32x4_t         vecSrcA, vecSrcB;
    f32x4_t         vecSrcC, vecSrcD;
    f32x4_t         vec_acc;

    /* One count = 4 complex samples = two f32x4 vectors. The last count is
     * peeled out of the loop (blkCnt -= 1) and handled after it. */
    blkCnt = numSamples >> 2;
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        /* Prologue: preload the first vector pair before entering the loop. */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        while (blkCnt > 0) {
            /* Stage 1: multiply the pair loaded previously while fetching
             * the next pair into vecSrcC/vecSrcD. */
            vec_acc = vcmulq(vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 4;
            vst1q(pDst, vec_acc);
            pDst += 4;

            /* Stage 2: multiply vecSrcC/vecSrcD while prefetching the pair
             * for the next iteration back into vecSrcA/vecSrcB. */
            vec_acc = vcmulq(vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;
            vst1q(pDst, vec_acc);
            pDst += 4;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        /* Peeled epilogue: vecSrcA/vecSrcB already hold the next-to-last
         * vector pair; the last pair is loaded here WITHOUT advancing the
         * pointers, so pSrcA/pSrcB end up lagging one vector (4 floats)
         * behind the consumed data — the tail loop compensates below. */
        vec_acc = vcmulq(vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);
        vst1q(pDst, vec_acc);
        pDst += 4;

        vec_acc = vcmulq(vecSrcC, vecSrcD);
        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
        vst1q(pDst, vec_acc);
        pDst += 4;

        /*
         * tail
         */
        /* Remaining 0..3 complex samples, counted in float elements and
         * handled with tail predication (vctp32q masks the extra lanes). */
        blkCnt = CMPLX_DIM * (numSamples & 3);
        while (blkCnt > 0) {
            mve_pred16_t    p = vctp32q(blkCnt);
            /* Advance first: see the pointer-lag note in the epilogue. */
            pSrcA += 4;
            pSrcB += 4;

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);
            vec_acc = vcmulq_m(vuninitializedq_f32(),vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);

            vstrwq_p_f32(pDst, vec_acc, p);
            pDst += 4;

            blkCnt -= 4;
        }
    } else {
        /* small vector */
        /* numSamples < 8: a single fully-predicated do/while covers the
         * whole input, 4 floats (2 complex samples) per pass. */
        blkCnt = numSamples * CMPLX_DIM;
        vec_acc = vdupq_n_f32(0.0f);

        do {
            mve_pred16_t    p = vctp32q(blkCnt);

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);

            vec_acc = vcmulq_m(vuninitializedq_f32(),vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
            vstrwq_p_f32(pDst, vec_acc, p);
            pDst += 4;

            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 4;
            pSrcB += 4;
            blkCnt -= 4;
        }
        while (blkCnt > 0);
    }

}
180 
181 #else
/* Scalar / NEON / loop-unrolled implementation.
 *
 * For every complex sample:
 *   Re{C} = Re{A}*Re{B} - Im{A}*Im{B}
 *   Im{C} = Re{A}*Im{B} + Im{A}*Re{B}
 */
void arm_cmplx_mult_cmplx_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        float32_t * pDst,
        uint32_t numSamples)
{
    uint32_t cnt;                                  /* Loop counter */
    float32_t reA, imA, reB, imB;                  /* One complex operand from each source */

#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    float32x4x2_t inA, inB;                        /* De-interleaved lanes: .val[0]=real, .val[1]=imag */
    float32x4x2_t out;

    /* Vector body: four complex outputs per iteration */
    cnt = numSamples >> 2U;

    while (cnt > 0U)
    {
        /* vld2 splits the interleaved (re, im) stream into two lanes */
        inA = vld2q_f32(pSrcA);
        inB = vld2q_f32(pSrcB);
        pSrcA += 8;
        pSrcB += 8;

        /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
        out.val[0] = vmlsq_f32(vmulq_f32(inA.val[0], inB.val[0]), inA.val[1], inB.val[1]);

        /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
        out.val[1] = vmlaq_f32(vmulq_f32(inA.val[0], inB.val[1]), inA.val[1], inB.val[0]);

        /* vst2 re-interleaves on store */
        vst2q_f32(pDst, out);
        pDst += 8;

        cnt--;
    }

    /* 0..3 complex samples remain for the scalar tail loop */
    cnt = numSamples & 3;

#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Loop unrolling: four outputs per pass, written as a fixed-trip inner loop */
  cnt = numSamples >> 2U;

  while (cnt > 0U)
  {
    uint32_t k;

    for (k = 0U; k < 4U; k++)
    {
      reA = *pSrcA++;
      imA = *pSrcA++;
      reB = *pSrcB++;
      imB = *pSrcB++;

      /* Store real then imaginary part of the product */
      *pDst++ = (reA * reB) - (imA * imB);
      *pDst++ = (reA * imB) + (imA * reB);
    }

    /* Decrement loop counter */
    cnt--;
  }

  /* Loop unrolling: compute remaining outputs */
  cnt = numSamples % 0x4U;

#else

  /* No unrolling: the tail loop below handles every sample */
  cnt = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */

  /* Scalar tail: one complex output per iteration */
  while (cnt > 0U)
  {
    reA = *pSrcA++;
    imA = *pSrcA++;
    reB = *pSrcB++;
    imB = *pSrcB++;

    /* Store result in destination buffer */
    *pDst++ = (reA * reB) - (imA * imB);
    *pDst++ = (reA * imB) + (imA * reB);

    cnt--;
  }

}
301 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
302 
303 /**
304   @} end of CmplxByCmplxMult group
305  */
306