• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_mult_cmplx_f16.c
4  * Description:  Floating-point complex-by-complex multiplication
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 /**
34   @ingroup groupCmplxMath
35  */
36 
37 /**
38   @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
39 
40   Multiplies a complex vector by another complex vector and generates a complex result.
41   The data in the complex arrays is stored in an interleaved fashion
42   (real, imag, real, imag, ...).
43   The parameter <code>numSamples</code> represents the number of complex
44   samples processed.  The complex arrays have a total of <code>2*numSamples</code>
45   real values.
46 
47   The underlying algorithm is used:
48 
49   <pre>
50   for (n = 0; n < numSamples; n++) {
51       pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
52       pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
53   }
54   </pre>
55 
56   There are separate functions for floating-point, Q15, and Q31 data types.
57  */
58 
59 /**
60   @addtogroup CmplxByCmplxMult
61   @{
62  */
63 
64 /**
65   @brief         Floating-point complex-by-complex multiplication.
66   @param[in]     pSrcA       points to first input vector
67   @param[in]     pSrcB       points to second input vector
68   @param[out]    pDst        points to output vector
69   @param[in]     numSamples  number of samples in each vector
70   @return        none
71  */
72 
73 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
74 
arm_cmplx_mult_cmplx_f16(const float16_t * pSrcA,const float16_t * pSrcB,float16_t * pDst,uint32_t numSamples)75 void arm_cmplx_mult_cmplx_f16(
76   const float16_t * pSrcA,
77   const float16_t * pSrcB,
78         float16_t * pDst,
79         uint32_t numSamples)
80 {
81      int32_t         blkCnt;
82     f16x8_t         vecSrcA, vecSrcB;
83     f16x8_t         vecSrcC, vecSrcD;
84     f16x8_t         vec_acc;
85 
86     blkCnt = (numSamples >> 3);
87     blkCnt -= 1;
88     if (blkCnt > 0) {
89         /* should give more freedom to generate stall free code */
90         vecSrcA = vld1q(pSrcA);
91         vecSrcB = vld1q(pSrcB);
92         pSrcA += 8;
93         pSrcB += 8;
94 
95         while (blkCnt > 0) {
96             vec_acc = vcmulq(vecSrcA, vecSrcB);
97             vecSrcC = vld1q(pSrcA);
98             pSrcA += 8;
99 
100             vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
101             vecSrcD = vld1q(pSrcB);
102             pSrcB += 8;
103             vst1q(pDst, vec_acc);
104             pDst += 8;
105 
106             vec_acc = vcmulq(vecSrcC, vecSrcD);
107             vecSrcA = vld1q(pSrcA);
108             pSrcA += 8;
109 
110             vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
111             vecSrcB = vld1q(pSrcB);
112             pSrcB += 8;
113             vst1q(pDst, vec_acc);
114             pDst += 8;
115             /*
116              * Decrement the blockSize loop counter
117              */
118             blkCnt--;
119         }
120 
121         /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
122         vec_acc = vcmulq(vecSrcA, vecSrcB);
123         vecSrcC = vld1q(pSrcA);
124 
125         vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
126         vecSrcD = vld1q(pSrcB);
127         vst1q(pDst, vec_acc);
128         pDst += 8;
129 
130         vec_acc = vcmulq(vecSrcC, vecSrcD);
131         vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
132         vst1q(pDst, vec_acc);
133         pDst += 8;
134 
135         /*
136          * tail
137          */
138         blkCnt = CMPLX_DIM * (numSamples & 7);
139         while (blkCnt > 0) {
140             mve_pred16_t    p = vctp16q(blkCnt);
141             pSrcA += 8;
142             pSrcB += 8;
143 
144             vecSrcA = vldrhq_z_f16(pSrcA, p);
145             vecSrcB = vldrhq_z_f16(pSrcB, p);
146             vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
147             vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
148 
149             vstrhq_p_f16(pDst, vec_acc, p);
150             pDst += 8;
151 
152             blkCnt -= 8;
153         }
154     } else {
155         /* small vector */
156         blkCnt = numSamples * CMPLX_DIM;
157 
158         do {
159             mve_pred16_t    p = vctp16q(blkCnt);
160 
161             vecSrcA = vldrhq_z_f16(pSrcA, p);
162             vecSrcB = vldrhq_z_f16(pSrcB, p);
163 
164             vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
165             vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
166             vstrhq_p_f16(pDst, vec_acc, p);
167             pDst += 8;
168 
169             /*
170              * Decrement the blkCnt loop counter
171              * Advance vector source and destination pointers
172              */
173             pSrcA += 8;
174             pSrcB += 8;
175             blkCnt -= 8;
176         }
177         while (blkCnt > 0);
178     }
179 
180 }
181 
182 
183 #else
arm_cmplx_mult_cmplx_f16(const float16_t * pSrcA,const float16_t * pSrcB,float16_t * pDst,uint32_t numSamples)184 void arm_cmplx_mult_cmplx_f16(
185   const float16_t * pSrcA,
186   const float16_t * pSrcB,
187         float16_t * pDst,
188         uint32_t numSamples)
189 {
190     uint32_t blkCnt;                               /* Loop counter */
191     _Float16 a, b, c, d;  /* Temporary variables to store real and imaginary values */
192 
193 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
194 
195   /* Loop unrolling: Compute 4 outputs at a time */
196   blkCnt = numSamples >> 2U;
197 
198   while (blkCnt > 0U)
199   {
200     /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
201     /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
202 
203     a = *pSrcA++;
204     b = *pSrcA++;
205     c = *pSrcB++;
206     d = *pSrcB++;
207     /* store result in destination buffer. */
208     *pDst++ = (a * c) - (b * d);
209     *pDst++ = (a * d) + (b * c);
210 
211     a = *pSrcA++;
212     b = *pSrcA++;
213     c = *pSrcB++;
214     d = *pSrcB++;
215     *pDst++ = (a * c) - (b * d);
216     *pDst++ = (a * d) + (b * c);
217 
218     a = *pSrcA++;
219     b = *pSrcA++;
220     c = *pSrcB++;
221     d = *pSrcB++;
222     *pDst++ = (a * c) - (b * d);
223     *pDst++ = (a * d) + (b * c);
224 
225     a = *pSrcA++;
226     b = *pSrcA++;
227     c = *pSrcB++;
228     d = *pSrcB++;
229     *pDst++ = (a * c) - (b * d);
230     *pDst++ = (a * d) + (b * c);
231 
232     /* Decrement loop counter */
233     blkCnt--;
234   }
235 
236   /* Loop unrolling: Compute remaining outputs */
237   blkCnt = numSamples % 0x4U;
238 
239 #else
240 
241   /* Initialize blkCnt with number of samples */
242   blkCnt = numSamples;
243 
244 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
245 
246   while (blkCnt > 0U)
247   {
248     /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
249     /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
250 
251     a = *pSrcA++;
252     b = *pSrcA++;
253     c = *pSrcB++;
254     d = *pSrcB++;
255 
256     /* store result in destination buffer. */
257     *pDst++ = (a * c) - (b * d);
258     *pDst++ = (a * d) + (b * c);
259 
260     /* Decrement loop counter */
261     blkCnt--;
262   }
263 
264 }
265 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
266 
267 /**
268   @} end of CmplxByCmplxMult group
269  */
270 
271 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */