• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_mult_cmplx_q31.c
4  * Description:  Q31 complex-by-complex multiplication
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions.h"
30 
31 /**
32   @ingroup groupCmplxMath
33  */
34 
35 /**
36   @addtogroup CmplxByCmplxMult
37   @{
38  */
39 
40 /**
41   @brief         Q31 complex-by-complex multiplication.
42   @param[in]     pSrcA       points to first input vector
43   @param[in]     pSrcB       points to second input vector
44   @param[out]    pDst        points to output vector
45   @param[in]     numSamples  number of samples in each vector
46   @return        none
47 
48   @par           Scaling and Overflow Behavior
49                    The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
50                    Input down scaling is not required.
51  */
52 
53 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
/*
 * Helium (MVE) implementation.
 *
 * One q31x4_t vector carries four q31 values, and the pointers advance by
 * four elements per vector.  Two strategies are used:
 *   - numSamples >= 8 : software-pipelined loop handling two vectors per
 *                       pass, followed by a predicated tail;
 *   - numSamples <  8 : a purely predicated loop.
 * Every result is shifted right by 2 before being stored, which yields the
 * documented 3.29 output format.
 */
void arm_cmplx_mult_cmplx_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        q31_t * pDst,
        uint32_t numSamples)
{
    q31x4_t         vecA0, vecB0;       /* operands of the first vector of a pair  */
    q31x4_t         vecA1, vecB1;       /* operands of the second vector of a pair */
    q31x4_t         vecOut;             /* product accumulator                     */
    int32_t         pipeCnt;            /* passes of the pipelined loop            */
    int32_t         lanesLeft;          /* 32-bit lanes still to be produced       */

    /* One group of four samples is always kept back for the epilogue. */
    pipeCnt = numSamples >> 2;
    pipeCnt -= 1;

    if (pipeCnt <= 0) {
        /*
         * Short input (fewer than 8 samples): predicated loop only.
         * The predicate limits loads and stores to the lanes that remain.
         */
        for (lanesLeft = numSamples * CMPLX_DIM; lanesLeft > 0; lanesLeft -= 4) {
            mve_pred16_t    tailPred = vctp32q(lanesLeft);

            vecA0 = vldrwq_z_s32(pSrcA, tailPred);
            vecB0 = vldrwq_z_s32(pSrcB, tailPred);

            /* Real part, then imaginary part, merged into the same vector. */
            vecOut = vqdmlsdhq_m(vuninitializedq_s32(), vecA0, vecB0, tailPred);
            vecOut = vqdmladhxq_m(vecOut, vecA0, vecB0, tailPred);

            vecOut = vshrq_m(vuninitializedq_s32(), vecOut, 2, tailPred);
            vstrwq_p_s32(pDst, vecOut, tailPred);

            pDst += 4;
            pSrcA += 4;
            pSrcB += 4;
        }
        return;
    }

    /*
     * Pipelined path.  The statement order below is deliberate: each load is
     * interleaved with the arithmetic of the previously loaded vector, so do
     * not regroup loads, multiplies and stores.
     */

    /* Prologue: fetch the first vector of each input ahead of the loop. */
    vecA0 = vld1q(pSrcA);
    vecB0 = vld1q(pSrcB);
    pSrcA += 4;
    pSrcB += 4;

    for (; pipeCnt > 0; pipeCnt--) {

        /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
        vecOut = vqdmlsdhq(vuninitializedq_s32(), vecA0, vecB0);
        vecA1 = vld1q(pSrcA);
        pSrcA += 4;

        /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
        vecOut = vqdmladhxq(vecOut, vecA0, vecB0);
        vecB1 = vld1q(pSrcB);
        pSrcB += 4;

        vst1q(pDst, vshrq(vecOut, 2));
        pDst += 4;

        /* Same computation on the second vector while reloading the first. */
        vecOut = vqdmlsdhq(vuninitializedq_s32(), vecA1, vecB1);
        vecA0 = vld1q(pSrcA);
        pSrcA += 4;

        vecOut = vqdmladhxq(vecOut, vecA1, vecB1);
        vecB0 = vld1q(pSrcB);
        pSrcB += 4;

        vst1q(pDst, vshrq(vecOut, 2));
        pDst += 4;
    }

    /*
     * Epilogue: the last pair of vectors is handled outside the loop (the
     * original notes that this keeps armclang from breaking the software
     * pipeline).  The second vector is loaded WITHOUT advancing the source
     * pointers; the tail below compensates for that.
     */
    vecOut = vqdmlsdhq(vuninitializedq_s32(), vecA0, vecB0);
    vecA1 = vld1q(pSrcA);

    vecOut = vqdmladhxq(vecOut, vecA0, vecB0);
    vecB1 = vld1q(pSrcB);

    vst1q(pDst, vshrq(vecOut, 2));
    pDst += 4;

    vecOut = vqdmlsdhq(vuninitializedq_s32(), vecA1, vecB1);
    vecOut = vqdmladhxq(vecOut, vecA1, vecB1);

    vst1q(pDst, vshrq(vecOut, 2));
    pDst += 4;

    /*
     * Tail: the numSamples & 3 samples that did not fill a group of four.
     * The body runs at least once, and each pass begins by stepping the
     * source pointers past the vector that was consumed before it.
     */
    lanesLeft = CMPLX_DIM * (numSamples & 3);
    do {
        mve_pred16_t    tailPred = vctp32q(lanesLeft);

        pSrcA += 4;
        pSrcB += 4;

        vecA0 = vldrwq_z_s32(pSrcA, tailPred);
        vecB0 = vldrwq_z_s32(pSrcB, tailPred);

        vecOut = vqdmlsdhq_m(vuninitializedq_s32(), vecA0, vecB0, tailPred);
        vecOut = vqdmladhxq_m(vecOut, vecA0, vecB0, tailPred);

        vecOut = vshrq_m(vuninitializedq_s32(), vecOut, 2, tailPred);
        vstrwq_p_s32(pDst, vecOut, tailPred);
        pDst += 4;

        lanesLeft -= 4;
    }
    while (lanesLeft > 0);
}
167 #else
/*
 * Portable C implementation.
 *
 * Inputs and output are interleaved (real, imaginary) q31 pairs.  Each 1.31
 * by 1.31 product is formed in 64 bits (2.62) and shifted right by 33, which
 * leaves 29 fractional bits; adding or subtracting two such terms gives the
 * documented 3.29 result.  The four inputs of a sample are always read
 * before its two outputs are written.
 */
void arm_cmplx_mult_cmplx_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        q31_t * pDst,
        uint32_t numSamples)
{
  uint32_t remaining;                            /* Samples left for the one-at-a-time loop */
  q31_t reA, imA, reB, imB;                      /* Real/imaginary parts of the current operands */

#if defined (ARM_MATH_LOOPUNROLL)

  uint32_t quads;                                /* Groups of four samples */

  /* Unrolled section: four complex products per pass */
  for (quads = numSamples >> 2U; quads > 0U; quads--)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    /* Sample 0 of the group */
    reA = pSrcA[0];
    imA = pSrcA[1];
    reB = pSrcB[0];
    imB = pSrcB[1];
    pDst[0] = (q31_t) ( (((q63_t) reA * reB) >> 33) - (((q63_t) imA * imB) >> 33) );
    pDst[1] = (q31_t) ( (((q63_t) reA * imB) >> 33) + (((q63_t) imA * reB) >> 33) );

    /* Sample 1 of the group */
    reA = pSrcA[2];
    imA = pSrcA[3];
    reB = pSrcB[2];
    imB = pSrcB[3];
    pDst[2] = (q31_t) ( (((q63_t) reA * reB) >> 33) - (((q63_t) imA * imB) >> 33) );
    pDst[3] = (q31_t) ( (((q63_t) reA * imB) >> 33) + (((q63_t) imA * reB) >> 33) );

    /* Sample 2 of the group */
    reA = pSrcA[4];
    imA = pSrcA[5];
    reB = pSrcB[4];
    imB = pSrcB[5];
    pDst[4] = (q31_t) ( (((q63_t) reA * reB) >> 33) - (((q63_t) imA * imB) >> 33) );
    pDst[5] = (q31_t) ( (((q63_t) reA * imB) >> 33) + (((q63_t) imA * reB) >> 33) );

    /* Sample 3 of the group */
    reA = pSrcA[6];
    imA = pSrcA[7];
    reB = pSrcB[6];
    imB = pSrcB[7];
    pDst[6] = (q31_t) ( (((q63_t) reA * reB) >> 33) - (((q63_t) imA * imB) >> 33) );
    pDst[7] = (q31_t) ( (((q63_t) reA * imB) >> 33) + (((q63_t) imA * reB) >> 33) );

    /* Step every pointer past the four samples just handled */
    pSrcA += 8;
    pSrcB += 8;
    pDst  += 8;
  }

  /* Whatever did not fill a group of four is finished below */
  remaining = numSamples & 0x3U;

#else

  /* No unrolling: every sample goes through the loop below */
  remaining = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  for (; remaining > 0U; remaining--)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    reA = pSrcA[0];
    imA = pSrcA[1];
    reB = pSrcB[0];
    imB = pSrcB[1];

    /* Result is written in 3.29 format */
    pDst[0] = (q31_t) ( (((q63_t) reA * reB) >> 33) - (((q63_t) imA * imB) >> 33) );
    pDst[1] = (q31_t) ( (((q63_t) reA * imB) >> 33) + (((q63_t) imA * reB) >> 33) );

    pSrcA += 2;
    pSrcB += 2;
    pDst  += 2;
  }

}
249 #endif /* defined(ARM_MATH_MVEI) */
250 
251 /**
252   @} end of CmplxByCmplxMult group
253  */
254