1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cmplx_mult_cmplx_q31.c
4 * Description: Q31 complex-by-complex multiplication
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/complex_math_functions.h"
30
31 /**
32 @ingroup groupCmplxMath
33 */
34
35 /**
36 @addtogroup CmplxByCmplxMult
37 @{
38 */
39
40 /**
41 @brief Q31 complex-by-complex multiplication.
42 @param[in] pSrcA points to first input vector
43 @param[in] pSrcB points to second input vector
44 @param[out] pDst points to output vector
45 @param[in] numSamples number of samples in each vector
46 @return none
47
48 @par Scaling and Overflow Behavior
49 The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
50 Input down scaling is not required.
51 */
52
53 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mult_cmplx_q31(
    const q31_t * pSrcA,
    const q31_t * pSrcB,
    q31_t * pDst,
    uint32_t numSamples)
{
    int32_t blkCnt;
    q31x4_t vecSrcA, vecSrcB;   /* vectors in flight: 4 q31 lanes = 2 complex samples each */
    q31x4_t vecSrcC, vecSrcD;   /* next vectors, loaded early to hide load latency */
    q31x4_t vecDst;

    /* Each main-loop pass consumes 8 q31 values (4 complex samples) per source. */
    blkCnt = numSamples >> 2;
    /* Peel one group: its loads form the prologue below and its arithmetic the
       epilogue after the loop (manual software pipelining). */
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        while (blkCnt > 0) {

            /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
            vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 4;

            /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
            vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 4;

            /* The doubling multiply-accumulate leaves an extra scaling on the
               result; >> 2 brings it to the documented 3.29 output format. */
            vst1q(pDst, vshrq(vecDst, 2));
            pDst += 4;

            /* Same real/imaginary computation for the second vector pair,
               while prefetching the next iteration's A/B vectors. */
            vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 4;

            vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;

            vst1q(pDst, vshrq(vecDst, 2));
            pDst += 4;

            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);

        vst1q(pDst, vshrq(vecDst, 2));
        pDst += 4;

        vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
        vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);

        vst1q(pDst, vshrq(vecDst, 2));
        pDst += 4;

        /*
         * tail: 0..3 leftover complex samples (0..6 q31 elements), handled
         * with tail predication. The do-while runs once even with no
         * remainder; vctp32q(0) then yields an all-false predicate so that
         * pass performs no memory accesses.
         */
        blkCnt = CMPLX_DIM * (numSamples & 3);
        do {
            mve_pred16_t p = vctp32q(blkCnt);

            /* The epilogue above consumed its final vectors without advancing
               the source pointers, so step past that data before loading. */
            pSrcA += 4;
            pSrcB += 4;

            vecSrcA = vldrwq_z_s32(pSrcA, p);
            vecSrcB = vldrwq_z_s32(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);

            vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
            vstrwq_p_s32(pDst, vecDst, p);
            pDst += 4;

            blkCnt -= 4;
        }
        while ((int32_t) blkCnt > 0);
    } else {
        /* Short input (fewer than 8 complex samples): fully predicated loop,
           up to 2 complex samples (4 q31 lanes) per pass. */
        blkCnt = numSamples * CMPLX_DIM;
        while (blkCnt > 0) {
            mve_pred16_t p = vctp32q(blkCnt);

            vecSrcA = vldrwq_z_s32(pSrcA, p);
            vecSrcB = vldrwq_z_s32(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);

            /* Scale down to 3.29 output format before the predicated store. */
            vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
            vstrwq_p_s32(pDst, vecDst, p);

            pDst += 4;
            pSrcA += 4;
            pSrcB += 4;

            blkCnt -= 4;
        }
    }
}
167 #else
/* Multiply one Q31 complex sample pair (A, B) and append the 3.29-formatted
   product to the destination stream, advancing all three cursors.
   Each 1.31 x 1.31 product is a 2.62 value; shifting it right by 33 leaves a
   3.29 value, so the sum/difference below is already in the output format and
   cannot overflow the q31_t destination. */
static inline void arm_cmplx_mult_one_q31(
  const q31_t **ppA,
  const q31_t **ppB,
        q31_t **ppDst)
{
  q31_t ar = *(*ppA)++;   /* real(A) */
  q31_t ai = *(*ppA)++;   /* imag(A) */
  q31_t br = *(*ppB)++;   /* real(B) */
  q31_t bi = *(*ppB)++;   /* imag(B) */

  /* C[2i]   = A[2i] * B[2i]   - A[2i+1] * B[2i+1] */
  *(*ppDst)++ = (q31_t) ( (((q63_t) ar * br) >> 33) - (((q63_t) ai * bi) >> 33) );
  /* C[2i+1] = A[2i] * B[2i+1] + A[2i+1] * B[2i]   */
  *(*ppDst)++ = (q31_t) ( (((q63_t) ar * bi) >> 33) + (((q63_t) ai * br) >> 33) );
}

void arm_cmplx_mult_cmplx_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        q31_t * pDst,
        uint32_t numSamples)
{
  uint32_t remaining;     /* complex samples still to be processed */

#if defined (ARM_MATH_LOOPUNROLL)

  /* Manual unrolling: four complex products per pass. */
  for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
  {
    arm_cmplx_mult_one_q31(&pSrcA, &pSrcB, &pDst);
    arm_cmplx_mult_one_q31(&pSrcA, &pSrcB, &pDst);
    arm_cmplx_mult_one_q31(&pSrcA, &pSrcB, &pDst);
    arm_cmplx_mult_one_q31(&pSrcA, &pSrcB, &pDst);
  }

  /* Let the residual loop below handle the 0..3 leftover samples. */
  remaining = numSamples & 0x3U;

#else

  /* No unrolling requested: every sample goes through the loop below. */
  remaining = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (remaining > 0U)
  {
    arm_cmplx_mult_one_q31(&pSrcA, &pSrcB, &pDst);
    remaining--;
  }

}
249 #endif /* defined(ARM_MATH_MVEI) */
250
251 /**
252 @} end of CmplxByCmplxMult group
253 */
254