/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mult_cmplx_f16.c
 * Description:  Floating-point complex-by-complex multiplication
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/complex_math_functions_f16.h"

#if defined(ARM_FLOAT16_SUPPORTED)

/**
  @ingroup groupCmplxMath
 */

/**
  @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication

  Multiplies a complex vector by another complex vector and generates a complex result.
  The data in the complex arrays is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  The parameter <code>numSamples</code> represents the number of complex
  samples processed. The complex arrays have a total of <code>2*numSamples</code>
  real values.

  The underlying algorithm is:

  <pre>
  for (n = 0; n < numSamples; n++) {
      pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
      pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
  }
  </pre>

  There are separate functions for floating-point, Q15, and Q31 data types.
 */

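/*
  As a quick sanity check of the formulas above (illustration only): with
  A[0] = 1 + 2i and B[0] = 3 + 4i, i.e. pSrcA = {1, 2} and pSrcB = {3, 4},

  <pre>
  pDst[0] = (1 * 3) - (2 * 4) = -5;   // real part
  pDst[1] = (1 * 4) + (2 * 3) = 10;   // imaginary part
  </pre>

  so pDst holds {-5, 10}, which is -5 + 10i.
 */
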
/**
  @addtogroup CmplxByCmplxMult
  @{
 */

/**
  @brief         Floating-point complex-by-complex multiplication.
  @param[in]     pSrcA       points to first input vector
  @param[in]     pSrcB       points to second input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of complex samples in each vector
  @return        none
 */

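/*
  A minimal usage sketch (illustration only; the buffer names, sizes and the
  surrounding setup are hypothetical and not part of this file):

  <pre>
  #define NUM_SAMPLES 4
  float16_t srcA[2 * NUM_SAMPLES];   // interleaved: re, im, re, im, ...
  float16_t srcB[2 * NUM_SAMPLES];
  float16_t dst[2 * NUM_SAMPLES];

  // ... fill srcA and srcB ...
  arm_cmplx_mult_cmplx_f16(srcA, srcB, dst, NUM_SAMPLES);
  // dst[2*n] and dst[2*n+1] now hold the real and imaginary parts of
  // srcA[n] * srcB[n] for n = 0 .. NUM_SAMPLES-1.
  </pre>
 */
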
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)

void arm_cmplx_mult_cmplx_f16(
    const float16_t * pSrcA,
    const float16_t * pSrcB,
    float16_t * pDst,
    uint32_t numSamples)
{
    int32_t blkCnt;
    f16x8_t vecSrcA, vecSrcB;
    f16x8_t vecSrcC, vecSrcD;
    f16x8_t vec_acc;

    blkCnt = (numSamples >> 3);
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* Pre-load the first block; this should give the compiler more freedom
           to generate stall-free code. */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 8;
        pSrcB += 8;

        while (blkCnt > 0) {
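            /*
             * Per complex pair, vcmulq forms { a.re * b.re, a.re * b.im } and
             * vcmlaq_rot90 then accumulates { -a.im * b.im, a.im * b.re },
             * which together yield the full complex product. The loads of the
             * next block are interleaved with the multiplies.
             */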
            vec_acc = vcmulq(vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 8;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 8;
            vst1q(pDst, vec_acc);
            pDst += 8;

            vec_acc = vcmulq(vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 8;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 8;
            vst1q(pDst, vec_acc);
            pDst += 8;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* Process the last block outside the loop to keep armclang from
           breaking the software pipeline. */
        vec_acc = vcmulq(vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);
        vst1q(pDst, vec_acc);
        pDst += 8;

        vec_acc = vcmulq(vecSrcC, vecSrcD);
        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
        vst1q(pDst, vec_acc);
        pDst += 8;

        /*
         * Tail: process the remaining (numSamples & 7) complex samples
         * with predicated vector operations
         */
        blkCnt = CMPLX_DIM * (numSamples & 7);
        while (blkCnt > 0) {
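            /*
             * vctp16q(blkCnt) builds a tail predicate that enables only the
             * remaining 16-bit lanes, so the final partial vector is loaded,
             * multiplied and stored without touching memory past the buffers.
             */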
            mve_pred16_t p = vctp16q(blkCnt);
            pSrcA += 8;
            pSrcB += 8;

            vecSrcA = vldrhq_z_f16(pSrcA, p);
            vecSrcB = vldrhq_z_f16(pSrcB, p);
            vec_acc = vcmulq_m(vuninitializedq_f16(), vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);

            vstrhq_p_f16(pDst, vec_acc, p);
            pDst += 8;

            blkCnt -= 8;
        }
    } else {
        /* Small vector: too short for the software-pipelined main loop,
           so process it entirely with predicated vector operations. */
        blkCnt = numSamples * CMPLX_DIM;

        do {
            mve_pred16_t p = vctp16q(blkCnt);

            vecSrcA = vldrhq_z_f16(pSrcA, p);
            vecSrcB = vldrhq_z_f16(pSrcB, p);

            vec_acc = vcmulq_m(vuninitializedq_f16(), vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
            vstrhq_p_f16(pDst, vec_acc, p);
            pDst += 8;

            /*
             * Advance the source pointers and decrement the blkCnt loop counter
             */
            pSrcA += 8;
            pSrcB += 8;
            blkCnt -= 8;
        }
        while (blkCnt > 0);
    }

}

#else

void arm_cmplx_mult_cmplx_f16(
    const float16_t * pSrcA,
    const float16_t * pSrcB,
    float16_t * pDst,
    uint32_t numSamples)
{
  uint32_t blkCnt;                               /* Loop counter */
  _Float16 a, b, c, d;                           /* Temporary variables to store real and imaginary values */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    /* store result in destination buffer. */
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;

    /* store result in destination buffer. */
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    /* Decrement loop counter */
    blkCnt--;
  }

}
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */

/**
  @} end of CmplxByCmplxMult group
 */

#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */