• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_dot_prod_q15.c
4  * Description:  Processing function for the Q15 Complex Dot product
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions.h"
30 
31 /**
32   @ingroup groupCmplxMath
33  */
34 
35 /**
36   @addtogroup cmplx_dot_prod
37   @{
38  */
39 
40 /**
41   @brief         Q15 complex dot product.
42   @param[in]     pSrcA       points to the first input vector
43   @param[in]     pSrcB       points to the second input vector
44   @param[in]     numSamples  number of samples in each vector
45   @param[out]    realResult  real part of the result returned here
46   @param[out]    imagResult  imaginary part of the result returned her
47   @return        none
48 
49   @par           Scaling and Overflow Behavior
50                    The function is implemented using an internal 64-bit accumulator.
51                    The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
52                    These are accumulated in a 64-bit accumulator with 34.30 precision.
53                    As a final step, the accumulators are converted to 8.24 format.
54                    The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
55  */
56 
57 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_cmplx_dot_prod_q15(const q15_t * pSrcA,const q15_t * pSrcB,uint32_t numSamples,q31_t * realResult,q31_t * imagResult)58 void arm_cmplx_dot_prod_q15(
59   const q15_t * pSrcA,
60   const q15_t * pSrcB,
61         uint32_t numSamples,
62         q31_t * realResult,
63         q31_t * imagResult)
64 {
65     int32_t         blkCnt;
66     q63_t           accReal = 0LL;
67     q63_t           accImag = 0LL;
68     q15x8_t         vecSrcA, vecSrcB;
69     q15x8_t         vecSrcC, vecSrcD;
70 
71     blkCnt = (numSamples >> 3);
72     blkCnt -= 1;
73     if (blkCnt > 0) {
74         /* should give more freedom to generate stall free code */
75         vecSrcA = vld1q(pSrcA);
76         vecSrcB = vld1q(pSrcB);
77         pSrcA += 8;
78         pSrcB += 8;
79 
80         while (blkCnt > 0) {
81 
82             accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
83             vecSrcC = vld1q(pSrcA);
84             pSrcA += 8;
85 
86             accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
87             vecSrcD = vld1q(pSrcB);
88             pSrcB += 8;
89 
90             accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
91             vecSrcA = vld1q(pSrcA);
92             pSrcA += 8;
93 
94             accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
95             vecSrcB = vld1q(pSrcB);
96             pSrcB += 8;
97             /*
98              * Decrement the blockSize loop counter
99              */
100             blkCnt--;
101         }
102 
103         /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
104         accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
105         vecSrcC = vld1q(pSrcA);
106 
107         accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
108         vecSrcD = vld1q(pSrcB);
109 
110         accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
111         vecSrcA = vld1q(pSrcA);
112 
113         accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
114         vecSrcB = vld1q(pSrcB);
115 
116         /*
117          * tail
118          */
119         blkCnt = CMPLX_DIM * (numSamples & 7);
120         do {
121             mve_pred16_t    p = vctp16q(blkCnt);
122 
123             pSrcA += 8;
124             pSrcB += 8;
125 
126             vecSrcA = vldrhq_z_s16(pSrcA, p);
127             vecSrcB = vldrhq_z_s16(pSrcB, p);
128 
129             accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
130             accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);
131 
132             blkCnt -= 8;
133         }
134         while ((int32_t) blkCnt > 0);
135     } else {
136         blkCnt = numSamples * CMPLX_DIM;
137         while (blkCnt > 0) {
138             mve_pred16_t    p = vctp16q(blkCnt);
139 
140             vecSrcA = vldrhq_z_s16(pSrcA, p);
141             vecSrcB = vldrhq_z_s16(pSrcB, p);
142 
143             accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
144             accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);
145 
146             /*
147              * Decrement the blkCnt loop counter
148              * Advance vector source and destination pointers
149              */
150             pSrcA += 8;
151             pSrcB += 8;
152             blkCnt -= 8;
153         }
154     }
155     *realResult = asrl(accReal, (14 - 8));
156     *imagResult = asrl(accImag, (14 - 8));
157 }
158 #else
arm_cmplx_dot_prod_q15(const q15_t * pSrcA,const q15_t * pSrcB,uint32_t numSamples,q31_t * realResult,q31_t * imagResult)159 void arm_cmplx_dot_prod_q15(
160   const q15_t * pSrcA,
161   const q15_t * pSrcB,
162         uint32_t numSamples,
163         q31_t * realResult,
164         q31_t * imagResult)
165 {
166         uint32_t blkCnt;                               /* Loop counter */
167         q63_t real_sum = 0, imag_sum = 0;              /* Temporary result variables */
168         q15_t a0,b0,c0,d0;
169 
170 #if defined (ARM_MATH_LOOPUNROLL)
171   /* Loop unrolling: Compute 4 outputs at a time */
172   blkCnt = numSamples >> 2U;
173 
174   while (blkCnt > 0U)
175   {
176     a0 = *pSrcA++;
177     b0 = *pSrcA++;
178     c0 = *pSrcB++;
179     d0 = *pSrcB++;
180 
181     real_sum += (q31_t)a0 * c0;
182     imag_sum += (q31_t)a0 * d0;
183     real_sum -= (q31_t)b0 * d0;
184     imag_sum += (q31_t)b0 * c0;
185 
186     a0 = *pSrcA++;
187     b0 = *pSrcA++;
188     c0 = *pSrcB++;
189     d0 = *pSrcB++;
190 
191     real_sum += (q31_t)a0 * c0;
192     imag_sum += (q31_t)a0 * d0;
193     real_sum -= (q31_t)b0 * d0;
194     imag_sum += (q31_t)b0 * c0;
195 
196     a0 = *pSrcA++;
197     b0 = *pSrcA++;
198     c0 = *pSrcB++;
199     d0 = *pSrcB++;
200 
201     real_sum += (q31_t)a0 * c0;
202     imag_sum += (q31_t)a0 * d0;
203     real_sum -= (q31_t)b0 * d0;
204     imag_sum += (q31_t)b0 * c0;
205 
206     a0 = *pSrcA++;
207     b0 = *pSrcA++;
208     c0 = *pSrcB++;
209     d0 = *pSrcB++;
210 
211     real_sum += (q31_t)a0 * c0;
212     imag_sum += (q31_t)a0 * d0;
213     real_sum -= (q31_t)b0 * d0;
214     imag_sum += (q31_t)b0 * c0;
215 
216     /* Decrement loop counter */
217     blkCnt--;
218   }
219 
220   /* Loop unrolling: Compute remaining outputs */
221   blkCnt = numSamples % 0x4U;
222 
223 #else
224 
225   /* Initialize blkCnt with number of samples */
226   blkCnt = numSamples;
227 
228 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
229 
230   while (blkCnt > 0U)
231   {
232     a0 = *pSrcA++;
233     b0 = *pSrcA++;
234     c0 = *pSrcB++;
235     d0 = *pSrcB++;
236 
237     real_sum += (q31_t)a0 * c0;
238     imag_sum += (q31_t)a0 * d0;
239     real_sum -= (q31_t)b0 * d0;
240     imag_sum += (q31_t)b0 * c0;
241 
242     /* Decrement loop counter */
243     blkCnt--;
244   }
245 
246   /* Store real and imaginary result in 8.24 format  */
247   /* Convert real data in 34.30 to 8.24 by 6 right shifts */
248   *realResult = (q31_t) (real_sum >> 6);
249   /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
250   *imagResult = (q31_t) (imag_sum >> 6);
251 }
252 #endif /* defined(ARM_MATH_MVEI) */
253 
254 /**
255   @} end of cmplx_dot_prod group
256  */
257