/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cfft_q15.c
 * Description:  Combined radix decimation-in-frequency Q15 CFFT processing function
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/transform_functions.h"

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_vec_fft.h"

static void _arm_radix4_butterfly_q15_mve(
    const arm_cfft_instance_q15 * S,
    q15_t *pSrc,
    uint32_t fftLen)
{
    q15x8_t vecTmp0, vecTmp1;
    q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    q15x8_t vecA, vecB, vecC, vecD;
    q15x8_t vecW;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q15_t *), (4 - 16) * sizeof(q15_t *),
        (8 - 16) * sizeof(q15_t *), (12 - 16) * sizeof(q15_t *)
    };
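
    /*
     * Worked sketch of the byte offsets used by the last-stage gathers below
     * (illustrative only; assumes a 32-bit target, so sizeof(q15_t *) == 4 ==
     * CMPLX_DIM * sizeof(q15_t), i.e. one 32-bit lane per complex q15 sample):
     *
     *   strides                    = { -64, -48, -32, -16 } bytes
     *   after the +64 write-back:    base = pSrc + { 0, 16, 32, 48 } bytes
     *                                     = complex samples { 0, 4,  8, 12 }  -> vecA
     *   base + 4 bytes                    = complex samples { 1, 5,  9, 13 }  -> vecB
     *   base + 8 bytes                    = complex samples { 2, 6, 10, 14 }  -> vecC
     *   base + 12 bytes                   = complex samples { 3, 7, 11, 15 }  -> vecD
     *
     * i.e. four radix-4 butterflies on consecutive quadruples per iteration.
     */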

    /*
     * Process first stages
     * Each middle stage scales the input down by 4 in total
     * (the halving add/sub operations each contribute a divide by 2)
     */
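    /*
     * Reference form of the radix-4 DIF butterfly computed below (a sketch;
     * W1, W2, W3 are the per-stage twiddle factors and j the imaginary unit,
     * scaling omitted):
     *
     *   A' = (A + C) + (B + D)
     *   B' = W2 * ((A + C) - (B + D))
     *   C' = W1 * ((A - C) - j*(B - D))
     *   D' = W3 * ((A - C) + j*(B - D))
     */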
    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;

    for (int k = fftLen / 4u; k > 1; k >>= 2u)
    {
        for (int i = 0; i < iter; i++)
        {
            q15_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
            q15_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];
            q15_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];
            q15_t const *pW1, *pW2, *pW3;
            q15_t *inA = pSrc + CMPLX_DIM * i * n1;
            q15_t *inB = inA + n2 * CMPLX_DIM;
            q15_t *inC = inB + n2 * CMPLX_DIM;
            q15_t *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 4;
            /*
             * load 4 x q15 complex pairs
             */
            vecA = vldrhq_s16(inA);
            vecC = vldrhq_s16(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrhq_s16(inB);
                vecD = vldrhq_s16(inD);

                vecSum0 = vhaddq(vecA, vecC);
                vecDiff0 = vhsubq(vecA, vecC);

                vecSum1 = vhaddq(vecB, vecD);
                vecDiff1 = vhsubq(vecB, vecD);
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vhaddq(vecSum0, vecSum1);
                vst1q(inA, vecTmp0);
                inA += 8;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vhsubq(vecSum0, vecSum1);
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);

                vst1q(inB, vecTmp1);
                inB += 8;
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 8;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 8;

                vecA = vldrhq_s16(inA);
                vecC = vldrhq_s16(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of the last stage process
     */
    uint32x4_t vecScGathAddr = vld1q_u32(strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
    vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);

    blkCnt = (fftLen >> 4);
    while (blkCnt > 0U)
    {
        vecSum0 = vhaddq(vecA, vecC);
        vecDiff0 = vhsubq(vecA, vecC);

        vecB = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 4);
        vecD = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 12);

        vecSum1 = vhaddq(vecB, vecD);
        vecDiff1 = vhsubq(vecB, vecD);
        /*
         * pre-load for the next iteration
         */
        vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
        vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);

        vecTmp0 = vhaddq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);

        vecTmp0 = vhsubq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);

        blkCnt--;
    }
}

static void arm_cfft_radix4by2_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen)
{
    uint32_t n2;
    q15_t *pIn0;
    q15_t *pIn1;
    const q15_t *pCoef = S->pTwiddle;
    uint32_t blkCnt;
    q15x8_t vecIn0, vecIn1, vecSum, vecDiff;
    q15x8_t vecCmplxTmp, vecTw;
    q15_t const *pCoefVec;

    n2 = fftLen >> 1;

    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;
    pCoefVec = pCoef;

    blkCnt = n2 / 4;

    while (blkCnt > 0U)
    {
        vecIn0 = *(q15x8_t *) pIn0;
        vecIn1 = *(q15x8_t *) pIn1;

        vecIn0 = vecIn0 >> 1;
        vecIn1 = vecIn1 >> 1;
        vecSum = vhaddq(vecIn0, vecIn1);
        vst1q(pIn0, vecSum);
        pIn0 += 8;

        vecTw = vld1q(pCoefVec);
        pCoefVec += 8;

        vecDiff = vhsubq(vecIn0, vecIn1);
        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxConjB(vecDiff, vecTw);
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 8;

        blkCnt--;
    }

    _arm_radix4_butterfly_q15_mve(S, pSrc, n2);

    _arm_radix4_butterfly_q15_mve(S, pSrc + fftLen, n2);

    pIn0 = pSrc;
    blkCnt = (fftLen << 1) >> 3;
    while (blkCnt > 0U)
    {
        vecIn0 = *(q15x8_t *) pIn0;
        vecIn0 = vecIn0 << 1;
        vst1q(pIn0, vecIn0);
        pIn0 += 8;
        blkCnt--;
    }
    /*
     * tail
     * (will be merged through tail predication)
     */
    blkCnt = (fftLen << 1) & 7;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);

        vecIn0 = *(q15x8_t *) pIn0;
        vecIn0 = vecIn0 << 1;
        vstrhq_p(pIn0, vecIn0, p0);
    }
}
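
/*
 * Sketch of the radix-4-by-2 split computed above, with W = exp(-j*2*pi*n/N)
 * (illustrative; the conjugate multiply yields the forward twiddle W because
 * the twiddle table stores (cos, sin) pairs):
 *
 *   X[2k]   = DFT_{N/2}{ (x[n] + x[n + N/2]) / 2 }
 *   X[2k+1] = DFT_{N/2}{ (x[n] - x[n + N/2]) / 2 * W }
 *
 * The two half-length DFTs are the two butterfly calls; the final loop
 * restores one bit of scaling (<< 1).
 */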

static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen)
{
    q15x8_t vecTmp0, vecTmp1;
    q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    q15x8_t vecA, vecB, vecC, vecD;
    q15x8_t vecW;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q15_t *), (4 - 16) * sizeof(q15_t *),
        (8 - 16) * sizeof(q15_t *), (12 - 16) * sizeof(q15_t *)
    };

    /*
     * Process first stages
     * Each middle stage scales the input down by 4 in total
     * (the halving add/sub operations each contribute a divide by 2)
     */
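    /*
     * Same structure as the forward butterfly above, but the twiddle
     * multiplies use the conjugate form (AxConjB) and the +/-j rotations are
     * swapped, which together implement multiplication by conj(W) — a sketch:
     *
     *   C' = conj(W1) * ((A - C) + j*(B - D))
     *   D' = conj(W3) * ((A - C) - j*(B - D))
     */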
    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;

    for (int k = fftLen / 4u; k > 1; k >>= 2u)
    {
        for (int i = 0; i < iter; i++)
        {
            q15_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
            q15_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];
            q15_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];
            q15_t const *pW1, *pW2, *pW3;
            q15_t *inA = pSrc + CMPLX_DIM * i * n1;
            q15_t *inB = inA + n2 * CMPLX_DIM;
            q15_t *inC = inB + n2 * CMPLX_DIM;
            q15_t *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 4;
            /*
             * load 4 x q15 complex pairs
             */
            vecA = vldrhq_s16(inA);
            vecC = vldrhq_s16(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrhq_s16(inB);
                vecD = vldrhq_s16(inD);

                vecSum0 = vhaddq(vecA, vecC);
                vecDiff0 = vhsubq(vecA, vecC);

                vecSum1 = vhaddq(vecB, vecD);
                vecDiff1 = vhsubq(vecB, vecD);
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vhaddq(vecSum0, vecSum1);
                vst1q(inA, vecTmp0);
                inA += 8;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vhsubq(vecSum0, vecSum1);
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);

                vst1q(inB, vecTmp1);
                inB += 8;
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
                vst1q(inC, vecTmp1);
                inC += 8;
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
                vst1q(inD, vecTmp1);
                inD += 8;

                vecA = vldrhq_s16(inA);
                vecC = vldrhq_s16(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of the last stage process
     */
    uint32x4_t vecScGathAddr = vld1q_u32(strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
    vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);

    blkCnt = (fftLen >> 4);
    while (blkCnt > 0U)
    {
        vecSum0 = vhaddq(vecA, vecC);
        vecDiff0 = vhsubq(vecA, vecC);

        vecB = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 4);
        vecD = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 12);

        vecSum1 = vhaddq(vecB, vecD);
        vecDiff1 = vhsubq(vecB, vecD);
        /*
         * pre-load for the next iteration
         */
        vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
        vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);

        vecTmp0 = vhaddq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);

        vecTmp0 = vhsubq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);

        blkCnt--;
    }
}

static void arm_cfft_radix4by2_inverse_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen)
{
    uint32_t n2;
    q15_t *pIn0;
    q15_t *pIn1;
    const q15_t *pCoef = S->pTwiddle;

    uint32_t blkCnt;
    q15x8_t vecIn0, vecIn1, vecSum, vecDiff;
    q15x8_t vecCmplxTmp, vecTw;
    q15_t const *pCoefVec;

    n2 = fftLen >> 1;

    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;
    pCoefVec = pCoef;

    blkCnt = n2 / 4;

    while (blkCnt > 0U)
    {
        vecIn0 = *(q15x8_t *) pIn0;
        vecIn1 = *(q15x8_t *) pIn1;

        vecIn0 = vecIn0 >> 1;
        vecIn1 = vecIn1 >> 1;
        vecSum = vhaddq(vecIn0, vecIn1);
        vst1q(pIn0, vecSum);
        pIn0 += 8;

        vecTw = vld1q(pCoefVec);
        pCoefVec += 8;

        /* inlined complex multiply vecDiff * vecTw (cf. MVE_CMPLX_MULT_FX_AxB) */
        vecDiff = vhsubq(vecIn0, vecIn1);
        vecCmplxTmp = vqrdmlsdhq(vuninitializedq_s16(), vecDiff, vecTw);
        vecCmplxTmp = vqrdmladhxq(vecCmplxTmp, vecDiff, vecTw);
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 8;

        blkCnt--;
    }

    _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, n2);

    _arm_radix4_butterfly_inverse_q15_mve(S, pSrc + fftLen, n2);

    pIn0 = pSrc;
    blkCnt = (fftLen << 1) >> 3;
    while (blkCnt > 0U)
    {
        vecIn0 = *(q15x8_t *) pIn0;
        vecIn0 = vecIn0 << 1;
        vst1q(pIn0, vecIn0);
        pIn0 += 8;
        blkCnt--;
    }
    /*
     * tail
     * (will be merged through tail predication)
     */
    blkCnt = (fftLen << 1) & 7;
    if (blkCnt > 0U)   /* 'if', not 'while': blkCnt is never decremented, so a loop would not terminate */
    {
        mve_pred16_t p0 = vctp16q(blkCnt);

        vecIn0 = *(q15x8_t *) pIn0;
        vecIn0 = vecIn0 << 1;
        vstrhq_p(pIn0, vecIn0, p0);
    }
}
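
/*
 * Inverse counterpart of the radix-4-by-2 split (a sketch, indices shown
 * schematically), with W = exp(+j*2*pi*n/N):
 *
 *   x[2k]   = IDFT_{N/2}{ (X[n] + X[n + N/2]) / 2 }
 *   x[2k+1] = IDFT_{N/2}{ (X[n] - X[n + N/2]) / 2 * W }
 *
 * identical in shape to the forward split above, but using the
 * non-conjugated twiddle multiply.
 */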

/**
  @ingroup groupTransforms
 */

/**
  @addtogroup ComplexFFT
  @{
 */

/**
  @brief         Processing function for Q15 complex FFT.
  @param[in]     S               points to an instance of the Q15 CFFT structure
  @param[in,out] pSrc            points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in]     ifftFlag        flag that selects transform direction
                   - value = 0: forward transform
                   - value = 1: inverse transform
  @param[in]     bitReverseFlag  flag that enables / disables bit reversal of output
                   - value = 0: disables bit reversal of output
                   - value = 1: enables bit reversal of output
  @return        none
 */
void arm_cfft_q15(
    const arm_cfft_instance_q15 * S,
    q15_t * pSrc,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag)
{
    uint32_t fftLen = S->fftLen;

    if (ifftFlag == 1U) {

        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);
            break;
        }
    } else {
        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);
            break;
        }
    }

    if (bitReverseFlag)
    {
        arm_bitreversal_16_inpl_mve((uint16_t *) pSrc, S->bitRevLength, S->pBitRevTable);
    }
}
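
/*
 * Minimal usage sketch (illustrative only; assumes arm_cfft_init_q15() from
 * recent CMSIS-DSP releases is used to populate the instance):
 *
 *   q15_t buf[2 * 256];                // interleaved re/im samples, filled by the caller
 *   arm_cfft_instance_q15 S;
 *   if (arm_cfft_init_q15(&S, 256) == ARM_MATH_SUCCESS)
 *   {
 *       arm_cfft_q15(&S, buf, 0, 1);   // forward transform, output re-ordered via bit reversal
 *   }
 */

/**
  @} end of ComplexFFT group
 */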

#else

extern void arm_radix4_butterfly_q15(
        q15_t * pSrc,
        uint32_t fftLen,
        const q15_t * pCoef,
        uint32_t twidCoefModifier);

extern void arm_radix4_butterfly_inverse_q15(
        q15_t * pSrc,
        uint32_t fftLen,
        const q15_t * pCoef,
        uint32_t twidCoefModifier);

extern void arm_bitreversal_16(
        uint16_t * pSrc,
        const uint16_t bitRevLen,
        const uint16_t * pBitRevTable);

void arm_cfft_radix4by2_q15(
        q15_t * pSrc,
        uint32_t fftLen,
        const q15_t * pCoef);

void arm_cfft_radix4by2_inverse_q15(
        q15_t * pSrc,
        uint32_t fftLen,
        const q15_t * pCoef);

/**
  @ingroup groupTransforms
 */

/**
  @addtogroup ComplexFFT
  @{
 */

/**
  @brief         Processing function for Q15 complex FFT.
  @param[in]     S               points to an instance of the Q15 CFFT structure
  @param[in,out] p1              points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in]     ifftFlag        flag that selects transform direction
                   - value = 0: forward transform
                   - value = 1: inverse transform
  @param[in]     bitReverseFlag  flag that enables / disables bit reversal of output
                   - value = 0: disables bit reversal of output
                   - value = 1: enables bit reversal of output
  @return        none
 */

void arm_cfft_q15(
    const arm_cfft_instance_q15 * S,
    q15_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag)
{
    uint32_t L = S->fftLen;

    if (ifftFlag == 1U)
    {
        switch (L)
        {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            arm_radix4_butterfly_inverse_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_inverse_q15 ( p1, L, S->pTwiddle );
            break;
        }
    }
    else
    {
        switch (L)
        {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            arm_radix4_butterfly_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_q15 ( p1, L, S->pTwiddle );
            break;
        }
    }

    if ( bitReverseFlag )
        arm_bitreversal_16 ((uint16_t*) p1, S->bitRevLength, S->pBitRevTable);
}

/**
  @} end of ComplexFFT group
 */

void arm_cfft_radix4by2_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef)
{
        uint32_t i;
        uint32_t n2;
        q15_t p0, p1, p2, p3;
#if defined (ARM_MATH_DSP)
        q31_t T, S, R;
        q31_t coeff, out1, out2;
  const q15_t *pC = pCoef;
        q15_t *pSi = pSrc;
        q15_t *pSl = pSrc + fftLen;
#else
        uint32_t l;
        q15_t xt, yt, cosVal, sinVal;
#endif

        n2 = fftLen >> 1U;

#if defined (ARM_MATH_DSP)

        for (i = n2; i > 0; i--)
        {
            coeff = read_q15x2_ia ((q15_t **) &pC);

            T = read_q15x2 (pSi);
            T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */

            S = read_q15x2 (pSl);
            S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */

            R = __QSUB16(T, S);

            write_q15x2_ia (&pSi, __SHADD16(T, S));

#ifndef ARM_MATH_BIG_ENDIAN
            out1 = __SMUAD(coeff, R) >> 16U;
            out2 = __SMUSDX(coeff, R);
#else
            out1 = __SMUSDX(R, coeff) >> 16U;
            out2 = __SMUAD(coeff, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

            write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ));
        }
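
        /*
         * Sketch of what the packed ops above compute (little-endian case),
         * with R = {xt, yt} and coeff = {cos, sin} packed low/high:
         *
         *   out1 = __SMUAD(coeff, R) >> 16  =  (cos*xt + sin*yt) >> 16  -> new real part
         *   out2 = __SMUSDX(coeff, R)       =   cos*yt - sin*xt         -> high half = new imag part
         *
         * i.e. (xt + j*yt) * (cos - j*sin), matching the non-DSP branch below.
         */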

#else /* #if defined (ARM_MATH_DSP) */

        for (i = 0; i < n2; i++)
        {
            cosVal = pCoef[2 * i];
            sinVal = pCoef[2 * i + 1];

            l = i + n2;

            xt          = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
            pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;

            yt              = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
            pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;

            pSrc[2 * l]     = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) +
                               ((int16_t) (((q31_t) yt * sinVal) >> 16U))  );

            pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) -
                               ((int16_t) (((q31_t) xt * sinVal) >> 16U))  );
        }

#endif /* #if defined (ARM_MATH_DSP) */

        /* first col */
        arm_radix4_butterfly_q15 (pSrc, n2, (q15_t*)pCoef, 2U);

        /* second col */
        arm_radix4_butterfly_q15 (pSrc + fftLen, n2, (q15_t*)pCoef, 2U);

        n2 = fftLen >> 1U;
        for (i = 0; i < n2; i++)
        {
            p0 = pSrc[4 * i + 0];
            p1 = pSrc[4 * i + 1];
            p2 = pSrc[4 * i + 2];
            p3 = pSrc[4 * i + 3];

            p0 <<= 1U;
            p1 <<= 1U;
            p2 <<= 1U;
            p3 <<= 1U;

            pSrc[4 * i + 0] = p0;
            pSrc[4 * i + 1] = p1;
            pSrc[4 * i + 2] = p2;
            pSrc[4 * i + 3] = p3;
        }
}

void arm_cfft_radix4by2_inverse_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef)
{
        uint32_t i;
        uint32_t n2;
        q15_t p0, p1, p2, p3;
#if defined (ARM_MATH_DSP)
        q31_t T, S, R;
        q31_t coeff, out1, out2;
  const q15_t *pC = pCoef;
        q15_t *pSi = pSrc;
        q15_t *pSl = pSrc + fftLen;
#else
        uint32_t l;
        q15_t xt, yt, cosVal, sinVal;
#endif

        n2 = fftLen >> 1U;

#if defined (ARM_MATH_DSP)

        for (i = n2; i > 0; i--)
        {
            coeff = read_q15x2_ia ((q15_t **) &pC);

            T = read_q15x2 (pSi);
            T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */

            S = read_q15x2 (pSl);
            S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */

            R = __QSUB16(T, S);

            write_q15x2_ia (&pSi, __SHADD16(T, S));

#ifndef ARM_MATH_BIG_ENDIAN
            out1 = __SMUSD(coeff, R) >> 16U;
            out2 = __SMUADX(coeff, R);
#else
            out1 = __SMUADX(R, coeff) >> 16U;
            out2 = __SMUSD(__QSUB(0, coeff), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

            write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ));
        }
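
        /*
         * As above, but with the conjugate twiddle (little-endian case):
         *
         *   out1 = __SMUSD(coeff, R) >> 16  =  (cos*xt - sin*yt) >> 16  -> new real part
         *   out2 = __SMUADX(coeff, R)       =   cos*yt + sin*xt         -> high half = new imag part
         *
         * i.e. (xt + j*yt) * (cos + j*sin), matching the non-DSP branch below.
         */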

#else /* #if defined (ARM_MATH_DSP) */

        for (i = 0; i < n2; i++)
        {
            cosVal = pCoef[2 * i];
            sinVal = pCoef[2 * i + 1];

            l = i + n2;

            xt          = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
            pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;

            yt              = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
            pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;

            pSrc[2 * l]     = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) -
                               ((int16_t) (((q31_t) yt * sinVal) >> 16U))  );

            pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) +
                               ((int16_t) (((q31_t) xt * sinVal) >> 16U))  );
        }

#endif /* #if defined (ARM_MATH_DSP) */

        /* first col */
        arm_radix4_butterfly_inverse_q15 (pSrc, n2, (q15_t*)pCoef, 2U);

        /* second col */
        arm_radix4_butterfly_inverse_q15 (pSrc + fftLen, n2, (q15_t*)pCoef, 2U);

        n2 = fftLen >> 1U;
        for (i = 0; i < n2; i++)
        {
            p0 = pSrc[4 * i + 0];
            p1 = pSrc[4 * i + 1];
            p2 = pSrc[4 * i + 2];
            p3 = pSrc[4 * i + 3];

            p0 <<= 1U;
            p1 <<= 1U;
            p2 <<= 1U;
            p3 <<= 1U;

            pSrc[4 * i + 0] = p0;
            pSrc[4 * i + 1] = p1;
            pSrc[4 * i + 2] = p2;
            pSrc[4 * i + 3] = p3;
        }
}

#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */