1 // Auto-generated file. Do not edit!
2 // Template: src/cs16-fftr/scalar.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2022 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11 #include <stddef.h>
12 #include <stdint.h>
13
14 #include <xnnpack/math.h>
15 #include <xnnpack/fft.h>
16
17
xnn_cs16_fftr_ukernel__scalar_x4(size_t samples,int16_t * data,const int16_t * twiddle)18 void xnn_cs16_fftr_ukernel__scalar_x4(
19 size_t samples,
20 int16_t* data,
21 const int16_t* twiddle)
22 {
23 assert(samples >= 2);
24 assert(samples % 2 == 0);
25 assert(data != NULL);
26 assert(data != NULL);
27 assert(twiddle != NULL);
28
29 int16_t* dl = data;
30 int16_t* dr = data + samples * 2;
31 int32_t vdcr = (int32_t) dl[0];
32 int32_t vdci = (int32_t) dl[1];
33
34 vdcr = math_asr_s32(vdcr * 16383 + 16384, 15);
35 vdci = math_asr_s32(vdci * 16383 + 16384, 15);
36
37 dl[0] = vdcr + vdci;
38 dl[1] = 0;
39 dl += 2;
40 dr[0] = vdcr - vdci;
41 dr[1] = 0;
42
43 samples >>= 1;
44
45 for (; samples >= 4; samples -= 4) {
46 dr -= 4 * 2;
47 int32_t vilr0 = dl[0];
48 int32_t vili0 = dl[1];
49 int32_t vilr1 = dl[2];
50 int32_t vili1 = dl[3];
51 int32_t vilr2 = dl[4];
52 int32_t vili2 = dl[5];
53 int32_t vilr3 = dl[6];
54 int32_t vili3 = dl[7];
55 int32_t virr0 = (int32_t) dr[6];
56 int32_t viri0 = -(int32_t) dr[7];
57 int32_t virr1 = (int32_t) dr[4];
58 int32_t viri1 = -(int32_t) dr[5];
59 int32_t virr2 = (int32_t) dr[2];
60 int32_t viri2 = -(int32_t) dr[3];
61 int32_t virr3 = (int32_t) dr[0];
62 int32_t viri3 = -(int32_t) dr[1];
63 const int32_t vtwr0 = twiddle[0];
64 const int32_t vtwi0 = twiddle[1];
65 const int32_t vtwr1 = twiddle[2];
66 const int32_t vtwi1 = twiddle[3];
67 const int32_t vtwr2 = twiddle[4];
68 const int32_t vtwi2 = twiddle[5];
69 const int32_t vtwr3 = twiddle[6];
70 const int32_t vtwi3 = twiddle[7];
71 twiddle += 4 * 2;
72
73 vilr0 = math_asr_s32(vilr0 * 16383 + 16384, 15);
74 virr0 = math_asr_s32(virr0 * 16383 + 16384, 15);
75 vilr1 = math_asr_s32(vilr1 * 16383 + 16384, 15);
76 virr1 = math_asr_s32(virr1 * 16383 + 16384, 15);
77 vilr2 = math_asr_s32(vilr2 * 16383 + 16384, 15);
78 virr2 = math_asr_s32(virr2 * 16383 + 16384, 15);
79 vilr3 = math_asr_s32(vilr3 * 16383 + 16384, 15);
80 virr3 = math_asr_s32(virr3 * 16383 + 16384, 15);
81 vili0 = math_asr_s32(vili0 * 16383 + 16384, 15);
82 viri0 = math_asr_s32(viri0 * 16383 + 16384, 15);
83 vili1 = math_asr_s32(vili1 * 16383 + 16384, 15);
84 viri1 = math_asr_s32(viri1 * 16383 + 16384, 15);
85 vili2 = math_asr_s32(vili2 * 16383 + 16384, 15);
86 viri2 = math_asr_s32(viri2 * 16383 + 16384, 15);
87 vili3 = math_asr_s32(vili3 * 16383 + 16384, 15);
88 viri3 = math_asr_s32(viri3 * 16383 + 16384, 15);
89 const int32_t vacc1r0 = vilr0 + virr0;
90 const int32_t vacc2r0 = vilr0 - virr0;
91 const int32_t vacc1r1 = vilr1 + virr1;
92 const int32_t vacc2r1 = vilr1 - virr1;
93 const int32_t vacc1r2 = vilr2 + virr2;
94 const int32_t vacc2r2 = vilr2 - virr2;
95 const int32_t vacc1r3 = vilr3 + virr3;
96 const int32_t vacc2r3 = vilr3 - virr3;
97 const int32_t vacc1i0 = vili0 + viri0;
98 const int32_t vacc2i0 = vili0 - viri0;
99 const int32_t vacc1i1 = vili1 + viri1;
100 const int32_t vacc2i1 = vili1 - viri1;
101 const int32_t vacc1i2 = vili2 + viri2;
102 const int32_t vacc2i2 = vili2 - viri2;
103 const int32_t vacc1i3 = vili3 + viri3;
104 const int32_t vacc2i3 = vili3 - viri3;
105
106 const int32_t twr0 = math_asr_s32(vacc2r0 * vtwr0 - vacc2i0 * vtwi0 + 16384, 15);
107 const int32_t twr1 = math_asr_s32(vacc2r1 * vtwr1 - vacc2i1 * vtwi1 + 16384, 15);
108 const int32_t twr2 = math_asr_s32(vacc2r2 * vtwr2 - vacc2i2 * vtwi2 + 16384, 15);
109 const int32_t twr3 = math_asr_s32(vacc2r3 * vtwr3 - vacc2i3 * vtwi3 + 16384, 15);
110 const int32_t twi0 = math_asr_s32(vacc2r0 * vtwi0 + vacc2i0 * vtwr0 + 16384, 15);
111 const int32_t twi1 = math_asr_s32(vacc2r1 * vtwi1 + vacc2i1 * vtwr1 + 16384, 15);
112 const int32_t twi2 = math_asr_s32(vacc2r2 * vtwi2 + vacc2i2 * vtwr2 + 16384, 15);
113 const int32_t twi3 = math_asr_s32(vacc2r3 * vtwi3 + vacc2i3 * vtwr3 + 16384, 15);
114
115 dl[0] = math_asr_s32(vacc1r0 + twr0, 1);
116 dl[1] = math_asr_s32(vacc1i0 + twi0, 1);
117 dl[2] = math_asr_s32(vacc1r1 + twr1, 1);
118 dl[3] = math_asr_s32(vacc1i1 + twi1, 1);
119 dl[4] = math_asr_s32(vacc1r2 + twr2, 1);
120 dl[5] = math_asr_s32(vacc1i2 + twi2, 1);
121 dl[6] = math_asr_s32(vacc1r3 + twr3, 1);
122 dl[7] = math_asr_s32(vacc1i3 + twi3, 1);
123 dr[6] = math_asr_s32(vacc1r0 - twr0, 1);
124 dr[7] = math_asr_s32(twi0 - vacc1i0, 1);
125 dr[4] = math_asr_s32(vacc1r1 - twr1, 1);
126 dr[5] = math_asr_s32(twi1 - vacc1i1, 1);
127 dr[2] = math_asr_s32(vacc1r2 - twr2, 1);
128 dr[3] = math_asr_s32(twi2 - vacc1i2, 1);
129 dr[0] = math_asr_s32(vacc1r3 - twr3, 1);
130 dr[1] = math_asr_s32(twi3 - vacc1i3, 1);
131 dl += 4 * 2;
132 }
133
134 if XNN_UNLIKELY(samples != 0) {
135 do {
136 dr -= 2;
137 int32_t vilr = dl[0];
138 int32_t vili = dl[1];
139 int32_t virr = (int32_t) dr[0];
140 int32_t viri = -(int32_t) dr[1];
141 const int32_t vtwr = twiddle[0];
142 const int32_t vtwi = twiddle[1];
143 twiddle += 2;
144
145 vilr = math_asr_s32(vilr * 16383 + 16384, 15);
146 vili = math_asr_s32(vili * 16383 + 16384, 15);
147 virr = math_asr_s32(virr * 16383 + 16384, 15);
148 viri = math_asr_s32(viri * 16383 + 16384, 15);
149 const int32_t vacc1r = vilr + virr;
150 const int32_t vacc1i = vili + viri;
151 const int32_t vacc2r = vilr - virr;
152 const int32_t vacc2i = vili - viri;
153
154 const int32_t twr = math_asr_s32(vacc2r * vtwr - vacc2i * vtwi + 16384, 15);
155 const int32_t twi = math_asr_s32(vacc2r * vtwi + vacc2i * vtwr + 16384, 15);
156
157 dl[0] = math_asr_s32(vacc1r + twr, 1);
158 dl[1] = math_asr_s32(vacc1i + twi, 1);
159 dr[0] = math_asr_s32(vacc1r - twr, 1);
160 dr[1] = math_asr_s32(twi - vacc1i, 1);
161 dl += 2;
162 } while (--samples != 0);
163 }
164 }
165