/******************************************************************************
 *
 *  Copyright 2022 Google LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at:
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

#if __ARM_NEON && __ARM_ARCH_ISA_A64

#ifndef TEST_NEON
#include <arm_neon.h>
#endif /* TEST_NEON */

/**
 * FFT 5 Points
 * The number of interleaved transforms `n` is assumed to be even
 */
#ifndef fft_5
#define fft_5 neon_fft_5
LC3_HOT static inline void neon_fft_5(
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
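    /* The cos(2*pi*k/5) factors are identical in both lanes, while the
     * sin(2*pi*k/5) factors carry opposite signs in the real and imaginary
     * lanes. Multiplying the lane-reversed differences `d14` and `d23`
     * below (re/im swapped by vrev64q_f32) by these sin vectors realizes
     * the +/- j*sin rotations of the 5-point DFT on interleaved
     * { re, im } data. */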
    static const union { float f[2]; uint64_t u64; }
        __cos1 = { {  0.3090169944,  0.3090169944 } },
        __cos2 = { { -0.8090169944, -0.8090169944 } },
        __sin1 = { {  0.9510565163, -0.9510565163 } },
        __sin2 = { {  0.5877852523, -0.5877852523 } };

    float32x2_t sin1 = vcreate_f32(__sin1.u64);
    float32x2_t sin2 = vcreate_f32(__sin2.u64);
    float32x2_t cos1 = vcreate_f32(__cos1.u64);
    float32x2_t cos2 = vcreate_f32(__cos2.u64);

    float32x4_t sin1q = vcombine_f32(sin1, sin1);
    float32x4_t sin2q = vcombine_f32(sin2, sin2);
    float32x4_t cos1q = vcombine_f32(cos1, cos1);
    float32x4_t cos2q = vcombine_f32(cos2, cos2);

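    /* Each iteration computes two of the `n` interleaved transforms at once:
     * a 128-bit register holds point k of transform i in its low half and
     * point k of transform i+1 in its high half, hence `n` must be even. */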
    for (int i = 0; i < n; i += 2, x += 2, y += 10) {

        float32x4_t y0, y1, y2, y3, y4;

        float32x4_t x0 = vld1q_f32( (float *)(x + 0*n) );
        float32x4_t x1 = vld1q_f32( (float *)(x + 1*n) );
        float32x4_t x2 = vld1q_f32( (float *)(x + 2*n) );
        float32x4_t x3 = vld1q_f32( (float *)(x + 3*n) );
        float32x4_t x4 = vld1q_f32( (float *)(x + 4*n) );

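        /* Sums and differences of the symmetric input pairs; vrev64q_f32()
         * swaps re/im within each complex value, preparing the differences
         * for the rotation by the sin factors. */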
        float32x4_t s14 = vaddq_f32(x1, x4);
        float32x4_t s23 = vaddq_f32(x2, x3);

        float32x4_t d14 = vrev64q_f32( vsubq_f32(x1, x4) );
        float32x4_t d23 = vrev64q_f32( vsubq_f32(x2, x3) );

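        /* 5-point DFT butterfly: y0 is the plain sum of all inputs, while
         * the pairs (y1, y4) and (y2, y3) share their cos contributions and
         * differ only in the sign of the rotated difference terms. */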
        y0 = vaddq_f32( x0, vaddq_f32(s14, s23) );

        y4 = vfmaq_f32( x0, s14, cos1q );
        y4 = vfmaq_f32( y4, s23, cos2q );

        y1 = vfmaq_f32( y4, d14, sin1q );
        y1 = vfmaq_f32( y1, d23, sin2q );

        y4 = vfmsq_f32( y4, d14, sin1q );
        y4 = vfmsq_f32( y4, d23, sin2q );

        y3 = vfmaq_f32( x0, s14, cos2q );
        y3 = vfmaq_f32( y3, s23, cos1q );

        y2 = vfmaq_f32( y3, d14, sin2q );
        y2 = vfmsq_f32( y2, d23, sin1q );

        y3 = vfmsq_f32( y3, d14, sin2q );
        y3 = vfmaq_f32( y3, d23, sin1q );

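        /* The low 64 bits of each result belong to transform i and are
         * written to y[0..4]; the high halves belong to transform i+1 and
         * go to y[5..9]. */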
        vst1_f32( (float *)(y + 0), vget_low_f32(y0) );
        vst1_f32( (float *)(y + 1), vget_low_f32(y1) );
        vst1_f32( (float *)(y + 2), vget_low_f32(y2) );
        vst1_f32( (float *)(y + 3), vget_low_f32(y3) );
        vst1_f32( (float *)(y + 4), vget_low_f32(y4) );

        vst1_f32( (float *)(y + 5), vget_high_f32(y0) );
        vst1_f32( (float *)(y + 6), vget_high_f32(y1) );
        vst1_f32( (float *)(y + 7), vget_high_f32(y2) );
        vst1_f32( (float *)(y + 8), vget_high_f32(y3) );
        vst1_f32( (float *)(y + 9), vget_high_f32(y4) );
    }
}
#endif /* fft_5 */

/**
 * FFT Butterfly 3 Points
 */
#ifndef fft_bf3
#define fft_bf3 neon_fft_bf3
LC3_HOT static inline void neon_fft_bf3(
    const struct lc3_fft_bf3_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n3 = twiddles->n3;
    const struct lc3_complex (*w0_ptr)[2] = twiddles->t;
    const struct lc3_complex (*w1_ptr)[2] = w0_ptr + n3;
    const struct lc3_complex (*w2_ptr)[2] = w1_ptr + n3;

    const struct lc3_complex *x0_ptr = x;
    const struct lc3_complex *x1_ptr = x0_ptr + n*n3;
    const struct lc3_complex *x2_ptr = x1_ptr + n*n3;

    struct lc3_complex *y0_ptr = y;
    struct lc3_complex *y1_ptr = y0_ptr + n3;
    struct lc3_complex *y2_ptr = y1_ptr + n3;

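    /* Radix-3 butterfly: each output branch k is computed as
     * y_k = x0 + w_k[0]*x1 + w_k[1]*x2, using its own pair of complex
     * twiddles per sample. */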
    for (int j, i = 0; i < n; i++,
            y0_ptr += 3*n3, y1_ptr += 3*n3, y2_ptr += 3*n3) {

        /* --- Process by pair --- */

        for (j = 0; j < (n3 >> 1); j++,
                x0_ptr += 2, x1_ptr += 2, x2_ptr += 2) {

            float32x4_t x0 = vld1q_f32( (float *)x0_ptr );
            float32x4_t x1 = vld1q_f32( (float *)x1_ptr );
            float32x4_t x2 = vld1q_f32( (float *)x2_ptr );

            float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 );
            float32x4_t x2r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x2)), x2 );

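            /* x1r and x2r hold { -im, re } of each complex value, so that
             * accumulating x * w_re + xr * w_im performs a full complex
             * multiplication. vld2q_f32() de-interleaves a twiddle pair:
             * wn.val[0] holds the real parts, wn.val[1] the imaginary parts,
             * and vtrn1q/vtrn2q broadcast the first and second twiddle of
             * the pair respectively. */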
            float32x4x2_t wn;
            float32x4_t yn;

            wn = vld2q_f32( (float *)(w0_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y0_ptr + 2*j), yn );

            wn = vld2q_f32( (float *)(w1_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y1_ptr + 2*j), yn );

            wn = vld2q_f32( (float *)(w2_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y2_ptr + 2*j), yn );

        }

        /* --- Last iteration --- */

        if (n3 & 1) {

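            /* Odd n3: a single complex value remains in each branch,
             * processed with 64-bit (float32x2) vectors. */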
            float32x2x2_t wn;
            float32x2_t yn;

            float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) );
            float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) );
            float32x2_t x2 = vld1_f32( (float *)(x2_ptr++) );

            float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 );
            float32x2_t x2r = vtrn1_f32( vrev64_f32(vneg_f32(x2)), x2 );

            wn = vld2_f32( (float *)(w0_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y0_ptr + 2*j), yn );

            wn = vld2_f32( (float *)(w1_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y1_ptr + 2*j), yn );

            wn = vld2_f32( (float *)(w2_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y2_ptr + 2*j), yn );
        }

    }
}
#endif /* fft_bf3 */

/**
 * FFT Butterfly 2 Points
 */
#ifndef fft_bf2
#define fft_bf2 neon_fft_bf2
LC3_HOT static inline void neon_fft_bf2(
    const struct lc3_fft_bf2_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n2 = twiddles->n2;
    const struct lc3_complex *w_ptr = twiddles->t;

    const struct lc3_complex *x0_ptr = x;
    const struct lc3_complex *x1_ptr = x0_ptr + n*n2;

    struct lc3_complex *y0_ptr = y;
    struct lc3_complex *y1_ptr = y0_ptr + n2;

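    /* Radix-2 butterfly: y0 = x0 + w*x1 and y1 = x0 - w*x1, processing
     * two complex samples per 128-bit register. */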
    for (int j, i = 0; i < n; i++, y0_ptr += 2*n2, y1_ptr += 2*n2) {

        /* --- Process by pair --- */

        for (j = 0; j < (n2 >> 1); j++, x0_ptr += 2, x1_ptr += 2) {

            float32x4_t x0 = vld1q_f32( (float *)x0_ptr );
            float32x4_t x1 = vld1q_f32( (float *)x1_ptr );
            float32x4_t y0, y1;

            float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 );

            float32x4_t w = vld1q_f32( (float *)(w_ptr + 2*j) );
            float32x4_t w_re = vtrn1q_f32(w, w);
            float32x4_t w_im = vtrn2q_f32(w, w);

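            /* x1r holds { -im, re } of each x1 value, so accumulating
             * x1 * w_re + x1r * w_im yields the complex product w * x1;
             * the butterfly outputs are then x0 + w*x1 and x0 - w*x1. */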
            y0 = vfmaq_f32( x0, x1 , w_re );
            y0 = vfmaq_f32( y0, x1r, w_im );
            vst1q_f32( (float *)(y0_ptr + 2*j), y0 );

            y1 = vfmsq_f32( x0, x1 , w_re );
            y1 = vfmsq_f32( y1, x1r, w_im );
            vst1q_f32( (float *)(y1_ptr + 2*j), y1 );
        }

        /* --- Last iteration --- */

        if (n2 & 1) {

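            /* Odd n2: a single complex value remains, processed with
             * 64-bit (float32x2) vectors. */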
            float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) );
            float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) );
            float32x2_t y0, y1;

            float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 );

            float32x2_t w = vld1_f32( (float *)(w_ptr + 2*j) );
            float32x2_t w_re = vtrn1_f32(w, w);
            float32x2_t w_im = vtrn2_f32(w, w);

            y0 = vfma_f32( x0, x1 , w_re );
            y0 = vfma_f32( y0, x1r, w_im );
            vst1_f32( (float *)(y0_ptr + 2*j), y0 );

            y1 = vfms_f32( x0, x1 , w_re );
            y1 = vfms_f32( y1, x1r, w_im );
            vst1_f32( (float *)(y1_ptr + 2*j), y1 );
        }
    }
}
#endif /* fft_bf2 */

#endif /* __ARM_NEON && __ARM_ARCH_ISA_A64 */
