/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
12 #ifndef AOM_AOM_DSP_FFT_COMMON_H_
13 #define AOM_AOM_DSP_FFT_COMMON_H_
14 
15 #ifdef __cplusplus
16 extern "C" {
17 #endif
18 
/*!\brief A function pointer for computing 1d fft and ifft.
 *
 * The function will point to an implementation for a specific transform size,
 * and may perform the transforms using vectorized instructions.
 *
 * For a non-vectorized forward transform of size n, the input and output
 * buffers will be size n. The output takes advantage of conjugate symmetry and
 * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
 * (r_{j}, i_{j}) is the complex output for index j.
 *
 * An inverse transform will assume that the complex "input" is packed
 * similarly. Its output will be real.
 *
 * Non-vectorized transforms (e.g., on a single row) would use a stride = 1.
 *
 * Vectorized implementations are parallelized along the columns so that the fft
 * can be performed on multiple columns at a time. In such cases the data block
 * for input and output is typically square (n x n) and the stride will
 * correspond to the spacing between rows. At minimum, the input size must be
 * n x simd_vector_length.
 *
 * \param[in]  input   Input buffer. See above for size restrictions.
 * \param[out] output  Output buffer. See above for size restrictions.
 * \param[in]  stride  Spacing, in number of elements, between consecutive
 *                     rows (vectorized case) or between consecutive samples
 *                     (non-vectorized case, where stride = 1)
 */
typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
                                  int stride);
47 
// Declare the non-vectorized forward (aom_fft1d_*) and inverse (aom_ifft1d_*)
// transforms of sizes 2 through 32. These are used in some of the vectorized
// implementations. Every function here has the aom_fft_1d_func_t signature.
void aom_fft1d_2_float(const float *input, float *output, int stride);
void aom_fft1d_4_float(const float *input, float *output, int stride);
void aom_fft1d_8_float(const float *input, float *output, int stride);
void aom_fft1d_16_float(const float *input, float *output, int stride);
void aom_fft1d_32_float(const float *input, float *output, int stride);
void aom_ifft1d_2_float(const float *input, float *output, int stride);
void aom_ifft1d_4_float(const float *input, float *output, int stride);
void aom_ifft1d_8_float(const float *input, float *output, int stride);
void aom_ifft1d_16_float(const float *input, float *output, int stride);
void aom_ifft1d_32_float(const float *input, float *output, int stride);
60 
/*!\brief Function pointer for transposing a matrix of floats.
 *
 * \param[in]  input  Input buffer (size n x n)
 * \param[out] output Output buffer (size n x n)
 * \param[in]  n      Extent of one dimension of the square matrix.
 */
typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
                                         int n);
69 
/*!\brief Function pointer for re-arranging intermediate 2d transform results.
 *
 * After re-arrangement, the real and imaginary components will be packed
 * tightly next to each other.
 *
 * \param[in]  input  Input buffer (size n x n)
 * \param[out] output Output buffer (size 2 x n x n)
 * \param[in]  n      Extent of one dimension of the square matrix.
 */
typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
80 
81 /*!\brief Performs a 2d fft with the given functions.
82  *
83  * This generator function allows for multiple different implementations of 2d
84  * fft with different vector operations, without having to redefine the main
85  * body multiple times.
86  *
87  * \param[in]  input     Input buffer to run the transform on (size n x n)
88  * \param[out] temp      Working buffer for computing the transform (size n x n)
89  * \param[out] output    Output buffer (size 2 x n x n)
90  * \param[in]  tform     Forward transform function
91  * \param[in]  transpose Transpose function (for n x n matrix)
92  * \param[in]  unpack    Unpack function used to massage outputs to correct form
93  * \param[in]  vec_size  Vector size (the transform is done vec_size units at
94  *                       a time)
95  */
96 void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
97                     aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
98                     aom_fft_unpack_func_t unpack, int vec_size);
99 
100 /*!\brief Perform a 2d inverse fft with the given helper functions
101  *
102  * \param[in]  input      Input buffer to run the transform on (size 2 x n x n)
103  * \param[out] temp       Working buffer for computations (size 2 x n x n)
104  * \param[out] output     Output buffer (size n x n)
105  * \param[in]  fft_single Forward transform function (non vectorized)
106  * \param[in]  fft_multi  Forward transform function (vectorized)
107  * \param[in]  ifft_multi Inverse transform function (vectorized)
108  * \param[in]  transpose  Transpose function (for n x n matrix)
109  * \param[in]  vec_size   Vector size (the transform is done vec_size
110  *                        units at a time)
111  */
112 void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
113                      aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
114                      aom_fft_1d_func_t ifft_multi,
115                      aom_fft_transpose_func_t transpose, int vec_size);
116 #ifdef __cplusplus
117 }
118 #endif
119 
// The macros below define 1D fft/ifft for different data types and for
// different simd vector intrinsic types.
122 
// Generates aom_fft1d_2_<suffix>(const T *input, T *output, int stride): a
// 2-point forward transform that writes (in0 + in1) to output[0 * stride] and
// (in0 - in1) to output[1 * stride].
//
//   ret    - return type (and any specifiers) placed before the function name
//   suffix - token appended to "aom_fft1d_2_" to form the function name
//   T      - element type of the input and output buffers
//   T_VEC  - type of the values yielded by load() and accepted by store()
//   load   - applied as load(ptr) to read one T_VEC from a const T *
//   store  - applied as store(ptr, value) to write one T_VEC to a T *
//
// Note: unlike the larger generators, this one takes no add/sub operations
// and combines the loaded values with the built-in + and - operators, so
// T_VEC must support those operators directly.
#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store)               \
  ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
    const T_VEC i0 = load(input + 0 * stride);                      \
    const T_VEC i1 = load(input + 1 * stride);                      \
    store(output + 0 * stride, i0 + i1);                            \
    store(output + 1 * stride, i0 - i1);                            \
  }
130 
// Generates aom_fft1d_4_<suffix>(const T *input, T *output, int stride): a
// 4-point forward transform of real input. The four results are stored at
// output[k * stride], k = 0..3, as [r_0, r_1, r_2, i_1] (real parts first,
// then the imaginary part of bin 1).
//
//   ret      - return type (and any specifiers) placed before the function name
//   suffix   - token appended to "aom_fft1d_4_" to form the function name
//   T        - element type of the input and output buffers
//   T_VEC    - type of the values yielded by load() and accepted by store()
//   load     - applied as load(ptr) to read one T_VEC from a const T *
//   store    - applied as store(ptr, value) to write one T_VEC to a T *
//   constant - applied as constant(float_literal) to produce a T_VEC
//   add, sub - applied as add(a, b) / sub(a, b) on T_VEC values
//
// kWeight0 is zero; sub(kWeight0, x) is how the generated code negates x.
#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
  ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) {       \
    const T_VEC kWeight0 = constant(0.0f);                                \
    const T_VEC i0 = load(input + 0 * stride);                            \
    const T_VEC i1 = load(input + 1 * stride);                            \
    const T_VEC i2 = load(input + 2 * stride);                            \
    const T_VEC i3 = load(input + 3 * stride);                            \
    const T_VEC w0 = add(i0, i2);                                         \
    const T_VEC w1 = sub(i0, i2);                                         \
    const T_VEC w2 = add(i1, i3);                                         \
    const T_VEC w3 = sub(i1, i3);                                         \
    store(output + 0 * stride, add(w0, w2));                              \
    store(output + 1 * stride, w1);                                       \
    store(output + 2 * stride, sub(w0, w2));                              \
    store(output + 3 * stride, sub(kWeight0, w3));                        \
  }
147 
// Generates aom_fft1d_8_<suffix>(const T *input, T *output, int stride): an
// 8-point forward transform of real input. The eight results are stored at
// output[k * stride], k = 0..7, as [r_0, ..., r_4, i_1, i_2, i_3] (real parts
// of bins 0..4 followed by imaginary parts of bins 1..3).
//
//   ret      - return type (and any specifiers) placed before the function name
//   suffix   - token appended to "aom_fft1d_8_" to form the function name
//   T        - element type of the input and output buffers
//   T_VEC    - type of the values yielded by load() and accepted by store()
//   load     - applied as load(ptr) to read one T_VEC from a const T *
//   store    - applied as store(ptr, value) to write one T_VEC to a T *
//   constant - applied as constant(float_literal) to produce a T_VEC
//   add, sub, mul - applied as op(a, b) on T_VEC values
//
// kWeight0 is zero (sub(kWeight0, x) negates x); kWeight2 is approximately
// sqrt(1/2).
#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
  ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) {            \
    const T_VEC kWeight0 = constant(0.0f);                                     \
    const T_VEC kWeight2 = constant(0.707107f);                                \
    const T_VEC i0 = load(input + 0 * stride);                                 \
    const T_VEC i1 = load(input + 1 * stride);                                 \
    const T_VEC i2 = load(input + 2 * stride);                                 \
    const T_VEC i3 = load(input + 3 * stride);                                 \
    const T_VEC i4 = load(input + 4 * stride);                                 \
    const T_VEC i5 = load(input + 5 * stride);                                 \
    const T_VEC i6 = load(input + 6 * stride);                                 \
    const T_VEC i7 = load(input + 7 * stride);                                 \
    const T_VEC w0 = add(i0, i4);                                              \
    const T_VEC w1 = sub(i0, i4);                                              \
    const T_VEC w2 = add(i2, i6);                                              \
    const T_VEC w3 = sub(i2, i6);                                              \
    const T_VEC w4 = add(w0, w2);                                              \
    const T_VEC w5 = sub(w0, w2);                                              \
    const T_VEC w7 = add(i1, i5);                                              \
    const T_VEC w8 = sub(i1, i5);                                              \
    const T_VEC w9 = add(i3, i7);                                              \
    const T_VEC w10 = sub(i3, i7);                                             \
    const T_VEC w11 = add(w7, w9);                                             \
    const T_VEC w12 = sub(w7, w9);                                             \
    store(output + 0 * stride, add(w4, w11));                                  \
    store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10))));          \
    store(output + 2 * stride, w5);                                            \
    store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10))));          \
    store(output + 4 * stride, sub(w4, w11));                                  \
    store(output + 5 * stride,                                                 \
          sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8))));                \
    store(output + 6 * stride, sub(kWeight0, w12));                            \
    store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8))));          \
  }
182 
// Generates aom_fft1d_16_<suffix>(const T *input, T *output, int stride): a
// 16-point forward transform of real input. The sixteen results are stored at
// output[k * stride], k = 0..15, as [r_0, ..., r_8, i_1, ..., i_7] (real parts
// of bins 0..8 followed by imaginary parts of bins 1..7).
//
//   ret      - return type (and any specifiers) placed before the function name
//   suffix   - token appended to "aom_fft1d_16_" to form the function name
//   T        - element type of the input and output buffers
//   T_VEC    - type of the values yielded by load() and accepted by store()
//   load     - applied as load(ptr) to read one T_VEC from a const T *
//   store    - applied as store(ptr, value) to write one T_VEC to a T *
//   constant - applied as constant(float_literal) to produce a T_VEC
//   add, sub, mul - applied as op(a, b) on T_VEC values
//
// kWeight0 is zero (sub(kWeight0, x) negates x). kWeight2, kWeight3 and
// kWeight4 are approximately sqrt(1/2), cos(pi/8) and sin(pi/8). The
// two-element arrays (w16, w18, w35, w37) hold a { real, imaginary } pair of
// an intermediate complex value.
#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
                   mul)                                                    \
  ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) {       \
    const T_VEC kWeight0 = constant(0.0f);                                 \
    const T_VEC kWeight2 = constant(0.707107f);                            \
    const T_VEC kWeight3 = constant(0.92388f);                             \
    const T_VEC kWeight4 = constant(0.382683f);                            \
    const T_VEC i0 = load(input + 0 * stride);                             \
    const T_VEC i1 = load(input + 1 * stride);                             \
    const T_VEC i2 = load(input + 2 * stride);                             \
    const T_VEC i3 = load(input + 3 * stride);                             \
    const T_VEC i4 = load(input + 4 * stride);                             \
    const T_VEC i5 = load(input + 5 * stride);                             \
    const T_VEC i6 = load(input + 6 * stride);                             \
    const T_VEC i7 = load(input + 7 * stride);                             \
    const T_VEC i8 = load(input + 8 * stride);                             \
    const T_VEC i9 = load(input + 9 * stride);                             \
    const T_VEC i10 = load(input + 10 * stride);                           \
    const T_VEC i11 = load(input + 11 * stride);                           \
    const T_VEC i12 = load(input + 12 * stride);                           \
    const T_VEC i13 = load(input + 13 * stride);                           \
    const T_VEC i14 = load(input + 14 * stride);                           \
    const T_VEC i15 = load(input + 15 * stride);                           \
    const T_VEC w0 = add(i0, i8);                                          \
    const T_VEC w1 = sub(i0, i8);                                          \
    const T_VEC w2 = add(i4, i12);                                         \
    const T_VEC w3 = sub(i4, i12);                                         \
    const T_VEC w4 = add(w0, w2);                                          \
    const T_VEC w5 = sub(w0, w2);                                          \
    const T_VEC w7 = add(i2, i10);                                         \
    const T_VEC w8 = sub(i2, i10);                                         \
    const T_VEC w9 = add(i6, i14);                                         \
    const T_VEC w10 = sub(i6, i14);                                        \
    const T_VEC w11 = add(w7, w9);                                         \
    const T_VEC w12 = sub(w7, w9);                                         \
    const T_VEC w14 = add(w4, w11);                                        \
    const T_VEC w15 = sub(w4, w11);                                        \
    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),           \
                           sub(sub(kWeight0, w3),                          \
                               mul(kWeight2, add(w10, w8))) };             \
    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),           \
                           sub(w3, mul(kWeight2, add(w10, w8))) };         \
    const T_VEC w19 = add(i1, i9);                                         \
    const T_VEC w20 = sub(i1, i9);                                         \
    const T_VEC w21 = add(i5, i13);                                        \
    const T_VEC w22 = sub(i5, i13);                                        \
    const T_VEC w23 = add(w19, w21);                                       \
    const T_VEC w24 = sub(w19, w21);                                       \
    const T_VEC w26 = add(i3, i11);                                        \
    const T_VEC w27 = sub(i3, i11);                                        \
    const T_VEC w28 = add(i7, i15);                                        \
    const T_VEC w29 = sub(i7, i15);                                        \
    const T_VEC w30 = add(w26, w28);                                       \
    const T_VEC w31 = sub(w26, w28);                                       \
    const T_VEC w33 = add(w23, w30);                                       \
    const T_VEC w34 = sub(w23, w30);                                       \
    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),         \
                           sub(sub(kWeight0, w22),                         \
                               mul(kWeight2, add(w29, w27))) };            \
    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),         \
                           sub(w22, mul(kWeight2, add(w29, w27))) };       \
    store(output + 0 * stride, add(w14, w33));                             \
    store(output + 1 * stride,                                             \
          add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
    store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31))));     \
    store(output + 3 * stride,                                             \
          add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
    store(output + 4 * stride, w15);                                       \
    store(output + 5 * stride,                                             \
          add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])),            \
                          mul(kWeight3, w37[1]))));                        \
    store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31))));     \
    store(output + 7 * stride,                                             \
          add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])),            \
                          mul(kWeight4, w35[1]))));                        \
    store(output + 8 * stride, sub(w14, w33));                             \
    store(output + 9 * stride,                                             \
          add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
    store(output + 10 * stride,                                            \
          sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24))));          \
    store(output + 11 * stride,                                            \
          add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
    store(output + 12 * stride, sub(kWeight0, w34));                       \
    store(output + 13 * stride,                                            \
          sub(sub(kWeight0, w18[1]),                                       \
              sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))));         \
    store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24))));   \
    store(output + 15 * stride,                                            \
          sub(sub(kWeight0, w16[1]),                                       \
              sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))));         \
  }
274 
275 #define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub,   \
276                    mul)                                                      \
277   ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) {         \
278     const T_VEC kWeight0 = constant(0.0f);                                   \
279     const T_VEC kWeight2 = constant(0.707107f);                              \
280     const T_VEC kWeight3 = constant(0.92388f);                               \
281     const T_VEC kWeight4 = constant(0.382683f);                              \
282     const T_VEC kWeight5 = constant(0.980785f);                              \
283     const T_VEC kWeight6 = constant(0.19509f);                               \
284     const T_VEC kWeight7 = constant(0.83147f);                               \
285     const T_VEC kWeight8 = constant(0.55557f);                               \
286     const T_VEC i0 = load(input + 0 * stride);                               \
287     const T_VEC i1 = load(input + 1 * stride);                               \
288     const T_VEC i2 = load(input + 2 * stride);                               \
289     const T_VEC i3 = load(input + 3 * stride);                               \
290     const T_VEC i4 = load(input + 4 * stride);                               \
291     const T_VEC i5 = load(input + 5 * stride);                               \
292     const T_VEC i6 = load(input + 6 * stride);                               \
293     const T_VEC i7 = load(input + 7 * stride);                               \
294     const T_VEC i8 = load(input + 8 * stride);                               \
295     const T_VEC i9 = load(input + 9 * stride);                               \
296     const T_VEC i10 = load(input + 10 * stride);                             \
297     const T_VEC i11 = load(input + 11 * stride);                             \
298     const T_VEC i12 = load(input + 12 * stride);                             \
299     const T_VEC i13 = load(input + 13 * stride);                             \
300     const T_VEC i14 = load(input + 14 * stride);                             \
301     const T_VEC i15 = load(input + 15 * stride);                             \
302     const T_VEC i16 = load(input + 16 * stride);                             \
303     const T_VEC i17 = load(input + 17 * stride);                             \
304     const T_VEC i18 = load(input + 18 * stride);                             \
305     const T_VEC i19 = load(input + 19 * stride);                             \
306     const T_VEC i20 = load(input + 20 * stride);                             \
307     const T_VEC i21 = load(input + 21 * stride);                             \
308     const T_VEC i22 = load(input + 22 * stride);                             \
309     const T_VEC i23 = load(input + 23 * stride);                             \
310     const T_VEC i24 = load(input + 24 * stride);                             \
311     const T_VEC i25 = load(input + 25 * stride);                             \
312     const T_VEC i26 = load(input + 26 * stride);                             \
313     const T_VEC i27 = load(input + 27 * stride);                             \
314     const T_VEC i28 = load(input + 28 * stride);                             \
315     const T_VEC i29 = load(input + 29 * stride);                             \
316     const T_VEC i30 = load(input + 30 * stride);                             \
317     const T_VEC i31 = load(input + 31 * stride);                             \
318     const T_VEC w0 = add(i0, i16);                                           \
319     const T_VEC w1 = sub(i0, i16);                                           \
320     const T_VEC w2 = add(i8, i24);                                           \
321     const T_VEC w3 = sub(i8, i24);                                           \
322     const T_VEC w4 = add(w0, w2);                                            \
323     const T_VEC w5 = sub(w0, w2);                                            \
324     const T_VEC w7 = add(i4, i20);                                           \
325     const T_VEC w8 = sub(i4, i20);                                           \
326     const T_VEC w9 = add(i12, i28);                                          \
327     const T_VEC w10 = sub(i12, i28);                                         \
328     const T_VEC w11 = add(w7, w9);                                           \
329     const T_VEC w12 = sub(w7, w9);                                           \
330     const T_VEC w14 = add(w4, w11);                                          \
331     const T_VEC w15 = sub(w4, w11);                                          \
332     const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),             \
333                            sub(sub(kWeight0, w3),                            \
334                                mul(kWeight2, add(w10, w8))) };               \
335     const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),             \
336                            sub(w3, mul(kWeight2, add(w10, w8))) };           \
337     const T_VEC w19 = add(i2, i18);                                          \
338     const T_VEC w20 = sub(i2, i18);                                          \
339     const T_VEC w21 = add(i10, i26);                                         \
340     const T_VEC w22 = sub(i10, i26);                                         \
341     const T_VEC w23 = add(w19, w21);                                         \
342     const T_VEC w24 = sub(w19, w21);                                         \
343     const T_VEC w26 = add(i6, i22);                                          \
344     const T_VEC w27 = sub(i6, i22);                                          \
345     const T_VEC w28 = add(i14, i30);                                         \
346     const T_VEC w29 = sub(i14, i30);                                         \
347     const T_VEC w30 = add(w26, w28);                                         \
348     const T_VEC w31 = sub(w26, w28);                                         \
349     const T_VEC w33 = add(w23, w30);                                         \
350     const T_VEC w34 = sub(w23, w30);                                         \
351     const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),           \
352                            sub(sub(kWeight0, w22),                           \
353                                mul(kWeight2, add(w29, w27))) };              \
354     const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),           \
355                            sub(w22, mul(kWeight2, add(w29, w27))) };         \
356     const T_VEC w38 = add(w14, w33);                                         \
357     const T_VEC w39 = sub(w14, w33);                                         \
358     const T_VEC w40[2] = {                                                   \
359       add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))),        \
360       add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))         \
361     };                                                                       \
362     const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))),            \
363                            sub(sub(kWeight0, w12),                           \
364                                mul(kWeight2, add(w31, w24))) };              \
365     const T_VEC w42[2] = {                                                   \
366       add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))),        \
367       add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))         \
368     };                                                                       \
369     const T_VEC w44[2] = {                                                   \
370       add(w18[0],                                                            \
371           sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
372       sub(sub(kWeight0, w18[1]),                                             \
373           sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))                 \
374     };                                                                       \
375     const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))),            \
376                            sub(w12, mul(kWeight2, add(w31, w24))) };         \
377     const T_VEC w46[2] = {                                                   \
378       add(w16[0],                                                            \
379           sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
380       sub(sub(kWeight0, w16[1]),                                             \
381           sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))                 \
382     };                                                                       \
383     const T_VEC w47 = add(i1, i17);                                          \
384     const T_VEC w48 = sub(i1, i17);                                          \
385     const T_VEC w49 = add(i9, i25);                                          \
386     const T_VEC w50 = sub(i9, i25);                                          \
387     const T_VEC w51 = add(w47, w49);                                         \
388     const T_VEC w52 = sub(w47, w49);                                         \
389     const T_VEC w54 = add(i5, i21);                                          \
390     const T_VEC w55 = sub(i5, i21);                                          \
391     const T_VEC w56 = add(i13, i29);                                         \
392     const T_VEC w57 = sub(i13, i29);                                         \
393     const T_VEC w58 = add(w54, w56);                                         \
394     const T_VEC w59 = sub(w54, w56);                                         \
395     const T_VEC w61 = add(w51, w58);                                         \
396     const T_VEC w62 = sub(w51, w58);                                         \
397     const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))),           \
398                            sub(sub(kWeight0, w50),                           \
399                                mul(kWeight2, add(w57, w55))) };              \
400     const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))),           \
401                            sub(w50, mul(kWeight2, add(w57, w55))) };         \
402     const T_VEC w66 = add(i3, i19);                                          \
403     const T_VEC w67 = sub(i3, i19);                                          \
404     const T_VEC w68 = add(i11, i27);                                         \
405     const T_VEC w69 = sub(i11, i27);                                         \
406     const T_VEC w70 = add(w66, w68);                                         \
407     const T_VEC w71 = sub(w66, w68);                                         \
408     const T_VEC w73 = add(i7, i23);                                          \
409     const T_VEC w74 = sub(i7, i23);                                          \
410     const T_VEC w75 = add(i15, i31);                                         \
411     const T_VEC w76 = sub(i15, i31);                                         \
412     const T_VEC w77 = add(w73, w75);                                         \
413     const T_VEC w78 = sub(w73, w75);                                         \
414     const T_VEC w80 = add(w70, w77);                                         \
415     const T_VEC w81 = sub(w70, w77);                                         \
416     const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))),           \
417                            sub(sub(kWeight0, w69),                           \
418                                mul(kWeight2, add(w76, w74))) };              \
419     const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))),           \
420                            sub(w69, mul(kWeight2, add(w76, w74))) };         \
421     const T_VEC w85 = add(w61, w80);                                         \
422     const T_VEC w86 = sub(w61, w80);                                         \
423     const T_VEC w87[2] = {                                                   \
424       add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))),        \
425       add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0])))         \
426     };                                                                       \
427     const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))),           \
428                            sub(sub(kWeight0, w59),                           \
429                                mul(kWeight2, add(w78, w71))) };              \
430     const T_VEC w89[2] = {                                                   \
431       add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))),        \
432       add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0])))         \
433     };                                                                       \
434     const T_VEC w91[2] = {                                                   \
435       add(w65[0],                                                            \
436           sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
437       sub(sub(kWeight0, w65[1]),                                             \
438           sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1])))                 \
439     };                                                                       \
440     const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))),           \
441                            sub(w59, mul(kWeight2, add(w78, w71))) };         \
442     const T_VEC w93[2] = {                                                   \
443       add(w63[0],                                                            \
444           sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
445       sub(sub(kWeight0, w63[1]),                                             \
446           sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1])))                 \
447     };                                                                       \
448     store(output + 0 * stride, add(w38, w85));                               \
449     store(output + 1 * stride,                                               \
450           add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1]))));   \
451     store(output + 2 * stride,                                               \
452           add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1]))));   \
453     store(output + 3 * stride,                                               \
454           add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1]))));   \
455     store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81))));      \
456     store(output + 5 * stride,                                               \
457           add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1]))));   \
458     store(output + 6 * stride,                                               \
459           add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1]))));   \
460     store(output + 7 * stride,                                               \
461           add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1]))));   \
462     store(output + 8 * stride, w39);                                         \
463     store(output + 9 * stride,                                               \
464           add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])),              \
465                           mul(kWeight5, w93[1]))));                          \
466     store(output + 10 * stride,                                              \
467           add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])),              \
468                           mul(kWeight3, w92[1]))));                          \
469     store(output + 11 * stride,                                              \
470           add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])),              \
471                           mul(kWeight7, w91[1]))));                          \
472     store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81))));     \
473     store(output + 13 * stride,                                              \
474           add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])),              \
475                           mul(kWeight8, w89[1]))));                          \
476     store(output + 14 * stride,                                              \
477           add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])),              \
478                           mul(kWeight4, w88[1]))));                          \
479     store(output + 15 * stride,                                              \
480           add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])),              \
481                           mul(kWeight6, w87[1]))));                          \
482     store(output + 16 * stride, sub(w38, w85));                              \
483     store(output + 17 * stride,                                              \
484           add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0]))));   \
485     store(output + 18 * stride,                                              \
486           add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0]))));   \
487     store(output + 19 * stride,                                              \
488           add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0]))));   \
489     store(output + 20 * stride,                                              \
490           sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62))));            \
491     store(output + 21 * stride,                                              \
492           add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0]))));   \
493     store(output + 22 * stride,                                              \
494           add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0]))));   \
495     store(output + 23 * stride,                                              \
496           add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0]))));   \
497     store(output + 24 * stride, sub(kWeight0, w86));                         \
498     store(output + 25 * stride,                                              \
499           sub(sub(kWeight0, w46[1]),                                         \
500               sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1]))));           \
501     store(output + 26 * stride,                                              \
502           sub(sub(kWeight0, w45[1]),                                         \
503               sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1]))));           \
504     store(output + 27 * stride,                                              \
505           sub(sub(kWeight0, w44[1]),                                         \
506               sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1]))));           \
507     store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62))));     \
508     store(output + 29 * stride,                                              \
509           sub(sub(kWeight0, w42[1]),                                         \
510               sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1]))));           \
511     store(output + 30 * stride,                                              \
512           sub(sub(kWeight0, w41[1]),                                         \
513               sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1]))));           \
514     store(output + 31 * stride,                                              \
515           sub(sub(kWeight0, w40[1]),                                         \
516               sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1]))));           \
517   }
518 
/*!\brief Generates the 2-point inverse real FFT, aom_ifft1d_2_<suffix>.
 *
 * The generated function reads input rows 0 and 1 and writes their sum to
 * output row 0 and their difference (row 0 minus row 1) to output row 1.
 * Row k of a buffer is addressed as buffer + k * stride. No scaling is
 * applied.
 *
 * Unlike the larger GEN_IFFT_* generators, this one takes no add/sub
 * operations: it applies the built-in + and - operators directly to T_VEC
 * values, so T_VEC must be a type on which those operators are valid (for
 * example a scalar floating-point type).
 *
 * \param ret     Return type (and any specifiers) of the generated function.
 *                The generated body contains no return statement.
 * \param suffix  Appended to the generated function name.
 * \param T       Element type of the input and output buffers.
 * \param T_VEC   Type produced by load and accepted by store.
 * \param load    Applied as load(ptr) to a const T pointer to read one row.
 * \param store   Invoked as store(ptr, value) to write one row.
 */
#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store)               \
  ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
    const T_VEC i0 = load(input + 0 * stride);                       \
    const T_VEC i1 = load(input + 1 * stride);                       \
    store(output + 0 * stride, i0 + i1);                             \
    store(output + 1 * stride, i0 - i1);                             \
  }
526 
/*!\brief Generates the 4-point inverse real FFT, aom_ifft1d_4_<suffix>.
 *
 * With the packing described at the top of this header, the four input rows
 * are [r_0, r_1, r_2, i_1]. The generated function stores:
 *   output row 0 = (r_0 + r_2) + 2 * r_1
 *   output row 1 = (r_0 - r_2) - 2 * i_1
 *   output row 2 = (r_0 + r_2) - 2 * r_1
 *   output row 3 = (r_0 - r_2) + 2 * i_1
 * No 1/n scaling is applied.
 *
 * Row k of a buffer is addressed as buffer + k * stride. Doubling is written
 * as add(x, x) and negation as sub(kWeight0, x) with kWeight0 = constant(0.0f),
 * so the only operations needed on T_VEC are load, store, constant, add and
 * sub. The lanes w4[1] and w5[0] are computed but never read.
 *
 * \param ret       Return type (and any specifiers) of the generated
 *                  function. The generated body contains no return statement.
 * \param suffix    Appended to the generated function name.
 * \param T         Element type of the input and output buffers.
 * \param T_VEC     Type produced by load/constant/add/sub, accepted by store.
 * \param load      Applied as load(ptr) to a const T pointer to read one row.
 * \param store     Invoked as store(ptr, value) to write one row.
 * \param constant  Turns a float literal into a T_VEC; used as constant(0.0f).
 * \param add       Invoked as add(a, b) on two T_VEC values.
 * \param sub       Invoked as sub(a, b) on two T_VEC values.
 */
#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
  ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) {       \
    const T_VEC kWeight0 = constant(0.0f);                                 \
    const T_VEC i0 = load(input + 0 * stride);                             \
    const T_VEC i1 = load(input + 1 * stride);                             \
    const T_VEC i2 = load(input + 2 * stride);                             \
    const T_VEC i3 = load(input + 3 * stride);                             \
    const T_VEC w2 = add(i0, i2);                                          \
    const T_VEC w3 = sub(i0, i2);                                          \
    const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) };                      \
    const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) };       \
    store(output + 0 * stride, add(w2, w4[0]));                            \
    store(output + 1 * stride, add(w3, w5[1]));                            \
    store(output + 2 * stride, sub(w2, w4[0]));                            \
    store(output + 3 * stride, sub(w3, w5[1]));                            \
  }
543 
/*!\brief Generates the 8-point inverse real FFT, aom_ifft1d_8_<suffix>.
 *
 * With the packing described at the top of this header, the eight input rows
 * are [r_0, r_1, r_2, r_3, r_4, i_1, i_2, i_3]. The generated function loads
 * all eight rows, combines them with a fixed sequence of add/sub/mul
 * operations and stores eight output rows. Row k of a buffer is addressed as
 * buffer + k * stride.
 *
 * No 1/n scaling is applied: an input in which only r_0 is non-zero produces
 * r_0 in every output row.
 *
 * kWeight0 is constant(0.0f) and is only used to negate, as
 * sub(kWeight0, x). kWeight2 is constant(0.707107f), approximately
 * 1/sqrt(2). Several lanes (for example w10[1]..w13[1], w18[1] and w19[0])
 * are computed but never reach a store.
 *
 * \param ret       Return type (and any specifiers) of the generated
 *                  function. The generated body contains no return statement.
 * \param suffix    Appended to the generated function name.
 * \param T         Element type of the input and output buffers.
 * \param T_VEC     Type produced by load/constant/add/sub/mul and accepted
 *                  by store.
 * \param load      Applied as load(ptr) to a const T pointer to read one row.
 * \param store     Invoked as store(ptr, value) to write one row.
 * \param constant  Turns a float literal into a T_VEC.
 * \param add       Invoked as add(a, b) on two T_VEC values.
 * \param sub       Invoked as sub(a, b) on two T_VEC values.
 * \param mul       Invoked as mul(a, b) on two T_VEC values.
 */
#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
                   mul)                                                    \
  ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) {       \
    const T_VEC kWeight0 = constant(0.0f);                                 \
    const T_VEC kWeight2 = constant(0.707107f);                            \
    /* Load the eight packed input rows. */                                \
    const T_VEC i0 = load(input + 0 * stride);                             \
    const T_VEC i1 = load(input + 1 * stride);                             \
    const T_VEC i2 = load(input + 2 * stride);                             \
    const T_VEC i3 = load(input + 3 * stride);                             \
    const T_VEC i4 = load(input + 4 * stride);                             \
    const T_VEC i5 = load(input + 5 * stride);                             \
    const T_VEC i6 = load(input + 6 * stride);                             \
    const T_VEC i7 = load(input + 7 * stride);                             \
    /* Terms built from rows 0, 2, 4 and 6. */                             \
    const T_VEC w6 = add(i0, i4);                                          \
    const T_VEC w7 = sub(i0, i4);                                          \
    const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) };                      \
    const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) };       \
    const T_VEC w10[2] = { add(w6, w8[0]), w8[1] };                        \
    const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) };         \
    const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) };         \
    const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] };                        \
    /* Terms built from rows 1, 3, 5 and 7. */                             \
    const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) };                     \
    const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) };      \
    const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) };                     \
    const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) };      \
    const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) };     \
    const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) };     \
    const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) };     \
    const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) };     \
    /* Combine both groups and store the eight output rows. */             \
    store(output + 0 * stride, add(w10[0], w18[0]));                       \
    store(output + 1 * stride,                                             \
          add(w12[0], mul(kWeight2, add(w20[0], w20[1]))));                \
    store(output + 2 * stride, add(w11[0], w19[1]));                       \
    store(output + 3 * stride,                                             \
          sub(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
    store(output + 4 * stride, sub(w10[0], w18[0]));                       \
    store(output + 5 * stride,                                             \
          add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])),            \
                          mul(kWeight2, w20[1]))));                        \
    store(output + 6 * stride, sub(w11[0], w19[1]));                       \
    store(output + 7 * stride,                                             \
          add(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
  }
587 
/*!\brief Generates the 16-point inverse real FFT, aom_ifft1d_16_<suffix>.
 *
 * With the packing described at the top of this header, input rows 0..8 hold
 * r_0..r_8 and input rows 9..15 hold i_1..i_7. The generated function loads
 * all sixteen rows, combines them with a fixed sequence of add/sub/mul
 * operations and stores sixteen output rows. Row k of a buffer is addressed
 * as buffer + k * stride.
 *
 * No 1/n scaling is applied: an input in which only r_0 is non-zero produces
 * r_0 in every output row.
 *
 * Constants: kWeight0 is constant(0.0f) and is only used to negate, as
 * sub(kWeight0, x). kWeight2 = 0.707107f (approximately 1/sqrt(2)),
 * kWeight3 = 0.92388f (approximately cos(pi/8)) and kWeight4 = 0.382683f
 * (approximately sin(pi/8)). The second lanes w30[1]..w37[1] are computed
 * but never read by any store.
 *
 * \param ret       Return type (and any specifiers) of the generated
 *                  function. The generated body contains no return statement.
 * \param suffix    Appended to the generated function name.
 * \param T         Element type of the input and output buffers.
 * \param T_VEC     Type produced by load/constant/add/sub/mul and accepted
 *                  by store.
 * \param load      Applied as load(ptr) to a const T pointer to read one row.
 * \param store     Invoked as store(ptr, value) to write one row.
 * \param constant  Turns a float literal into a T_VEC.
 * \param add       Invoked as add(a, b) on two T_VEC values.
 * \param sub       Invoked as sub(a, b) on two T_VEC values.
 * \param mul       Invoked as mul(a, b) on two T_VEC values.
 */
#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub,   \
                    mul)                                                      \
  ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) {         \
    const T_VEC kWeight0 = constant(0.0f);                                    \
    const T_VEC kWeight2 = constant(0.707107f);                               \
    const T_VEC kWeight3 = constant(0.92388f);                                \
    const T_VEC kWeight4 = constant(0.382683f);                               \
    /* Load the sixteen packed input rows. */                                 \
    const T_VEC i0 = load(input + 0 * stride);                                \
    const T_VEC i1 = load(input + 1 * stride);                                \
    const T_VEC i2 = load(input + 2 * stride);                                \
    const T_VEC i3 = load(input + 3 * stride);                                \
    const T_VEC i4 = load(input + 4 * stride);                                \
    const T_VEC i5 = load(input + 5 * stride);                                \
    const T_VEC i6 = load(input + 6 * stride);                                \
    const T_VEC i7 = load(input + 7 * stride);                                \
    const T_VEC i8 = load(input + 8 * stride);                                \
    const T_VEC i9 = load(input + 9 * stride);                                \
    const T_VEC i10 = load(input + 10 * stride);                              \
    const T_VEC i11 = load(input + 11 * stride);                              \
    const T_VEC i12 = load(input + 12 * stride);                              \
    const T_VEC i13 = load(input + 13 * stride);                              \
    const T_VEC i14 = load(input + 14 * stride);                              \
    const T_VEC i15 = load(input + 15 * stride);                              \
    /* w14..w37: terms built from the even-numbered input rows. */            \
    const T_VEC w14 = add(i0, i8);                                            \
    const T_VEC w15 = sub(i0, i8);                                            \
    const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) };                      \
    const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) };       \
    const T_VEC w18[2] = { add(w14, w16[0]), w16[1] };                        \
    const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) };         \
    const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) };         \
    const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] };                        \
    const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) };                      \
    const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) };       \
    const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) };                      \
    const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) };       \
    const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) };        \
    const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) };        \
    const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) };        \
    const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) };        \
    const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) };        \
    const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) };        \
    const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))),   \
                           add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
    const T_VEC w33[2] = { add(w20[0],                                        \
                               sub(sub(kWeight0, mul(kWeight2, w28[0])),      \
                                   mul(kWeight2, w28[1]))),                   \
                           add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
    const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) };        \
    const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) };        \
    const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))),   \
                           sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
    const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))),   \
                           add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
    /* w38..w61: terms built from the odd-numbered input rows. */             \
    const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) };                       \
    const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) };        \
    const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) };                      \
    const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) };       \
    const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) };        \
    const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) };        \
    const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) };        \
    const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) };        \
    const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) };                      \
    const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) };       \
    const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) };                       \
    const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) };        \
    const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) };        \
    const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) };        \
    const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) };        \
    const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) };        \
    const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) };        \
    const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) };        \
    const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))),   \
                           add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
    const T_VEC w57[2] = { add(w44[0],                                        \
                               sub(sub(kWeight0, mul(kWeight2, w52[0])),      \
                                   mul(kWeight2, w52[1]))),                   \
                           add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
    const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) };        \
    const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) };        \
    const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))),   \
                           sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
    const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))),   \
                           add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
    /* Combine both groups and store the sixteen output rows. */              \
    store(output + 0 * stride, add(w30[0], w54[0]));                          \
    store(output + 1 * stride,                                                \
          add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1]))));    \
    store(output + 2 * stride,                                                \
          add(w34[0], mul(kWeight2, add(w58[0], w58[1]))));                   \
    store(output + 3 * stride,                                                \
          add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1]))));    \
    store(output + 4 * stride, add(w31[0], w55[1]));                          \
    store(output + 5 * stride,                                                \
          sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1]))));    \
    store(output + 6 * stride,                                                \
          sub(w35[0], mul(kWeight2, sub(w59[0], w59[1]))));                   \
    store(output + 7 * stride,                                                \
          sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1]))));    \
    store(output + 8 * stride, sub(w30[0], w54[0]));                          \
    store(output + 9 * stride,                                                \
          add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])),               \
                          mul(kWeight4, w56[1]))));                           \
    store(output + 10 * stride,                                               \
          add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])),               \
                          mul(kWeight2, w58[1]))));                           \
    store(output + 11 * stride,                                               \
          add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])),               \
                          mul(kWeight3, w60[1]))));                           \
    store(output + 12 * stride, sub(w31[0], w55[1]));                         \
    store(output + 13 * stride,                                               \
          add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1]))));    \
    store(output + 14 * stride,                                               \
          add(w35[0], mul(kWeight2, sub(w59[0], w59[1]))));                   \
    store(output + 15 * stride,                                               \
          add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1]))));    \
  }
703 #define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub,    \
704                     mul)                                                       \
705   ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) {          \
706     const T_VEC kWeight0 = constant(0.0f);                                     \
707     const T_VEC kWeight2 = constant(0.707107f);                                \
708     const T_VEC kWeight3 = constant(0.92388f);                                 \
709     const T_VEC kWeight4 = constant(0.382683f);                                \
710     const T_VEC kWeight5 = constant(0.980785f);                                \
711     const T_VEC kWeight6 = constant(0.19509f);                                 \
712     const T_VEC kWeight7 = constant(0.83147f);                                 \
713     const T_VEC kWeight8 = constant(0.55557f);                                 \
714     const T_VEC i0 = load(input + 0 * stride);                                 \
715     const T_VEC i1 = load(input + 1 * stride);                                 \
716     const T_VEC i2 = load(input + 2 * stride);                                 \
717     const T_VEC i3 = load(input + 3 * stride);                                 \
718     const T_VEC i4 = load(input + 4 * stride);                                 \
719     const T_VEC i5 = load(input + 5 * stride);                                 \
720     const T_VEC i6 = load(input + 6 * stride);                                 \
721     const T_VEC i7 = load(input + 7 * stride);                                 \
722     const T_VEC i8 = load(input + 8 * stride);                                 \
723     const T_VEC i9 = load(input + 9 * stride);                                 \
724     const T_VEC i10 = load(input + 10 * stride);                               \
725     const T_VEC i11 = load(input + 11 * stride);                               \
726     const T_VEC i12 = load(input + 12 * stride);                               \
727     const T_VEC i13 = load(input + 13 * stride);                               \
728     const T_VEC i14 = load(input + 14 * stride);                               \
729     const T_VEC i15 = load(input + 15 * stride);                               \
730     const T_VEC i16 = load(input + 16 * stride);                               \
731     const T_VEC i17 = load(input + 17 * stride);                               \
732     const T_VEC i18 = load(input + 18 * stride);                               \
733     const T_VEC i19 = load(input + 19 * stride);                               \
734     const T_VEC i20 = load(input + 20 * stride);                               \
735     const T_VEC i21 = load(input + 21 * stride);                               \
736     const T_VEC i22 = load(input + 22 * stride);                               \
737     const T_VEC i23 = load(input + 23 * stride);                               \
738     const T_VEC i24 = load(input + 24 * stride);                               \
739     const T_VEC i25 = load(input + 25 * stride);                               \
740     const T_VEC i26 = load(input + 26 * stride);                               \
741     const T_VEC i27 = load(input + 27 * stride);                               \
742     const T_VEC i28 = load(input + 28 * stride);                               \
743     const T_VEC i29 = load(input + 29 * stride);                               \
744     const T_VEC i30 = load(input + 30 * stride);                               \
745     const T_VEC i31 = load(input + 31 * stride);                               \
746     const T_VEC w30 = add(i0, i16);                                            \
747     const T_VEC w31 = sub(i0, i16);                                            \
748     const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) };                       \
749     const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) };        \
750     const T_VEC w34[2] = { add(w30, w32[0]), w32[1] };                         \
751     const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) };          \
752     const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) };          \
753     const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] };                         \
754     const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) };                      \
755     const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) };       \
756     const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) };                      \
757     const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) };       \
758     const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) };         \
759     const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) };         \
760     const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) };         \
761     const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) };         \
762     const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) };         \
763     const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) };         \
764     const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))),    \
765                            add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) };  \
766     const T_VEC w49[2] = { add(w36[0],                                         \
767                                sub(sub(kWeight0, mul(kWeight2, w44[0])),       \
768                                    mul(kWeight2, w44[1]))),                    \
769                            add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) };  \
770     const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) };         \
771     const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) };         \
772     const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))),    \
773                            sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) };  \
774     const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))),    \
775                            add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) };  \
776     const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) };                      \
777     const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) };       \
778     const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) };                      \
779     const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) };       \
780     const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) };         \
781     const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) };         \
782     const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) };         \
783     const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) };         \
784     const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) };                      \
785     const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) };       \
786     const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) };                      \
787     const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) };       \
788     const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) };         \
789     const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) };         \
790     const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) };         \
791     const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) };         \
792     const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) };         \
793     const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) };         \
794     const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))),    \
795                            add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) };  \
796     const T_VEC w73[2] = { add(w60[0],                                         \
797                                sub(sub(kWeight0, mul(kWeight2, w68[0])),       \
798                                    mul(kWeight2, w68[1]))),                    \
799                            add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) };  \
800     const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) };         \
801     const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) };         \
802     const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))),    \
803                            sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) };  \
804     const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))),    \
805                            add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) };  \
806     const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) };         \
807     const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) };         \
808     const T_VEC w80[2] = {                                                     \
809       add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))),          \
810       add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0])))           \
811     };                                                                         \
812     const T_VEC w81[2] = {                                                     \
813       add(w48[0],                                                              \
814           sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))),   \
815       add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1])))           \
816     };                                                                         \
817     const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))),    \
818                            add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) };  \
819     const T_VEC w83[2] = { add(w50[0],                                         \
820                                sub(sub(kWeight0, mul(kWeight2, w74[0])),       \
821                                    mul(kWeight2, w74[1]))),                    \
822                            add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) };  \
823     const T_VEC w84[2] = {                                                     \
824       add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))),          \
825       add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0])))           \
826     };                                                                         \
827     const T_VEC w85[2] = {                                                     \
828       add(w52[0],                                                              \
829           sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))),   \
830       add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1])))           \
831     };                                                                         \
832     const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) };         \
833     const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) };         \
834     const T_VEC w88[2] = {                                                     \
835       sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))),          \
836       add(w49[1],                                                              \
837           sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0])))    \
838     };                                                                         \
839     const T_VEC w89[2] = {                                                     \
840       add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))),          \
841       add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0])))           \
842     };                                                                         \
843     const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))),    \
844                            sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) };  \
845     const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))),    \
846                            add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) };  \
847     const T_VEC w92[2] = {                                                     \
848       sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))),          \
849       add(w53[1],                                                              \
850           sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0])))    \
851     };                                                                         \
852     const T_VEC w93[2] = {                                                     \
853       add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))),          \
854       add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0])))           \
855     };                                                                         \
856     const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) };                      \
857     const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) };       \
858     const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) };                       \
859     const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) };        \
860     const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) };         \
861     const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) };         \
862     const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) };        \
863     const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) };        \
864     const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) };                     \
865     const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) };      \
866     const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) };                     \
867     const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) };      \
868     const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) };    \
869     const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) };    \
870     const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) };    \
871     const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) };    \
872     const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) };      \
873     const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) };      \
874     const T_VEC w112[2] = {                                                    \
875       add(w100[0], mul(kWeight2, add(w108[0], w108[1]))),                      \
876       add(w100[1], mul(kWeight2, sub(w108[1], w108[0])))                       \
877     };                                                                         \
878     const T_VEC w113[2] = {                                                    \
879       add(w100[0],                                                             \
880           sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
881       add(w100[1], mul(kWeight2, sub(w108[0], w108[1])))                       \
882     };                                                                         \
883     const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) };      \
884     const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) };      \
885     const T_VEC w116[2] = {                                                    \
886       sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))),                      \
887       sub(w101[1], mul(kWeight2, add(w109[1], w109[0])))                       \
888     };                                                                         \
889     const T_VEC w117[2] = {                                                    \
890       add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))),                      \
891       add(w101[1], mul(kWeight2, add(w109[1], w109[0])))                       \
892     };                                                                         \
893     const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) };                     \
894     const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) };      \
895     const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) };                     \
896     const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) };      \
897     const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) };    \
898     const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) };    \
899     const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) };    \
900     const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) };    \
901     const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) };                      \
902     const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) };       \
903     const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) };                     \
904     const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) };      \
905     const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) };    \
906     const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) };    \
907     const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) };    \
908     const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) };    \
909     const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) };    \
910     const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) };    \
911     const T_VEC w136[2] = {                                                    \
912       add(w124[0], mul(kWeight2, add(w132[0], w132[1]))),                      \
913       add(w124[1], mul(kWeight2, sub(w132[1], w132[0])))                       \
914     };                                                                         \
915     const T_VEC w137[2] = {                                                    \
916       add(w124[0],                                                             \
917           sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
918       add(w124[1], mul(kWeight2, sub(w132[0], w132[1])))                       \
919     };                                                                         \
920     const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) };    \
921     const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) };    \
922     const T_VEC w140[2] = {                                                    \
923       sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))),                      \
924       sub(w125[1], mul(kWeight2, add(w133[1], w133[0])))                       \
925     };                                                                         \
926     const T_VEC w141[2] = {                                                    \
927       add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))),                      \
928       add(w125[1], mul(kWeight2, add(w133[1], w133[0])))                       \
929     };                                                                         \
930     const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) };    \
931     const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) };    \
932     const T_VEC w144[2] = {                                                    \
933       add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))),       \
934       add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0])))        \
935     };                                                                         \
936     const T_VEC w145[2] = {                                                    \
937       add(w112[0],                                                             \
938           sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
939       add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1])))        \
940     };                                                                         \
941     const T_VEC w146[2] = {                                                    \
942       add(w114[0], mul(kWeight2, add(w138[0], w138[1]))),                      \
943       add(w114[1], mul(kWeight2, sub(w138[1], w138[0])))                       \
944     };                                                                         \
945     const T_VEC w147[2] = {                                                    \
946       add(w114[0],                                                             \
947           sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
948       add(w114[1], mul(kWeight2, sub(w138[0], w138[1])))                       \
949     };                                                                         \
950     const T_VEC w148[2] = {                                                    \
951       add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))),       \
952       add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0])))        \
953     };                                                                         \
954     const T_VEC w149[2] = {                                                    \
955       add(w116[0],                                                             \
956           sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
957       add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1])))        \
958     };                                                                         \
959     const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) };    \
960     const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) };    \
961     const T_VEC w152[2] = {                                                    \
962       sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))),       \
963       add(w113[1],                                                             \
964           sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0])))  \
965     };                                                                         \
966     const T_VEC w153[2] = {                                                    \
967       add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))),       \
968       add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0])))        \
969     };                                                                         \
970     const T_VEC w154[2] = {                                                    \
971       sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))),                      \
972       sub(w115[1], mul(kWeight2, add(w139[1], w139[0])))                       \
973     };                                                                         \
974     const T_VEC w155[2] = {                                                    \
975       add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))),                      \
976       add(w115[1], mul(kWeight2, add(w139[1], w139[0])))                       \
977     };                                                                         \
978     const T_VEC w156[2] = {                                                    \
979       sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))),       \
980       add(w117[1],                                                             \
981           sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0])))  \
982     };                                                                         \
983     const T_VEC w157[2] = {                                                    \
984       add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))),       \
985       add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0])))        \
986     };                                                                         \
987     store(output + 0 * stride, add(w78[0], w142[0]));                          \
988     store(output + 1 * stride,                                                 \
989           add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1]))));   \
990     store(output + 2 * stride,                                                 \
991           add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1]))));   \
992     store(output + 3 * stride,                                                 \
993           add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1]))));   \
994     store(output + 4 * stride,                                                 \
995           add(w86[0], mul(kWeight2, add(w150[0], w150[1]))));                  \
996     store(output + 5 * stride,                                                 \
997           add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1]))));   \
998     store(output + 6 * stride,                                                 \
999           add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1]))));   \
1000     store(output + 7 * stride,                                                 \
1001           add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1]))));   \
1002     store(output + 8 * stride, add(w79[0], w143[1]));                          \
1003     store(output + 9 * stride,                                                 \
1004           sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1]))));   \
1005     store(output + 10 * stride,                                                \
1006           sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1]))));   \
1007     store(output + 11 * stride,                                                \
1008           sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1]))));   \
1009     store(output + 12 * stride,                                                \
1010           sub(w87[0], mul(kWeight2, sub(w151[0], w151[1]))));                  \
1011     store(output + 13 * stride,                                                \
1012           sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1]))));   \
1013     store(output + 14 * stride,                                                \
1014           sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1]))));   \
1015     store(output + 15 * stride,                                                \
1016           sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1]))));   \
1017     store(output + 16 * stride, sub(w78[0], w142[0]));                         \
1018     store(output + 17 * stride,                                                \
1019           add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])),               \
1020                           mul(kWeight6, w144[1]))));                           \
1021     store(output + 18 * stride,                                                \
1022           add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])),               \
1023                           mul(kWeight4, w146[1]))));                           \
1024     store(output + 19 * stride,                                                \
1025           add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])),               \
1026                           mul(kWeight8, w148[1]))));                           \
1027     store(output + 20 * stride,                                                \
1028           add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])),               \
1029                           mul(kWeight2, w150[1]))));                           \
1030     store(output + 21 * stride,                                                \
1031           add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])),               \
1032                           mul(kWeight7, w152[1]))));                           \
1033     store(output + 22 * stride,                                                \
1034           add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])),               \
1035                           mul(kWeight3, w154[1]))));                           \
1036     store(output + 23 * stride,                                                \
1037           add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])),               \
1038                           mul(kWeight5, w156[1]))));                           \
1039     store(output + 24 * stride, sub(w79[0], w143[1]));                         \
1040     store(output + 25 * stride,                                                \
1041           add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1]))));   \
1042     store(output + 26 * stride,                                                \
1043           add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1]))));   \
1044     store(output + 27 * stride,                                                \
1045           add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1]))));   \
1046     store(output + 28 * stride,                                                \
1047           add(w87[0], mul(kWeight2, sub(w151[0], w151[1]))));                  \
1048     store(output + 29 * stride,                                                \
1049           add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1]))));   \
1050     store(output + 30 * stride,                                                \
1051           add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1]))));   \
1052     store(output + 31 * stride,                                                \
1053           add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1]))));   \
1054   }
1055 
1056 #endif  // AOM_AOM_DSP_FFT_COMMON_H_
1057