/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_FFT_COMMON_H_
#define AOM_AOM_DSP_FFT_COMMON_H_

#ifdef __cplusplus
extern "C" {
#endif

/*!\brief A function pointer for computing 1d fft and ifft.
 *
 * The function will point to an implementation for a specific transform size,
 * and may perform the transforms using vectorized instructions.
 *
 * For a non-vectorized forward transform of size n, the input and output
 * buffers will be size n. The output takes advantage of conjugate symmetry and
 * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
 * (r_{j}, i_{j}) is the complex output for index j.
 *
 * An inverse transform will assume that the complex "input" is packed
 * similarly. Its output will be real.
 *
 * Non-vectorized transforms (e.g., on a single row) would use a stride = 1.
 *
 * Vectorized implementations are parallelized along the columns so that the fft
 * can be performed on multiple columns at a time. In such cases the data block
 * for input and output is typically square (n x n) and the stride will
 * correspond to the spacing between rows. At minimum, the input size must be
 * n x simd_vector_length.
 *
 * \param[in]  input   Input buffer. See above for size restrictions.
 * \param[out] output  Output buffer. See above for size restrictions.
 * \param[in]  stride  The spacing in number of elements between rows
 *                     (or elements)
 */
typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
                                  int stride);

// Declare the non-vectorized forward and inverse transforms. Some of the
// forward ones are used in some of the vectorized implementations.
void aom_fft1d_2_float(const float *input, float *output, int stride);
void aom_fft1d_4_float(const float *input, float *output, int stride);
void aom_fft1d_8_float(const float *input, float *output, int stride);
void aom_fft1d_16_float(const float *input, float *output, int stride);
void aom_fft1d_32_float(const float *input, float *output, int stride);
void aom_ifft1d_2_float(const float *input, float *output, int stride);
void aom_ifft1d_4_float(const float *input, float *output, int stride);
void aom_ifft1d_8_float(const float *input, float *output, int stride);
void aom_ifft1d_16_float(const float *input, float *output, int stride);
void aom_ifft1d_32_float(const float *input, float *output, int stride);

/*!\brief Function pointer for transposing a matrix of floats.
 *
 * \param[in]  input   Input buffer (size n x n)
 * \param[out] output  Output buffer (size n x n)
 * \param[in]  n       Extent of one dimension of the square matrix.
 */
typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
                                         int n);

/*!\brief Function pointer for re-arranging intermediate 2d transform results.
 *
 * After re-arrangement, the real and imaginary components will be packed
 * tightly next to each other.
 *
 * \param[in]  input   Input buffer (size n x n)
 * \param[out] output  Output buffer (size 2 x n x n)
 * \param[in]  n       Extent of one dimension of the square matrix.
 */
typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);

/*!\brief Performs a 2d fft with the given functions.
 *
 * This generator function allows for multiple different implementations of 2d
 * fft with different vector operations, without having to redefine the main
 * body multiple times.
 *
 * \param[in]  input     Input buffer to run the transform on (size n x n)
 * \param[out] temp      Working buffer for computing the transform (size n x n)
 * \param[out] output    Output buffer (size 2 x n x n)
 * \param[in]  n         Extent of one dimension of the square input (n x n)
 * \param[in]  tform     Forward transform function
 * \param[in]  transpose Transpose function (for n x n matrix)
 * \param[in]  unpack    Unpack function used to massage outputs to correct form
 * \param[in]  vec_size  Vector size (the transform is done vec_size units at
 *                       a time)
 */
void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
                    aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
                    aom_fft_unpack_func_t unpack, int vec_size);

/*!\brief Perform a 2d inverse fft with the given helper functions
 *
 * \param[in]  input      Input buffer to run the transform on (size 2 x n x n)
 * \param[out] temp       Working buffer for computations (size 2 x n x n)
 * \param[out] output     Output buffer (size n x n)
 * \param[in]  n          Extent of one dimension of the square output (n x n)
 * \param[in]  fft_single Forward transform function (non vectorized)
 * \param[in]  fft_multi  Forward transform function (vectorized)
 * \param[in]  ifft_multi Inverse transform function (vectorized)
 * \param[in]  transpose  Transpose function (for n x n matrix)
 * \param[in]  vec_size   Vector size (the transform is done vec_size
 *                        units at a time)
 */
void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
                     aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
                     aom_fft_1d_func_t ifft_multi,
                     aom_fft_transpose_func_t transpose, int vec_size);
#ifdef __cplusplus
}
#endif

// The macros below define 1D fft/ifft for different data types and for
// different simd vector intrinsic types.
// Parameters shared by the GEN_FFT_<n> / GEN_IFFT_<n> generator macros:
//   ret      - return type of the generated function.
//   suffix   - token pasted onto the generated function name
//              (aom_fft1d_<n>_<suffix> or aom_ifft1d_<n>_<suffix>).
//   T        - element type of the input and output buffers.
//   T_VEC    - type of the values returned by load() and passed to store().
//   load     - load(ptr): reads one T_VEC from ptr.
//   store    - store(ptr, value): writes one T_VEC to ptr.
//   constant - constant(x): builds a T_VEC from a float literal.
//   add, sub, mul - binary T_VEC operations used for the arithmetic.
//
// Every generated function reads n values at input + k * stride and writes n
// values at output + k * stride, for k = 0 .. n - 1.
//
// Constants used inside the bodies: kWeight0 is zero and only ever appears as
// the left operand of sub(), i.e. as a negation. The others approximate
// kWeight2 ~ cos(pi/4), kWeight3 ~ cos(pi/8), kWeight4 ~ sin(pi/8),
// kWeight5 ~ cos(pi/16), kWeight6 ~ sin(pi/16), kWeight7 ~ cos(3*pi/16),
// kWeight8 ~ sin(3*pi/16).
//
// NOTE(review): the bodies look machine-generated; keep the operation order
// as is, since reordering floating-point operations can change rounding.

// Size-2 forward transform: output[0] = i0 + i1, output[1] = i0 - i1.
// Unlike the larger sizes this macro takes no add/sub parameters and applies
// the native + and - operators to T_VEC, so T_VEC must support them.
#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store) \
  ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
    const T_VEC i0 = load(input + 0 * stride); \
    const T_VEC i1 = load(input + 1 * stride); \
    store(output + 0 * stride, i0 + i1); \
    store(output + 1 * stride, i0 - i1); \
  }

// Size-4 forward transform. Writes, in order: the sum of all inputs,
// (i0 - i2), (i0 + i2) - (i1 + i3), and -(i1 - i3). This matches the packed
// [r_0, r_1, r_2, i_1] layout documented for aom_fft_1d_func_t.
#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
  ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \
    const T_VEC kWeight0 = constant(0.0f); \
    const T_VEC i0 = load(input + 0 * stride); \
    const T_VEC i1 = load(input + 1 * stride); \
    const T_VEC i2 = load(input + 2 * stride); \
    const T_VEC i3 = load(input + 3 * stride); \
    const T_VEC w0 = add(i0, i2); \
    const T_VEC w1 = sub(i0, i2); \
    const T_VEC w2 = add(i1, i3); \
    const T_VEC w3 = sub(i1, i3); \
    store(output + 0 * stride, add(w0, w2)); \
    store(output + 1 * stride, w1); \
    store(output + 2 * stride, sub(w0, w2)); \
    store(output + 3 * stride, sub(kWeight0, w3)); \
  }

// Size-8 forward transform. Outputs 0..4 hold the real parts r_0..r_4 and
// outputs 5..7 the imaginary parts i_1..i_3 (packed layout documented for
// aom_fft_1d_func_t).
#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
  ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) { \
    const T_VEC kWeight0 = constant(0.0f); \
    const T_VEC kWeight2 = constant(0.707107f); \
    const T_VEC i0 = load(input + 0 * stride); \
    const T_VEC i1 = load(input + 1 * stride); \
    const T_VEC i2 = load(input + 2 * stride); \
    const T_VEC i3 = load(input + 3 * stride); \
    const T_VEC i4 = load(input + 4 * stride); \
    const T_VEC i5 = load(input + 5 * stride); \
    const T_VEC i6 = load(input + 6 * stride); \
    const T_VEC i7 = load(input + 7 * stride); \
    const T_VEC w0 = add(i0, i4); \
    const T_VEC w1 = sub(i0, i4); \
    const T_VEC w2 = add(i2, i6); \
    const T_VEC w3 = sub(i2, i6); \
    const T_VEC w4 = add(w0, w2); \
    const T_VEC w5 = sub(w0, w2); \
    const T_VEC w7 = add(i1, i5); \
    const T_VEC w8 = sub(i1, i5); \
    const T_VEC w9 = add(i3, i7); \
    const T_VEC w10 = sub(i3, i7); \
    const T_VEC w11 = add(w7, w9); \
    const T_VEC w12 = sub(w7, w9); \
    store(output + 0 * stride, add(w4, w11)); \
    store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \
    store(output + 2 * stride, w5); \
    store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \
    store(output + 4 * stride, sub(w4, w11)); \
    store(output + 5 * stride, \
          sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8)))); \
    store(output + 6 * stride, sub(kWeight0, w12)); \
    store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \
  }

// Size-16 forward transform. The two-element arrays (w16, w18, w35, w37)
// hold intermediate pairs; presumably element 0 is the real part and
// element 1 the imaginary part -- TODO confirm against the generator.
#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
                   mul) \
  ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) { \
    const T_VEC kWeight0 = constant(0.0f); \
    const T_VEC kWeight2 = constant(0.707107f); \
    const T_VEC kWeight3 = constant(0.92388f); \
    const T_VEC kWeight4 = constant(0.382683f); \
    const T_VEC i0 = load(input + 0 * stride); \
    const T_VEC i1 = load(input + 1 * stride); \
    const T_VEC i2 = load(input + 2 * stride); \
    const T_VEC i3 = load(input + 3 * stride); \
    const T_VEC i4 = load(input + 4 * stride); \
    const T_VEC i5 = load(input + 5 * stride); \
    const T_VEC i6 = load(input + 6 * stride); \
    const T_VEC i7 = load(input + 7 * stride); \
    const T_VEC i8 = load(input + 8 * stride); \
    const T_VEC i9 = load(input + 9 * stride); \
    const T_VEC i10 = load(input + 10 * stride); \
    const T_VEC i11 = load(input + 11 * stride); \
    const T_VEC i12 = load(input + 12 * stride); \
    const T_VEC i13 = load(input + 13 * stride); \
    const T_VEC i14 = load(input + 14 * stride); \
    const T_VEC i15 = load(input + 15 * stride); \
    const T_VEC w0 = add(i0, i8); \
    const T_VEC w1 = sub(i0, i8); \
    const T_VEC w2 = add(i4, i12); \
    const T_VEC w3 = sub(i4, i12); \
    const T_VEC w4 = add(w0, w2); \
    const T_VEC w5 = sub(w0, w2); \
    const T_VEC w7 = add(i2, i10); \
    const T_VEC w8 = sub(i2, i10); \
    const T_VEC w9 = add(i6, i14); \
    const T_VEC w10 = sub(i6, i14); \
    const T_VEC w11 = add(w7, w9); \
    const T_VEC w12 = sub(w7, w9); \
    const T_VEC w14 = add(w4, w11); \
    const T_VEC w15 = sub(w4, w11); \
    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
                           sub(sub(kWeight0, w3), \
                               mul(kWeight2, add(w10, w8))) }; \
    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
                           sub(w3, mul(kWeight2, add(w10, w8))) }; \
    const T_VEC w19 = add(i1, i9); \
    const T_VEC w20 = sub(i1, i9); \
    const T_VEC w21 = add(i5, i13); \
    const T_VEC w22 = sub(i5, i13); \
    const T_VEC w23 = add(w19, w21); \
    const T_VEC w24 = sub(w19, w21); \
    const T_VEC w26 = add(i3, i11); \
    const T_VEC w27 = sub(i3, i11); \
    const T_VEC w28 = add(i7, i15); \
    const T_VEC w29 = sub(i7, i15); \
    const T_VEC w30 = add(w26, w28); \
    const T_VEC w31 = sub(w26, w28); \
    const T_VEC w33 = add(w23, w30); \
    const T_VEC w34 = sub(w23, w30); \
    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
                           sub(sub(kWeight0, w22), \
                               mul(kWeight2, add(w29, w27))) }; \
    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
                           sub(w22, mul(kWeight2, add(w29, w27))) }; \
    store(output + 0 * stride, add(w14, w33)); \
    store(output + 1 * stride, \
          add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
    store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \
    store(output + 3 * stride, \
          add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
    store(output + 4 * stride, w15); \
    store(output + 5 * stride, \
          add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])), \
                          mul(kWeight3, w37[1])))); \
    store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \
    store(output + 7 * stride, \
          add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])), \
                          mul(kWeight4, w35[1])))); \
    store(output + 8 * stride, sub(w14, w33)); \
    store(output + 9 * stride, \
          add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
    store(output + 10 * stride, \
          sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24)))); \
    store(output + 11 * stride, \
          add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
    store(output + 12 * stride, sub(kWeight0, w34)); \
    store(output + 13 * stride, \
          sub(sub(kWeight0, w18[1]), \
              sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))); \
    store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \
    store(output + 15 * stride, \
          sub(sub(kWeight0, w16[1]), \
              sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))); \
  }

// Size-32 forward transform. The even-indexed inputs feed w0..w46 and the
// odd-indexed inputs feed w47..w93; the stores combine one value from each
// half using kWeight2..kWeight8.
#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
                   mul) \
  ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) { \
    const T_VEC kWeight0 = constant(0.0f); \
    const T_VEC kWeight2 = constant(0.707107f); \
    const T_VEC kWeight3 = constant(0.92388f); \
    const T_VEC kWeight4 = constant(0.382683f); \
    const T_VEC kWeight5 = constant(0.980785f); \
    const T_VEC kWeight6 = constant(0.19509f); \
    const T_VEC kWeight7 = constant(0.83147f); \
    const T_VEC kWeight8 = constant(0.55557f); \
    const T_VEC i0 = load(input + 0 * stride); \
    const T_VEC i1 = load(input + 1 * stride); \
    const T_VEC i2 = load(input + 2 * stride); \
    const T_VEC i3 = load(input + 3 * stride); \
    const T_VEC i4 = load(input + 4 * stride); \
    const T_VEC i5 = load(input + 5 * stride); \
    const T_VEC i6 = load(input + 6 * stride); \
    const T_VEC i7 = load(input + 7 * stride); \
    const T_VEC i8 = load(input + 8 * stride); \
    const T_VEC i9 = load(input + 9 * stride); \
    const T_VEC i10 = load(input + 10 * stride); \
    const T_VEC i11 = load(input + 11 * stride); \
    const T_VEC i12 = load(input + 12 * stride); \
    const T_VEC i13 = load(input + 13 * stride); \
    const T_VEC i14 = load(input + 14 * stride); \
    const T_VEC i15 = load(input + 15 * stride); \
    const T_VEC i16 = load(input + 16 * stride); \
    const T_VEC i17 = load(input + 17 * stride); \
    const T_VEC i18 = load(input + 18 * stride); \
    const T_VEC i19 = load(input + 19 * stride); \
    const T_VEC i20 = load(input + 20 * stride); \
    const T_VEC i21 = load(input + 21 * stride); \
    const T_VEC i22 = load(input + 22 * stride); \
    const T_VEC i23 = load(input + 23 * stride); \
    const T_VEC i24 = load(input + 24 * stride); \
    const T_VEC i25 = load(input + 25 * stride); \
    const T_VEC i26 = load(input + 26 * stride); \
    const T_VEC i27 = load(input + 27 * stride); \
    const T_VEC i28 = load(input + 28 * stride); \
    const T_VEC i29 = load(input + 29 * stride); \
    const T_VEC i30 = load(input + 30 * stride); \
    const T_VEC i31 = load(input + 31 * stride); \
    const T_VEC w0 = add(i0, i16); \
    const T_VEC w1 = sub(i0, i16); \
    const T_VEC w2 = add(i8, i24); \
    const T_VEC w3 = sub(i8, i24); \
    const T_VEC w4 = add(w0, w2); \
    const T_VEC w5 = sub(w0, w2); \
    const T_VEC w7 = add(i4, i20); \
    const T_VEC w8 = sub(i4, i20); \
    const T_VEC w9 = add(i12, i28); \
    const T_VEC w10 = sub(i12, i28); \
    const T_VEC w11 = add(w7, w9); \
    const T_VEC w12 = sub(w7, w9); \
    const T_VEC w14 = add(w4, w11); \
    const T_VEC w15 = sub(w4, w11); \
    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
                           sub(sub(kWeight0, w3), \
                               mul(kWeight2, add(w10, w8))) }; \
    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
                           sub(w3, mul(kWeight2, add(w10, w8))) }; \
    const T_VEC w19 = add(i2, i18); \
    const T_VEC w20 = sub(i2, i18); \
    const T_VEC w21 = add(i10, i26); \
    const T_VEC w22 = sub(i10, i26); \
    const T_VEC w23 = add(w19, w21); \
    const T_VEC w24 = sub(w19, w21); \
    const T_VEC w26 = add(i6, i22); \
    const T_VEC w27 = sub(i6, i22); \
    const T_VEC w28 = add(i14, i30); \
    const T_VEC w29 = sub(i14, i30); \
    const T_VEC w30 = add(w26, w28); \
    const T_VEC w31 = sub(w26, w28); \
    const T_VEC w33 = add(w23, w30); \
    const T_VEC w34 = sub(w23, w30); \
    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
                           sub(sub(kWeight0, w22), \
                               mul(kWeight2, add(w29, w27))) }; \
    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
                           sub(w22, mul(kWeight2, add(w29, w27))) }; \
    const T_VEC w38 = add(w14, w33); \
    const T_VEC w39 = sub(w14, w33); \
    const T_VEC w40[2] = { \
      add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))), \
      add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \
    }; \
    const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \
                           sub(sub(kWeight0, w12), \
                               mul(kWeight2, add(w31, w24))) }; \
    const T_VEC w42[2] = { \
      add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))), \
      add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \
    }; \
    const T_VEC w44[2] = { \
      add(w18[0], \
          sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
      sub(sub(kWeight0, w18[1]), \
          sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \
    }; \
    const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \
                           sub(w12, mul(kWeight2, add(w31, w24))) }; \
    const T_VEC w46[2] = { \
      add(w16[0], \
          sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
      sub(sub(kWeight0, w16[1]), \
          sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \
    }; \
    const T_VEC w47 = add(i1, i17); \
    const T_VEC w48 = sub(i1, i17); \
    const T_VEC w49 = add(i9, i25); \
    const T_VEC w50 = sub(i9, i25); \
    const T_VEC w51 = add(w47, w49); \
    const T_VEC w52 = sub(w47, w49); \
    const T_VEC w54 = add(i5, i21); \
    const T_VEC w55 = sub(i5, i21); \
    const T_VEC w56 = add(i13, i29); \
    const T_VEC w57 = sub(i13, i29); \
    const T_VEC w58 = add(w54, w56); \
    const T_VEC w59 = sub(w54, w56); \
    const T_VEC w61 = add(w51, w58); \
    const T_VEC w62 = sub(w51, w58); \
    const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \
                           sub(sub(kWeight0, w50), \
                               mul(kWeight2, add(w57, w55))) }; \
    const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \
                           sub(w50, mul(kWeight2, add(w57, w55))) }; \
    const T_VEC w66 = add(i3, i19); \
    const T_VEC w67 = sub(i3, i19); \
    const T_VEC w68 = add(i11, i27); \
    const T_VEC w69 = sub(i11, i27); \
    const T_VEC w70 = add(w66, w68); \
    const T_VEC w71 = sub(w66, w68); \
    const T_VEC w73 = add(i7, i23); \
    const T_VEC w74 = sub(i7, i23); \
    const T_VEC w75 = add(i15, i31); \
    const T_VEC w76 = sub(i15, i31); \
    const T_VEC w77 = add(w73, w75); \
    const T_VEC w78 = sub(w73, w75); \
    const T_VEC w80 = add(w70, w77); \
    const T_VEC w81 = sub(w70, w77); \
    const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \
                           sub(sub(kWeight0, w69), \
                               mul(kWeight2, add(w76, w74))) }; \
    const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \
                           sub(w69, mul(kWeight2, add(w76, w74))) }; \
    const T_VEC w85 = add(w61, w80); \
    const T_VEC w86 = sub(w61, w80); \
    const T_VEC w87[2] = { \
      add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))), \
      add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \
    }; \
    const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \
                           sub(sub(kWeight0, w59), \
                               mul(kWeight2, add(w78, w71))) }; \
    const T_VEC w89[2] = { \
      add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))), \
      add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \
    }; \
    const T_VEC w91[2] = { \
      add(w65[0], \
          sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
      sub(sub(kWeight0, w65[1]), \
          sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \
    }; \
    const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \
                           sub(w59, mul(kWeight2, add(w78, w71))) }; \
    const T_VEC w93[2] = { \
      add(w63[0], \
          sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
      sub(sub(kWeight0, w63[1]), \
          sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \
    }; \
    store(output + 0 * stride, add(w38, w85)); \
    store(output + 1 * stride, \
          add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1])))); \
    store(output + 2 * stride, \
          add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1])))); \
    store(output + 3 * stride, \
          add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1])))); \
    store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \
    store(output + 5 * stride, \
          add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1])))); \
    store(output + 6 * stride, \
          add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1])))); \
    store(output + 7 * stride, \
          add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1])))); \
    store(output + 8 * stride, w39); \
    store(output + 9 * stride, \
          add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \
                          mul(kWeight5, w93[1])))); \
    store(output + 10 * stride, \
          add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \
                          mul(kWeight3, w92[1])))); \
    store(output + 11 * stride, \
          add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \
                          mul(kWeight7, w91[1])))); \
    store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \
    store(output + 13 * stride, \
          add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \
                          mul(kWeight8, w89[1])))); \
    store(output + 14 * stride, \
          add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \
                          mul(kWeight4, w88[1])))); \
    store(output + 15 * stride, \
          add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \
                          mul(kWeight6, w87[1])))); \
    store(output + 16 * stride, sub(w38, w85)); \
    store(output + 17 * stride, \
          add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \
    store(output + 18 * stride, \
          add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \
    store(output + 19 * stride, \
          add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \
    store(output + 20 * stride, \
          sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \
    store(output + 21 * stride, \
          add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \
    store(output + 22 * stride, \
          add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \
    store(output + 23 * stride, \
          add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0])))); \
    store(output + 24 * stride, sub(kWeight0, w86)); \
    store(output + 25 * stride, \
          sub(sub(kWeight0, w46[1]), \
              sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1])))); \
    store(output + 26 * stride, \
          sub(sub(kWeight0, w45[1]), \
              sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1])))); \
    store(output + 27 * stride, \
          sub(sub(kWeight0, w44[1]), \
              sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1])))); \
    store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \
    store(output + 29 * stride, \
          sub(sub(kWeight0, w42[1]), \
              sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1])))); \
    store(output + 30 * stride, \
          sub(sub(kWeight0, w41[1]), \
              sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1])))); \
    store(output + 31 * stride, \
          sub(sub(kWeight0, w40[1]), \
              sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1])))); \
  }

// Inverse transforms. None of the GEN_IFFT_* bodies below multiplies by 1/n,
// so any normalization has to be applied by the caller.

// Size-2 inverse transform: output[0] = i0 + i1, output[1] = i0 - i1 (the
// same arithmetic as GEN_FFT_2, again using the native + and - operators).
#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) \
  ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
    const T_VEC i0 = load(input + 0 * stride); \
    const T_VEC i1 = load(input + 1 * stride); \
    store(output + 0 * stride, i0 + i1); \
    store(output + 1 * stride, i0 - i1); \
  }

// Size-4 inverse transform. Writes (i0 + i2) + 2*i1, (i0 - i2) - 2*i3,
// (i0 + i2) - 2*i1 and (i0 - i2) + 2*i3. w4[1] and w5[0] are computed but
// never read.
#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
  ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { \
    const T_VEC kWeight0 = constant(0.0f); \
    const T_VEC i0 = load(input + 0 * stride); \
    const T_VEC i1 = load(input + 1 * stride); \
    const T_VEC i2 = load(input + 2 * stride); \
    const T_VEC i3 = load(input + 3 * stride); \
    const T_VEC w2 = add(i0, i2); \
    const T_VEC w3 = sub(i0, i2); \
    const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; \
    const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; \
    store(output + 0 * stride, add(w2, w4[0])); \
    store(output + 1 * stride, add(w3, w5[1])); \
    store(output + 2 * stride, sub(w2, w4[0])); \
    store(output + 3 * stride, sub(w3, w5[1])); \
  }

// Size-8 inverse transform. Per the aom_fft_1d_func_t description the input
// is expected in the packed layout (here inputs 0..4 real parts, 5..7
// imaginary parts) and the output is real.
#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
                   mul) \
  ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \
    const T_VEC kWeight0 = constant(0.0f); \
    const T_VEC kWeight2 = constant(0.707107f); \
    const T_VEC i0 = load(input + 0 * stride); \
    const T_VEC i1 = load(input + 1 * stride); \
    const T_VEC i2 = load(input + 2 * stride); \
    const T_VEC i3 = load(input + 3 * stride); \
    const T_VEC i4 = load(input + 4 * stride); \
    const T_VEC i5 = load(input + 5 * stride); \
    const T_VEC i6 = load(input + 6 * stride); \
    const T_VEC i7 = load(input + 7 * stride); \
    const T_VEC w6 = add(i0, i4); \
    const T_VEC w7 = sub(i0, i4); \
    const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; \
    const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; \
    const T_VEC w10[2] = { add(w6, w8[0]), w8[1] }; \
    const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \
    const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \
    const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \
    const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; \
    const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \
    const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \
    const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \
    const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) }; \
    const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \
    const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \
    const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \
    store(output + 0 * stride, add(w10[0], w18[0])); \
    store(output + 1 * stride, \
          add(w12[0], mul(kWeight2, add(w20[0], w20[1])))); \
    store(output + 2 * stride, add(w11[0], w19[1])); \
    store(output + 3 * stride, \
          sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
    store(output + 4 * stride, sub(w10[0], w18[0])); \
    store(output + 5 * stride, \
          add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \
                          mul(kWeight2, w20[1])))); \
    store(output + 6 * stride, sub(w11[0], w19[1])); \
    store(output + 7 * stride, \
          add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
  }

// Size-16 inverse transform; same conventions as GEN_IFFT_8 (packed input,
// real output, no 1/n scaling).
#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
                    mul) \
  ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \
    const T_VEC kWeight0 = constant(0.0f); \
    const T_VEC kWeight2 = constant(0.707107f); \
    const T_VEC kWeight3 = constant(0.92388f); \
    const T_VEC kWeight4 = constant(0.382683f); \
    const T_VEC i0 = load(input + 0 * stride); \
    const T_VEC i1 = load(input + 1 * stride); \
    const T_VEC i2 = load(input + 2 * stride); \
    const T_VEC i3 = load(input + 3 * stride); \
    const T_VEC i4 = load(input + 4 * stride); \
    const T_VEC i5 = load(input + 5 * stride); \
    const T_VEC i6 = load(input + 6 * stride); \
    const T_VEC i7 = load(input + 7 * stride); \
    const T_VEC i8 = load(input + 8 * stride); \
    const T_VEC i9 = load(input + 9 * stride); \
    const T_VEC i10 = load(input + 10 * stride); \
    const T_VEC i11 = load(input + 11 * stride); \
    const T_VEC i12 = load(input + 12 * stride); \
    const T_VEC i13 = load(input + 13 * stride); \
    const T_VEC i14 = load(input + 14 * stride); \
    const T_VEC i15 = load(input + 15 * stride); \
    const T_VEC w14 = add(i0, i8); \
    const T_VEC w15 = sub(i0, i8); \
    const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \
    const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \
    const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \
    const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \
    const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \
    const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \
    const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \
    const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \
    const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \
    const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \
    const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) }; \
    const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \
    const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \
    const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \
    const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; \
    const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \
    const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \
                           add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
    const T_VEC w33[2] = { add(w20[0], \
                               sub(sub(kWeight0, mul(kWeight2, w28[0])), \
                                   mul(kWeight2, w28[1]))), \
                           add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
    const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \
    const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \
    const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
                           sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
    const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
                           add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
    const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \
    const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \
    const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \
    const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \
    const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
    const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
    const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
    const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
    const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \
    const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \
    const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \
    const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \
    const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \
    const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \
    const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \
    const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \
    const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; \
    const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \
    const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \
                           add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
    const T_VEC w57[2] = { add(w44[0], \
                               sub(sub(kWeight0, mul(kWeight2, w52[0])), \
                                   mul(kWeight2, w52[1]))), \
                           add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
    const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \
    const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \
    const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
                           sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
    const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
                           add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
    store(output + 0 * stride, add(w30[0], w54[0])); \
    store(output + 1 * stride, \
          add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \
    store(output + 2 * stride, \
          add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \
    store(output + 3 * stride, \
          add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \
    store(output + 4 * stride, add(w31[0], w55[1])); \
    store(output + 5 * stride, \
          sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
    store(output + 6 * stride, \
          sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
    store(output + 7 * stride, \
          sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
    store(output + 8 * stride, sub(w30[0], w54[0])); \
    store(output + 9 * stride, \
          add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \
                          mul(kWeight4, w56[1])))); \
    store(output + 10 * stride, \
          add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \
                          mul(kWeight2, w58[1])))); \
    store(output + 11 * stride, \
          add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \
                          mul(kWeight3, w60[1])))); \
    store(output + 12 * stride, sub(w31[0], w55[1])); \
    store(output + 13 * stride, \
          add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
    store(output + 14 * stride, \
          add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
    store(output + 15 * stride, \
          add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
  }
// Size-32 inverse transform generator; takes the same parameter list as
// GEN_IFFT_16.
#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
                    mul) \
  ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \
    const T_VEC kWeight0 = constant(0.0f); \
    const T_VEC kWeight2 = constant(0.707107f); \
    const T_VEC kWeight3 = constant(0.92388f); \
    const T_VEC kWeight4 = constant(0.382683f); \
const T_VEC kWeight5 = constant(0.980785f); \ 711 const T_VEC kWeight6 = constant(0.19509f); \ 712 const T_VEC kWeight7 = constant(0.83147f); \ 713 const T_VEC kWeight8 = constant(0.55557f); \ 714 const T_VEC i0 = load(input + 0 * stride); \ 715 const T_VEC i1 = load(input + 1 * stride); \ 716 const T_VEC i2 = load(input + 2 * stride); \ 717 const T_VEC i3 = load(input + 3 * stride); \ 718 const T_VEC i4 = load(input + 4 * stride); \ 719 const T_VEC i5 = load(input + 5 * stride); \ 720 const T_VEC i6 = load(input + 6 * stride); \ 721 const T_VEC i7 = load(input + 7 * stride); \ 722 const T_VEC i8 = load(input + 8 * stride); \ 723 const T_VEC i9 = load(input + 9 * stride); \ 724 const T_VEC i10 = load(input + 10 * stride); \ 725 const T_VEC i11 = load(input + 11 * stride); \ 726 const T_VEC i12 = load(input + 12 * stride); \ 727 const T_VEC i13 = load(input + 13 * stride); \ 728 const T_VEC i14 = load(input + 14 * stride); \ 729 const T_VEC i15 = load(input + 15 * stride); \ 730 const T_VEC i16 = load(input + 16 * stride); \ 731 const T_VEC i17 = load(input + 17 * stride); \ 732 const T_VEC i18 = load(input + 18 * stride); \ 733 const T_VEC i19 = load(input + 19 * stride); \ 734 const T_VEC i20 = load(input + 20 * stride); \ 735 const T_VEC i21 = load(input + 21 * stride); \ 736 const T_VEC i22 = load(input + 22 * stride); \ 737 const T_VEC i23 = load(input + 23 * stride); \ 738 const T_VEC i24 = load(input + 24 * stride); \ 739 const T_VEC i25 = load(input + 25 * stride); \ 740 const T_VEC i26 = load(input + 26 * stride); \ 741 const T_VEC i27 = load(input + 27 * stride); \ 742 const T_VEC i28 = load(input + 28 * stride); \ 743 const T_VEC i29 = load(input + 29 * stride); \ 744 const T_VEC i30 = load(input + 30 * stride); \ 745 const T_VEC i31 = load(input + 31 * stride); \ 746 const T_VEC w30 = add(i0, i16); \ 747 const T_VEC w31 = sub(i0, i16); \ 748 const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \ 749 const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, 
i24), i24) }; \ 750 const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \ 751 const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \ 752 const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \ 753 const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \ 754 const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \ 755 const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \ 756 const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \ 757 const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \ 758 const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \ 759 const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \ 760 const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \ 761 const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \ 762 const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \ 763 const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \ 764 const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \ 765 add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \ 766 const T_VEC w49[2] = { add(w36[0], \ 767 sub(sub(kWeight0, mul(kWeight2, w44[0])), \ 768 mul(kWeight2, w44[1]))), \ 769 add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \ 770 const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \ 771 const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \ 772 const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ 773 sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ 774 const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ 775 add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ 776 const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \ 777 const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \ 778 const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \ 779 const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \ 780 const T_VEC 
w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \ 781 const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \ 782 const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \ 783 const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \ 784 const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \ 785 const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \ 786 const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \ 787 const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \ 788 const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \ 789 const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \ 790 const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \ 791 const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \ 792 const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \ 793 const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \ 794 const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \ 795 add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \ 796 const T_VEC w73[2] = { add(w60[0], \ 797 sub(sub(kWeight0, mul(kWeight2, w68[0])), \ 798 mul(kWeight2, w68[1]))), \ 799 add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \ 800 const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; \ 801 const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \ 802 const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ 803 sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ 804 const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ 805 add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ 806 const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \ 807 const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \ 808 const T_VEC w80[2] = { \ 809 add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \ 810 add(w48[1], sub(mul(kWeight3, w72[1]), 
mul(kWeight4, w72[0]))) \ 811 }; \ 812 const T_VEC w81[2] = { \ 813 add(w48[0], \ 814 sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \ 815 add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \ 816 }; \ 817 const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \ 818 add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \ 819 const T_VEC w83[2] = { add(w50[0], \ 820 sub(sub(kWeight0, mul(kWeight2, w74[0])), \ 821 mul(kWeight2, w74[1]))), \ 822 add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \ 823 const T_VEC w84[2] = { \ 824 add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \ 825 add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \ 826 }; \ 827 const T_VEC w85[2] = { \ 828 add(w52[0], \ 829 sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \ 830 add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \ 831 }; \ 832 const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \ 833 const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \ 834 const T_VEC w88[2] = { \ 835 sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \ 836 add(w49[1], \ 837 sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \ 838 }; \ 839 const T_VEC w89[2] = { \ 840 add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \ 841 add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \ 842 }; \ 843 const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ 844 sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ 845 const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ 846 add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ 847 const T_VEC w92[2] = { \ 848 sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \ 849 add(w53[1], \ 850 sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \ 851 }; \ 852 const T_VEC w93[2] = { \ 853 add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, 
w77[1]))), \ 854 add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \ 855 }; \ 856 const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \ 857 const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \ 858 const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \ 859 const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \ 860 const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \ 861 const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \ 862 const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \ 863 const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \ 864 const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \ 865 const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \ 866 const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \ 867 const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \ 868 const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \ 869 const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \ 870 const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \ 871 const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \ 872 const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \ 873 const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \ 874 const T_VEC w112[2] = { \ 875 add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \ 876 add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \ 877 }; \ 878 const T_VEC w113[2] = { \ 879 add(w100[0], \ 880 sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \ 881 add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \ 882 }; \ 883 const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \ 884 const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \ 885 const T_VEC w116[2] = { \ 886 sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ 887 sub(w101[1], 
mul(kWeight2, add(w109[1], w109[0]))) \ 888 }; \ 889 const T_VEC w117[2] = { \ 890 add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ 891 add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \ 892 }; \ 893 const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \ 894 const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \ 895 const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \ 896 const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \ 897 const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \ 898 const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \ 899 const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \ 900 const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \ 901 const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \ 902 const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \ 903 const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \ 904 const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) }; \ 905 const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \ 906 const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \ 907 const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \ 908 const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \ 909 const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \ 910 const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \ 911 const T_VEC w136[2] = { \ 912 add(w124[0], mul(kWeight2, add(w132[0], w132[1]))), \ 913 add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \ 914 }; \ 915 const T_VEC w137[2] = { \ 916 add(w124[0], \ 917 sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \ 918 add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \ 919 }; \ 920 const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \ 921 const T_VEC w139[2] = { sub(w123[0], 
w131[1]), add(w123[1], w131[0]) }; \ 922 const T_VEC w140[2] = { \ 923 sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ 924 sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ 925 }; \ 926 const T_VEC w141[2] = { \ 927 add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ 928 add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ 929 }; \ 930 const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \ 931 const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \ 932 const T_VEC w144[2] = { \ 933 add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \ 934 add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \ 935 }; \ 936 const T_VEC w145[2] = { \ 937 add(w112[0], \ 938 sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \ 939 add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \ 940 }; \ 941 const T_VEC w146[2] = { \ 942 add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \ 943 add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \ 944 }; \ 945 const T_VEC w147[2] = { \ 946 add(w114[0], \ 947 sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \ 948 add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \ 949 }; \ 950 const T_VEC w148[2] = { \ 951 add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \ 952 add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \ 953 }; \ 954 const T_VEC w149[2] = { \ 955 add(w116[0], \ 956 sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \ 957 add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \ 958 }; \ 959 const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \ 960 const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \ 961 const T_VEC w152[2] = { \ 962 sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ 963 add(w113[1], \ 964 sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \ 965 }; \ 966 const T_VEC 
w153[2] = { \ 967 add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ 968 add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \ 969 }; \ 970 const T_VEC w154[2] = { \ 971 sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ 972 sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ 973 }; \ 974 const T_VEC w155[2] = { \ 975 add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ 976 add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ 977 }; \ 978 const T_VEC w156[2] = { \ 979 sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \ 980 add(w117[1], \ 981 sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \ 982 }; \ 983 const T_VEC w157[2] = { \ 984 add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \ 985 add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \ 986 }; \ 987 store(output + 0 * stride, add(w78[0], w142[0])); \ 988 store(output + 1 * stride, \ 989 add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \ 990 store(output + 2 * stride, \ 991 add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \ 992 store(output + 3 * stride, \ 993 add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \ 994 store(output + 4 * stride, \ 995 add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \ 996 store(output + 5 * stride, \ 997 add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \ 998 store(output + 6 * stride, \ 999 add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \ 1000 store(output + 7 * stride, \ 1001 add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \ 1002 store(output + 8 * stride, add(w79[0], w143[1])); \ 1003 store(output + 9 * stride, \ 1004 sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \ 1005 store(output + 10 * stride, \ 1006 sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ 1007 store(output + 11 * stride, \ 1008 sub(w85[0], sub(mul(kWeight8, 
w149[0]), mul(kWeight7, w149[1])))); \ 1009 store(output + 12 * stride, \ 1010 sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \ 1011 store(output + 13 * stride, \ 1012 sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \ 1013 store(output + 14 * stride, \ 1014 sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ 1015 store(output + 15 * stride, \ 1016 sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ 1017 store(output + 16 * stride, sub(w78[0], w142[0])); \ 1018 store(output + 17 * stride, \ 1019 add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \ 1020 mul(kWeight6, w144[1])))); \ 1021 store(output + 18 * stride, \ 1022 add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \ 1023 mul(kWeight4, w146[1])))); \ 1024 store(output + 19 * stride, \ 1025 add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \ 1026 mul(kWeight8, w148[1])))); \ 1027 store(output + 20 * stride, \ 1028 add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \ 1029 mul(kWeight2, w150[1])))); \ 1030 store(output + 21 * stride, \ 1031 add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \ 1032 mul(kWeight7, w152[1])))); \ 1033 store(output + 22 * stride, \ 1034 add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \ 1035 mul(kWeight3, w154[1])))); \ 1036 store(output + 23 * stride, \ 1037 add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \ 1038 mul(kWeight5, w156[1])))); \ 1039 store(output + 24 * stride, sub(w79[0], w143[1])); \ 1040 store(output + 25 * stride, \ 1041 add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \ 1042 store(output + 26 * stride, \ 1043 add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ 1044 store(output + 27 * stride, \ 1045 add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \ 1046 store(output + 28 * stride, \ 1047 add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \ 1048 store(output + 29 * stride, \ 1049 add(w89[0], sub(mul(kWeight7, w153[0]), 
mul(kWeight8, w153[1])))); \ 1050 store(output + 30 * stride, \ 1051 add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ 1052 store(output + 31 * stride, \ 1053 add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ 1054 } 1055 1056 #endif // AOM_AOM_DSP_FFT_COMMON_H_ 1057