Lines Matching full:stride
32 * Non-vectorized transforms (e.g., on a single row) would use a stride = 1.
36 * for input and output is typically square (n x n) and the stride will
42 * \param[in] stride The spacing in number of elements between rows
46 int stride);
50 void aom_fft1d_2_float(const float *input, float *output, int stride);
51 void aom_fft1d_4_float(const float *input, float *output, int stride);
52 void aom_fft1d_8_float(const float *input, float *output, int stride);
53 void aom_fft1d_16_float(const float *input, float *output, int stride);
54 void aom_fft1d_32_float(const float *input, float *output, int stride);
119 ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
120 const T_VEC i0 = load(input + 0 * stride); \
121 const T_VEC i1 = load(input + 1 * stride); \
122 store(output + 0 * stride, i0 + i1); \
123 store(output + 1 * stride, i0 - i1); \
127 ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \
129 const T_VEC i0 = load(input + 0 * stride); \
130 const T_VEC i1 = load(input + 1 * stride); \
131 const T_VEC i2 = load(input + 2 * stride); \
132 const T_VEC i3 = load(input + 3 * stride); \
137 store(output + 0 * stride, add(w0, w2)); \
138 store(output + 1 * stride, w1); \
139 store(output + 2 * stride, sub(w0, w2)); \
140 store(output + 3 * stride, sub(kWeight0, w3)); \
144 ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) { \
147 const T_VEC i0 = load(input + 0 * stride); \
148 const T_VEC i1 = load(input + 1 * stride); \
149 const T_VEC i2 = load(input + 2 * stride); \
150 const T_VEC i3 = load(input + 3 * stride); \
151 const T_VEC i4 = load(input + 4 * stride); \
152 const T_VEC i5 = load(input + 5 * stride); \
153 const T_VEC i6 = load(input + 6 * stride); \
154 const T_VEC i7 = load(input + 7 * stride); \
167 store(output + 0 * stride, add(w4, w11)); \
168 store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \
169 store(output + 2 * stride, w5); \
170 store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \
171 store(output + 4 * stride, sub(w4, w11)); \
172 store(output + 5 * stride, \
174 store(output + 6 * stride, sub(kWeight0, w12)); \
175 store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \
180 ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) { \
185 const T_VEC i0 = load(input + 0 * stride); \
186 const T_VEC i1 = load(input + 1 * stride); \
187 const T_VEC i2 = load(input + 2 * stride); \
188 const T_VEC i3 = load(input + 3 * stride); \
189 const T_VEC i4 = load(input + 4 * stride); \
190 const T_VEC i5 = load(input + 5 * stride); \
191 const T_VEC i6 = load(input + 6 * stride); \
192 const T_VEC i7 = load(input + 7 * stride); \
193 const T_VEC i8 = load(input + 8 * stride); \
194 const T_VEC i9 = load(input + 9 * stride); \
195 const T_VEC i10 = load(input + 10 * stride); \
196 const T_VEC i11 = load(input + 11 * stride); \
197 const T_VEC i12 = load(input + 12 * stride); \
198 const T_VEC i13 = load(input + 13 * stride); \
199 const T_VEC i14 = load(input + 14 * stride); \
200 const T_VEC i15 = load(input + 15 * stride); \
239 store(output + 0 * stride, add(w14, w33)); \
240 store(output + 1 * stride, \
242 store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \
243 store(output + 3 * stride, \
245 store(output + 4 * stride, w15); \
246 store(output + 5 * stride, \
249 store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \
250 store(output + 7 * stride, \
253 store(output + 8 * stride, sub(w14, w33)); \
254 store(output + 9 * stride, \
256 store(output + 10 * stride, \
258 store(output + 11 * stride, \
260 store(output + 12 * stride, sub(kWeight0, w34)); \
261 store(output + 13 * stride, \
264 store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \
265 store(output + 15 * stride, \
272 ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) { \
281 const T_VEC i0 = load(input + 0 * stride); \
282 const T_VEC i1 = load(input + 1 * stride); \
283 const T_VEC i2 = load(input + 2 * stride); \
284 const T_VEC i3 = load(input + 3 * stride); \
285 const T_VEC i4 = load(input + 4 * stride); \
286 const T_VEC i5 = load(input + 5 * stride); \
287 const T_VEC i6 = load(input + 6 * stride); \
288 const T_VEC i7 = load(input + 7 * stride); \
289 const T_VEC i8 = load(input + 8 * stride); \
290 const T_VEC i9 = load(input + 9 * stride); \
291 const T_VEC i10 = load(input + 10 * stride); \
292 const T_VEC i11 = load(input + 11 * stride); \
293 const T_VEC i12 = load(input + 12 * stride); \
294 const T_VEC i13 = load(input + 13 * stride); \
295 const T_VEC i14 = load(input + 14 * stride); \
296 const T_VEC i15 = load(input + 15 * stride); \
297 const T_VEC i16 = load(input + 16 * stride); \
298 const T_VEC i17 = load(input + 17 * stride); \
299 const T_VEC i18 = load(input + 18 * stride); \
300 const T_VEC i19 = load(input + 19 * stride); \
301 const T_VEC i20 = load(input + 20 * stride); \
302 const T_VEC i21 = load(input + 21 * stride); \
303 const T_VEC i22 = load(input + 22 * stride); \
304 const T_VEC i23 = load(input + 23 * stride); \
305 const T_VEC i24 = load(input + 24 * stride); \
306 const T_VEC i25 = load(input + 25 * stride); \
307 const T_VEC i26 = load(input + 26 * stride); \
308 const T_VEC i27 = load(input + 27 * stride); \
309 const T_VEC i28 = load(input + 28 * stride); \
310 const T_VEC i29 = load(input + 29 * stride); \
311 const T_VEC i30 = load(input + 30 * stride); \
312 const T_VEC i31 = load(input + 31 * stride); \
443 store(output + 0 * stride, add(w38, w85)); \
444 store(output + 1 * stride, \
446 store(output + 2 * stride, \
448 store(output + 3 * stride, \
450 store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \
451 store(output + 5 * stride, \
453 store(output + 6 * stride, \
455 store(output + 7 * stride, \
457 store(output + 8 * stride, w39); \
458 store(output + 9 * stride, \
461 store(output + 10 * stride, \
464 store(output + 11 * stride, \
467 store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \
468 store(output + 13 * stride, \
471 store(output + 14 * stride, \
474 store(output + 15 * stride, \
477 store(output + 16 * stride, sub(w38, w85)); \
478 store(output + 17 * stride, \
480 store(output + 18 * stride, \
482 store(output + 19 * stride, \
484 store(output + 20 * stride, \
486 store(output + 21 * stride, \
488 store(output + 22 * stride, \
490 store(output + 23 * stride, \
492 store(output + 24 * stride, sub(kWeight0, w86)); \
493 store(output + 25 * stride, \
496 store(output + 26 * stride, \
499 store(output + 27 * stride, \
502 store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \
503 store(output + 29 * stride, \
506 store(output + 30 * stride, \
509 store(output + 31 * stride, \
515 ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
516 const T_VEC i0 = load(input + 0 * stride); \
517 const T_VEC i1 = load(input + 1 * stride); \
518 store(output + 0 * stride, i0 + i1); \
519 store(output + 1 * stride, i0 - i1); \
523 ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { \
525 const T_VEC i0 = load(input + 0 * stride); \
526 const T_VEC i1 = load(input + 1 * stride); \
527 const T_VEC i2 = load(input + 2 * stride); \
528 const T_VEC i3 = load(input + 3 * stride); \
533 store(output + 0 * stride, add(w2, w4[0])); \
534 store(output + 1 * stride, add(w3, w5[1])); \
535 store(output + 2 * stride, sub(w2, w4[0])); \
536 store(output + 3 * stride, sub(w3, w5[1])); \
541 ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \
544 const T_VEC i0 = load(input + 0 * stride); \
545 const T_VEC i1 = load(input + 1 * stride); \
546 const T_VEC i2 = load(input + 2 * stride); \
547 const T_VEC i3 = load(input + 3 * stride); \
548 const T_VEC i4 = load(input + 4 * stride); \
549 const T_VEC i5 = load(input + 5 * stride); \
550 const T_VEC i6 = load(input + 6 * stride); \
551 const T_VEC i7 = load(input + 7 * stride); \
568 store(output + 0 * stride, add(w10[0], w18[0])); \
569 store(output + 1 * stride, \
571 store(output + 2 * stride, add(w11[0], w19[1])); \
572 store(output + 3 * stride, \
574 store(output + 4 * stride, sub(w10[0], w18[0])); \
575 store(output + 5 * stride, \
578 store(output + 6 * stride, sub(w11[0], w19[1])); \
579 store(output + 7 * stride, \
585 ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \
590 const T_VEC i0 = load(input + 0 * stride); \
591 const T_VEC i1 = load(input + 1 * stride); \
592 const T_VEC i2 = load(input + 2 * stride); \
593 const T_VEC i3 = load(input + 3 * stride); \
594 const T_VEC i4 = load(input + 4 * stride); \
595 const T_VEC i5 = load(input + 5 * stride); \
596 const T_VEC i6 = load(input + 6 * stride); \
597 const T_VEC i7 = load(input + 7 * stride); \
598 const T_VEC i8 = load(input + 8 * stride); \
599 const T_VEC i9 = load(input + 9 * stride); \
600 const T_VEC i10 = load(input + 10 * stride); \
601 const T_VEC i11 = load(input + 11 * stride); \
602 const T_VEC i12 = load(input + 12 * stride); \
603 const T_VEC i13 = load(input + 13 * stride); \
604 const T_VEC i14 = load(input + 14 * stride); \
605 const T_VEC i15 = load(input + 15 * stride); \
666 store(output + 0 * stride, add(w30[0], w54[0])); \
667 store(output + 1 * stride, \
669 store(output + 2 * stride, \
671 store(output + 3 * stride, \
673 store(output + 4 * stride, add(w31[0], w55[1])); \
674 store(output + 5 * stride, \
676 store(output + 6 * stride, \
678 store(output + 7 * stride, \
680 store(output + 8 * stride, sub(w30[0], w54[0])); \
681 store(output + 9 * stride, \
684 store(output + 10 * stride, \
687 store(output + 11 * stride, \
690 store(output + 12 * stride, sub(w31[0], w55[1])); \
691 store(output + 13 * stride, \
693 store(output + 14 * stride, \
695 store(output + 15 * stride, \
700 ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \
709 const T_VEC i0 = load(input + 0 * stride); \
710 const T_VEC i1 = load(input + 1 * stride); \
711 const T_VEC i2 = load(input + 2 * stride); \
712 const T_VEC i3 = load(input + 3 * stride); \
713 const T_VEC i4 = load(input + 4 * stride); \
714 const T_VEC i5 = load(input + 5 * stride); \
715 const T_VEC i6 = load(input + 6 * stride); \
716 const T_VEC i7 = load(input + 7 * stride); \
717 const T_VEC i8 = load(input + 8 * stride); \
718 const T_VEC i9 = load(input + 9 * stride); \
719 const T_VEC i10 = load(input + 10 * stride); \
720 const T_VEC i11 = load(input + 11 * stride); \
721 const T_VEC i12 = load(input + 12 * stride); \
722 const T_VEC i13 = load(input + 13 * stride); \
723 const T_VEC i14 = load(input + 14 * stride); \
724 const T_VEC i15 = load(input + 15 * stride); \
725 const T_VEC i16 = load(input + 16 * stride); \
726 const T_VEC i17 = load(input + 17 * stride); \
727 const T_VEC i18 = load(input + 18 * stride); \
728 const T_VEC i19 = load(input + 19 * stride); \
729 const T_VEC i20 = load(input + 20 * stride); \
730 const T_VEC i21 = load(input + 21 * stride); \
731 const T_VEC i22 = load(input + 22 * stride); \
732 const T_VEC i23 = load(input + 23 * stride); \
733 const T_VEC i24 = load(input + 24 * stride); \
734 const T_VEC i25 = load(input + 25 * stride); \
735 const T_VEC i26 = load(input + 26 * stride); \
736 const T_VEC i27 = load(input + 27 * stride); \
737 const T_VEC i28 = load(input + 28 * stride); \
738 const T_VEC i29 = load(input + 29 * stride); \
739 const T_VEC i30 = load(input + 30 * stride); \
740 const T_VEC i31 = load(input + 31 * stride); \
982 store(output + 0 * stride, add(w78[0], w142[0])); \
983 store(output + 1 * stride, \
985 store(output + 2 * stride, \
987 store(output + 3 * stride, \
989 store(output + 4 * stride, \
991 store(output + 5 * stride, \
993 store(output + 6 * stride, \
995 store(output + 7 * stride, \
997 store(output + 8 * stride, add(w79[0], w143[1])); \
998 store(output + 9 * stride, \
1000 store(output + 10 * stride, \
1002 store(output + 11 * stride, \
1004 store(output + 12 * stride, \
1006 store(output + 13 * stride, \
1008 store(output + 14 * stride, \
1010 store(output + 15 * stride, \
1012 store(output + 16 * stride, sub(w78[0], w142[0])); \
1013 store(output + 17 * stride, \
1016 store(output + 18 * stride, \
1019 store(output + 19 * stride, \
1022 store(output + 20 * stride, \
1025 store(output + 21 * stride, \
1028 store(output + 22 * stride, \
1031 store(output + 23 * stride, \
1034 store(output + 24 * stride, sub(w79[0], w143[1])); \
1035 store(output + 25 * stride, \
1037 store(output + 26 * stride, \
1039 store(output + 27 * stride, \
1041 store(output + 28 * stride, \
1043 store(output + 29 * stride, \
1045 store(output + 30 * stride, \
1047 store(output + 31 * stride, \