/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <xmmintrin.h>

#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/fft_common.h"

transpose4x4(const float * A,float * B,const int lda,const int ldb)18 static INLINE void transpose4x4(const float *A, float *B, const int lda,
19                                 const int ldb) {
20   __m128 row1 = _mm_load_ps(&A[0 * lda]);
21   __m128 row2 = _mm_load_ps(&A[1 * lda]);
22   __m128 row3 = _mm_load_ps(&A[2 * lda]);
23   __m128 row4 = _mm_load_ps(&A[3 * lda]);
24   _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
25   _mm_store_ps(&B[0 * ldb], row1);
26   _mm_store_ps(&B[1 * ldb], row2);
27   _mm_store_ps(&B[2 * ldb], row3);
28   _mm_store_ps(&B[3 * ldb], row4);
29 }
30 
// Writes the transpose of the n x n row-major float matrix A into B, one 4x4
// tile at a time: the tile whose top-left corner is (y, x) in A lands at
// (x, y) in B.
//
// Tiles are always full 4x4 blocks, so n is expected to be a multiple of 4.
// The tile helper uses aligned loads and stores, so A and B must be 16-byte
// aligned.
void aom_transpose_float_sse2(const float *A, float *B, int n) {
  for (int y = 0; y < n; y += 4) {
    for (int x = 0; x < n; x += 4) {
      transpose4x4(A + y * n + x, B + x * n + y, n, n);
    }
  }
}

// Unpacks the n x n real-valued array `packed` into interleaved
// (real, imaginary) float pairs in `output`; the pair for row r, column c
// lives at output[2 * (r * n + c)] and output[2 * (r * n + c) + 1].
//
// Only columns 0..n/2 of every output row are written; higher columns are left
// untouched.  With n2 = n / 2:
//   - the four entries at (0, 0), (0, n2), (n2, 0) and (n2, n2) get a zero
//     imaginary part;
//   - the rest of rows 0 and n2 and of columns 0 and n2 is copied straight
//     from `packed` (with the imaginary part negated for rows above n2);
//   - every other entry is a sum or difference of two `packed` values.
//
// Columns 1..3 are handled with scalar code and columns 4..n2-1 four at a time
// with SSE.  The SSE loops use aligned loads/stores, so `packed` and `output`
// must be 16-byte aligned, and they always process whole groups of four
// columns, so n2 must be a multiple of 4 whenever n2 > 4.
void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
  const int n2 = n / 2;
  // Exclusive upper bound of the scalar column loops; the SSE loops take over
  // from column 4 onwards.
  const int scalar_cols = n2 < 4 ? n2 : 4;

  output[0] = packed[0];
  output[1] = 0;
  output[2 * (n2 * n)] = packed[n2 * n];
  output[2 * (n2 * n) + 1] = 0;

  output[2 * n2] = packed[n2];
  output[2 * n2 + 1] = 0;
  output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
  output[2 * (n2 * n + n2) + 1] = 0;

  // Rows 0 and n2: real part from column c, imaginary part from column c + n2.
  for (int c = 1; c < n2; ++c) {
    output[2 * (0 * n + c)] = packed[c];
    output[2 * (0 * n + c) + 1] = packed[c + n2];
    output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
    output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
  }
  for (int r = 1; r < n2; ++r) {
    // Columns 0 and n2 of row r: real part from row r, imaginary part from
    // row r + n2.
    output[2 * (r * n + 0)] = packed[r * n];
    output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
    output[2 * (r * n + n2) + 0] = packed[r * n + n2];
    output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];

    for (int c = 1; c < scalar_cols; ++c) {
      output[2 * (r * n + c)] =
          packed[r * n + c] - packed[(r + n2) * n + c + n2];
      output[2 * (r * n + c) + 1] =
          packed[(r + n2) * n + c] + packed[r * n + c + n2];
    }

    // Same formulas as the scalar loop above, four columns per iteration.
    for (int c = 4; c < n2; c += 4) {
      __m128 real1 = _mm_load_ps(packed + r * n + c);
      __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
      __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
      __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
      real1 = _mm_sub_ps(real1, real2);
      imag1 = _mm_add_ps(imag1, imag2);
      // unpacklo/unpackhi interleave real and imaginary lanes into
      // (re, im) pairs for columns c..c+1 and c+2..c+3 respectively.
      _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
      _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
    }

    // Output row r2 = r + n2 is built from packed rows r3 = n - r2 and
    // r3 + n2, with the signs of the combinations flipped relative to row r.
    int r2 = r + n2;
    int r3 = n - r2;
    output[2 * (r2 * n + 0)] = packed[r3 * n];
    output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
    output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
    output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
    for (int c = 1; c < scalar_cols; ++c) {
      output[2 * (r2 * n + c)] =
          packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
      output[2 * (r2 * n + c) + 1] =
          -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
    }
    for (int c = 4; c < n2; c += 4) {
      __m128 real1 = _mm_load_ps(packed + r3 * n + c);
      __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
      __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
      __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
      real1 = _mm_add_ps(real1, real2);
      imag1 = _mm_sub_ps(imag2, imag1);
      _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
      _mm_store_ps(output + 2 * (r2 * n + c + 2),
                   _mm_unpackhi_ps(real1, imag1));
    }
  }
}

107 // Generate definitions for 1d transforms using float and __mm128
GEN_FFT_4(static INLINE void,sse2,float,__m128,_mm_load_ps,_mm_store_ps,_mm_set1_ps,_mm_add_ps,_mm_sub_ps)108 GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
109           _mm_set1_ps, _mm_add_ps, _mm_sub_ps)
110 GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
111           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
112 GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
113            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
114 GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
115            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
116 
117 void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
118   aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
119                  aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
120 }
121 
aom_fft8x8_float_sse2(const float * input,float * temp,float * output)122 void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
123   aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
124                  aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
125 }
126 
aom_fft16x16_float_sse2(const float * input,float * temp,float * output)127 void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
128   aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
129                  aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
130 }
131 
aom_fft32x32_float_sse2(const float * input,float * temp,float * output)132 void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
133   aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
134                  aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
135 }
136 
137 // Generate definitions for 1d inverse transforms using float and mm128
GEN_IFFT_4(static INLINE void,sse2,float,__m128,_mm_load_ps,_mm_store_ps,_mm_set1_ps,_mm_add_ps,_mm_sub_ps)138 GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
139            _mm_set1_ps, _mm_add_ps, _mm_sub_ps)
140 GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
141            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
142 GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
143             _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
144 GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
145             _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
146 
147 void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
148   aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
149                   aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
150 }
151 
aom_ifft8x8_float_sse2(const float * input,float * temp,float * output)152 void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
153   aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
154                   aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
155 }
156 
aom_ifft16x16_float_sse2(const float * input,float * temp,float * output)157 void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
158   aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
159                   aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
160                   aom_transpose_float_sse2, 4);
161 }
162 
aom_ifft32x32_float_sse2(const float * input,float * temp,float * output)163 void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
164   aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
165                   aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
166                   aom_transpose_float_sse2, 4);
167 }
168