1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <emmintrin.h> // SSE2
12
13 #include "./vpx_config.h"
14 #include "vpx_dsp/vpx_dsp_common.h"
15 #include "vpx_dsp/x86/fwd_txfm_sse2.h"
16
// DC-only 4x4 forward transform: lane 0 of the stored vector receives
// 2 * (sum of the 16 input samples), matching vpx_fdct4x4_1_c.
void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i rows03, rows12, acc, lo, hi;

  // Pack the four 4-sample rows into two registers (rows 0|3 and 1|2) so a
  // single 16-bit add yields eight partial sums.
  rows03 = _mm_unpacklo_epi64(
      _mm_loadl_epi64((const __m128i *)(input + 0 * stride)),
      _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
  rows12 = _mm_unpacklo_epi64(
      _mm_loadl_epi64((const __m128i *)(input + 1 * stride)),
      _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  acc = _mm_add_epi16(rows03, rows12);

  // Sign-extend the eight 16-bit partial sums to 32 bits: park each word in
  // the high half of a dword, then arithmetic-shift it back down.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc), 16);
  acc = _mm_add_epi32(lo, hi);

  // Horizontal reduction of the four 32-bit lane sums down to lane 0.
  lo = _mm_unpacklo_epi32(acc, zero);
  hi = _mm_unpackhi_epi32(acc, zero);
  acc = _mm_add_epi32(lo, hi);
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));

  // Apply the C reference's scaling: DC = sum << 1.
  acc = _mm_slli_epi32(acc, 1);
  store_output(&acc, output);
}
45
// DC-only 8x8 forward transform: lane 0 of the stored vector receives the
// plain (unscaled) sum of all 64 input samples, matching vpx_fdct8x8_1_c.
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc, lo, hi;
  int r;

  // Accumulate the eight 8-sample rows into eight per-lane 16-bit sums.
  // 16-bit wrap-around addition is associative, so a simple running total
  // produces the same lane values as the original pairwise order.
  acc = _mm_load_si128((const __m128i *)(input + 0 * stride));
  for (r = 1; r < 8; ++r) {
    acc = _mm_add_epi16(acc,
                        _mm_load_si128((const __m128i *)(input + r * stride)));
  }

  // Sign-extend the eight 16-bit lane sums to 32 bits.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc), 16);
  acc = _mm_add_epi32(lo, hi);

  // Horizontal reduction down to lane 0.
  lo = _mm_unpacklo_epi32(acc, zero);
  hi = _mm_unpackhi_epi32(acc, zero);
  acc = _mm_add_epi32(lo, hi);
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
  store_output(&acc, output);
}
85
// DC-only 16x16 forward transform: lane 0 of the stored vector receives
// (sum of the 256 input samples) >> 1, matching vpx_fdct16x16_1_c.
void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = _mm_setzero_si128();
  __m128i lo, hi;
  int half, r;

  // Sweep both 8-column halves of the block, folding all 16 rows of each
  // half into one running set of per-lane 16-bit sums.  Wrap-around 16-bit
  // adds are associative, so this matches the original unrolled order.
  for (half = 0; half < 2; ++half) {
    const int16_t *col = input + 8 * half;
    for (r = 0; r < 16; ++r) {
      acc = _mm_add_epi16(acc,
                          _mm_load_si128((const __m128i *)(col + r * stride)));
    }
  }

  // Sign-extend the eight 16-bit lane sums to 32 bits.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc), 16);
  acc = _mm_add_epi32(lo, hi);

  // Horizontal reduction down to lane 0.
  lo = _mm_unpacklo_epi32(acc, zero);
  hi = _mm_unpackhi_epi32(acc, zero);
  acc = _mm_add_epi32(lo, hi);
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));

  // Apply the C reference's scaling: DC = sum >> 1 (arithmetic shift).
  acc = _mm_srai_epi32(acc, 1);
  store_output(&acc, output);
}
154
// DC-only 32x32 forward transform: lane 0 of the stored vector receives
// (sum of the 1024 input samples) >> 3, matching vpx_fdct32x32_1_c.
void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = _mm_setzero_si128();
  __m128i lo, hi;
  int r, c;

  // Fold every 8-lane column group of all 32 rows into a single running
  // 16-bit accumulator.  Wrap-around 16-bit addition is associative, so the
  // loop order yields the same lane values as the original unrolled code.
  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; c += 8) {
      acc = _mm_add_epi16(acc, _mm_load_si128((const __m128i *)(input + c)));
    }
    input += stride;
  }

  // Sign-extend the eight 16-bit lane sums to 32 bits.
  lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc), 16);
  hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc), 16);
  acc = _mm_add_epi32(lo, hi);

  // Horizontal reduction down to lane 0.
  lo = _mm_unpacklo_epi32(acc, zero);
  hi = _mm_unpackhi_epi32(acc, zero);
  acc = _mm_add_epi32(lo, hi);
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));

  // Apply the C reference's scaling: DC = sum >> 3 (arithmetic shift).
  acc = _mm_srai_epi32(acc, 3);
  store_output(&acc, output);
}
226
/*
 * The full 2-D forward transforms are produced by instantiating shared
 * implementation templates.  Each #include below expands the template under
 * the macro configuration set immediately before it; the macros are #undef'd
 * afterwards so the same header can be included again with different names.
 */
#define DCT_HIGH_BIT_DEPTH 0
#define FDCT4x4_2D vpx_fdct4x4_sse2
#define FDCT8x8_2D vpx_fdct8x8_sse2
#define FDCT16x16_2D vpx_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

/* 32x32 with FDCT32x32_HIGH_PRECISION 0 — the "_rd" (reduced-precision)
 * variant; presumably used where speed matters more than rounding accuracy —
 * verify against callers. */
#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

/* 32x32 with full rounding precision. */
#define FDCT32x32_2D vpx_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH

#if CONFIG_VP9_HIGHBITDEPTH
/* Same three instantiations again with DCT_HIGH_BIT_DEPTH 1, producing the
 * vpx_highbd_* entry points for high-bit-depth builds. */
#define DCT_HIGH_BIT_DEPTH 1
#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
#undef FDCT4x4_2D
#undef FDCT8x8_2D
#undef FDCT16x16_2D

#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
#endif  // CONFIG_VP9_HIGHBITDEPTH
272