1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/x86/inv_txfm_sse2.h"
13 #include "vpx_dsp/x86/transpose_sse2.h"
14 #include "vpx_dsp/x86/txfm_common_sse2.h"
15
vpx_idct4x4_16_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)16 void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
17 int stride) {
18 const __m128i eight = _mm_set1_epi16(8);
19 __m128i in[2];
20
21 // Rows
22 in[0] = load_input_data(input);
23 in[1] = load_input_data(input + 8);
24 idct4_sse2(in);
25
26 // Columns
27 idct4_sse2(in);
28
29 // Final round and shift
30 in[0] = _mm_add_epi16(in[0], eight);
31 in[1] = _mm_add_epi16(in[1], eight);
32 in[0] = _mm_srai_epi16(in[0], 4);
33 in[1] = _mm_srai_epi16(in[1], 4);
34
35 recon_and_store4x4_sse2(in, dest, stride);
36 }
37
vpx_idct4x4_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)38 void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
39 int stride) {
40 const __m128i zero = _mm_setzero_si128();
41 int a;
42 __m128i dc_value, d[2];
43
44 a = (int)dct_const_round_shift(input[0] * cospi_16_64);
45 a = (int)dct_const_round_shift(a * cospi_16_64);
46 a = ROUND_POWER_OF_TWO(a, 4);
47
48 dc_value = _mm_set1_epi16(a);
49
50 // Reconstruction and Store
51 d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
52 d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
53 d[0] = _mm_unpacklo_epi32(d[0],
54 _mm_cvtsi32_si128(*(const int *)(dest + stride)));
55 d[1] = _mm_unpacklo_epi32(
56 _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
57 d[0] = _mm_unpacklo_epi8(d[0], zero);
58 d[1] = _mm_unpacklo_epi8(d[1], zero);
59 d[0] = _mm_add_epi16(d[0], dc_value);
60 d[1] = _mm_add_epi16(d[1], dc_value);
61 d[0] = _mm_packus_epi16(d[0], d[1]);
62
63 *(int *)dest = _mm_cvtsi128_si32(d[0]);
64 d[0] = _mm_srli_si128(d[0], 4);
65 *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
66 d[0] = _mm_srli_si128(d[0], 4);
67 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
68 d[0] = _mm_srli_si128(d[0], 4);
69 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
70 }
71
idct4_sse2(__m128i * in)72 void idct4_sse2(__m128i *in) {
73 const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
74 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
75 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
76 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
77 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
78 __m128i u[8], v[8];
79
80 transpose_16bit_4x4(in);
81 // stage 1
82 u[0] = _mm_unpacklo_epi16(in[0], in[1]);
83 u[1] = _mm_unpackhi_epi16(in[0], in[1]);
84 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
85 v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
86 v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
87 v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
88
89 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
90 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
91 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
92 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
93
94 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
95 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
96 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
97 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
98
99 u[0] = _mm_packs_epi32(v[0], v[1]);
100 u[1] = _mm_packs_epi32(v[3], v[2]);
101
102 // stage 2
103 in[0] = _mm_add_epi16(u[0], u[1]);
104 in[1] = _mm_sub_epi16(u[0], u[1]);
105 in[1] = _mm_shuffle_epi32(in[1], 0x4E);
106 }
107
iadst4_sse2(__m128i * in)108 void iadst4_sse2(__m128i *in) {
109 const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
110 const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
111 const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
112 const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
113 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
114 const __m128i kZero = _mm_set1_epi16(0);
115 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
116 __m128i u[8], v[8], in7;
117
118 transpose_16bit_4x4(in);
119 in7 = _mm_srli_si128(in[1], 8);
120 in7 = _mm_add_epi16(in7, in[0]);
121 in7 = _mm_sub_epi16(in7, in[1]);
122
123 u[0] = _mm_unpacklo_epi16(in[0], in[1]);
124 u[1] = _mm_unpackhi_epi16(in[0], in[1]);
125 u[2] = _mm_unpacklo_epi16(in7, kZero);
126 u[3] = _mm_unpackhi_epi16(in[0], kZero);
127
128 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
129 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
130 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
131 v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
132 v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
133 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
134
135 u[0] = _mm_add_epi32(v[0], v[1]);
136 u[1] = _mm_add_epi32(v[3], v[4]);
137 u[2] = v[2];
138 u[3] = _mm_add_epi32(u[0], u[1]);
139 u[4] = _mm_slli_epi32(v[5], 2);
140 u[5] = _mm_add_epi32(u[3], v[5]);
141 u[6] = _mm_sub_epi32(u[5], u[4]);
142
143 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
144 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
145 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
146 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
147
148 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
149 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
150 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
151 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
152
153 in[0] = _mm_packs_epi32(u[0], u[1]);
154 in[1] = _mm_packs_epi32(u[2], u[3]);
155 }
156
157 #define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
158 { \
159 tmp0 = _mm_madd_epi16(lo_0, cst0); \
160 tmp1 = _mm_madd_epi16(hi_0, cst0); \
161 tmp2 = _mm_madd_epi16(lo_0, cst1); \
162 tmp3 = _mm_madd_epi16(hi_0, cst1); \
163 \
164 tmp0 = _mm_add_epi32(tmp0, rounding); \
165 tmp1 = _mm_add_epi32(tmp1, rounding); \
166 tmp2 = _mm_add_epi32(tmp2, rounding); \
167 tmp3 = _mm_add_epi32(tmp3, rounding); \
168 \
169 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
170 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
171 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
172 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
173 \
174 res0 = _mm_packs_epi32(tmp0, tmp1); \
175 res1 = _mm_packs_epi32(tmp2, tmp3); \
176 }
177
178 #define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
179 out4, out5, out6, out7) \
180 { \
181 /* Stage1 */ \
182 { \
183 const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
184 const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
185 const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
186 const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
187 \
188 MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \
189 stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \
190 } \
191 \
192 /* Stage2 */ \
193 { \
194 const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
195 const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
196 const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
197 const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
198 \
199 MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \
200 stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \
201 \
202 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
203 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
204 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
205 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
206 } \
207 \
208 /* Stage3 */ \
209 { \
210 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
211 const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
212 \
213 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
214 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
215 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
216 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
217 \
218 tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
219 tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
220 tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
221 tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
222 \
223 tmp0 = _mm_add_epi32(tmp0, rounding); \
224 tmp1 = _mm_add_epi32(tmp1, rounding); \
225 tmp2 = _mm_add_epi32(tmp2, rounding); \
226 tmp3 = _mm_add_epi32(tmp3, rounding); \
227 \
228 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
229 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
230 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
231 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
232 \
233 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
234 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
235 } \
236 \
237 /* Stage4 */ \
238 out0 = _mm_add_epi16(stp1_0, stp2_7); \
239 out1 = _mm_add_epi16(stp1_1, stp1_6); \
240 out2 = _mm_add_epi16(stp1_2, stp1_5); \
241 out3 = _mm_add_epi16(stp1_3, stp2_4); \
242 out4 = _mm_sub_epi16(stp1_3, stp2_4); \
243 out5 = _mm_sub_epi16(stp1_2, stp1_5); \
244 out6 = _mm_sub_epi16(stp1_1, stp1_6); \
245 out7 = _mm_sub_epi16(stp1_0, stp2_7); \
246 }
247
vpx_idct8x8_64_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)248 void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
249 int stride) {
250 const __m128i zero = _mm_setzero_si128();
251 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
252 const __m128i final_rounding = _mm_set1_epi16(1 << 4);
253 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
254 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
255 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
256 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
257 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
258 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
259 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
260 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
261
262 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
263 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
264 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
265 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
266 int i;
267
268 // Load input data.
269 in0 = load_input_data(input);
270 in1 = load_input_data(input + 8 * 1);
271 in2 = load_input_data(input + 8 * 2);
272 in3 = load_input_data(input + 8 * 3);
273 in4 = load_input_data(input + 8 * 4);
274 in5 = load_input_data(input + 8 * 5);
275 in6 = load_input_data(input + 8 * 6);
276 in7 = load_input_data(input + 8 * 7);
277
278 // 2-D
279 for (i = 0; i < 2; i++) {
280 // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
281 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
282 in4, in5, in6, in7);
283
284 // 4-stage 1D idct8x8
285 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
286 in6, in7);
287 }
288
289 // Final rounding and shift
290 in0 = _mm_adds_epi16(in0, final_rounding);
291 in1 = _mm_adds_epi16(in1, final_rounding);
292 in2 = _mm_adds_epi16(in2, final_rounding);
293 in3 = _mm_adds_epi16(in3, final_rounding);
294 in4 = _mm_adds_epi16(in4, final_rounding);
295 in5 = _mm_adds_epi16(in5, final_rounding);
296 in6 = _mm_adds_epi16(in6, final_rounding);
297 in7 = _mm_adds_epi16(in7, final_rounding);
298
299 in0 = _mm_srai_epi16(in0, 5);
300 in1 = _mm_srai_epi16(in1, 5);
301 in2 = _mm_srai_epi16(in2, 5);
302 in3 = _mm_srai_epi16(in3, 5);
303 in4 = _mm_srai_epi16(in4, 5);
304 in5 = _mm_srai_epi16(in5, 5);
305 in6 = _mm_srai_epi16(in6, 5);
306 in7 = _mm_srai_epi16(in7, 5);
307
308 RECON_AND_STORE(dest + 0 * stride, in0);
309 RECON_AND_STORE(dest + 1 * stride, in1);
310 RECON_AND_STORE(dest + 2 * stride, in2);
311 RECON_AND_STORE(dest + 3 * stride, in3);
312 RECON_AND_STORE(dest + 4 * stride, in4);
313 RECON_AND_STORE(dest + 5 * stride, in5);
314 RECON_AND_STORE(dest + 6 * stride, in6);
315 RECON_AND_STORE(dest + 7 * stride, in7);
316 }
317
vpx_idct8x8_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)318 void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
319 int stride) {
320 __m128i dc_value;
321 const __m128i zero = _mm_setzero_si128();
322 int a;
323
324 a = (int)dct_const_round_shift(input[0] * cospi_16_64);
325 a = (int)dct_const_round_shift(a * cospi_16_64);
326 a = ROUND_POWER_OF_TWO(a, 5);
327
328 dc_value = _mm_set1_epi16(a);
329
330 RECON_AND_STORE(dest + 0 * stride, dc_value);
331 RECON_AND_STORE(dest + 1 * stride, dc_value);
332 RECON_AND_STORE(dest + 2 * stride, dc_value);
333 RECON_AND_STORE(dest + 3 * stride, dc_value);
334 RECON_AND_STORE(dest + 4 * stride, dc_value);
335 RECON_AND_STORE(dest + 5 * stride, dc_value);
336 RECON_AND_STORE(dest + 6 * stride, dc_value);
337 RECON_AND_STORE(dest + 7 * stride, dc_value);
338 }
339
idct8_sse2(__m128i * in)340 void idct8_sse2(__m128i *in) {
341 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
342 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
343 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
344 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
345 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
346 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
347 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
348 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
349 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
350
351 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
352 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
353 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
354 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
355
356 // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
357 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
358 in1, in2, in3, in4, in5, in6, in7);
359
360 // 4-stage 1D idct8x8
361 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
362 in[4], in[5], in[6], in[7]);
363 }
364
iadst8_sse2(__m128i * in)365 void iadst8_sse2(__m128i *in) {
366 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
367 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
368 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
369 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
370 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
371 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
372 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
373 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
374 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
375 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
376 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
377 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
378 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
379 const __m128i k__const_0 = _mm_set1_epi16(0);
380 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
381
382 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
383 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
384 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
385 __m128i s0, s1, s2, s3, s4, s5, s6, s7;
386 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
387
388 // transpose
389 array_transpose_8x8(in, in);
390
391 // properly aligned for butterfly input
392 in0 = in[7];
393 in1 = in[0];
394 in2 = in[5];
395 in3 = in[2];
396 in4 = in[3];
397 in5 = in[4];
398 in6 = in[1];
399 in7 = in[6];
400
401 // column transformation
402 // stage 1
403 // interleave and multiply/add into 32-bit integer
404 s0 = _mm_unpacklo_epi16(in0, in1);
405 s1 = _mm_unpackhi_epi16(in0, in1);
406 s2 = _mm_unpacklo_epi16(in2, in3);
407 s3 = _mm_unpackhi_epi16(in2, in3);
408 s4 = _mm_unpacklo_epi16(in4, in5);
409 s5 = _mm_unpackhi_epi16(in4, in5);
410 s6 = _mm_unpacklo_epi16(in6, in7);
411 s7 = _mm_unpackhi_epi16(in6, in7);
412
413 u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
414 u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
415 u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
416 u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
417 u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
418 u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
419 u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
420 u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
421 u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
422 u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
423 u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
424 u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
425 u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
426 u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
427 u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
428 u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
429
430 // addition
431 w0 = _mm_add_epi32(u0, u8);
432 w1 = _mm_add_epi32(u1, u9);
433 w2 = _mm_add_epi32(u2, u10);
434 w3 = _mm_add_epi32(u3, u11);
435 w4 = _mm_add_epi32(u4, u12);
436 w5 = _mm_add_epi32(u5, u13);
437 w6 = _mm_add_epi32(u6, u14);
438 w7 = _mm_add_epi32(u7, u15);
439 w8 = _mm_sub_epi32(u0, u8);
440 w9 = _mm_sub_epi32(u1, u9);
441 w10 = _mm_sub_epi32(u2, u10);
442 w11 = _mm_sub_epi32(u3, u11);
443 w12 = _mm_sub_epi32(u4, u12);
444 w13 = _mm_sub_epi32(u5, u13);
445 w14 = _mm_sub_epi32(u6, u14);
446 w15 = _mm_sub_epi32(u7, u15);
447
448 // shift and rounding
449 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
450 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
451 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
452 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
453 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
454 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
455 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
456 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
457 v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
458 v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
459 v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
460 v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
461 v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
462 v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
463 v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
464 v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
465
466 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
467 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
468 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
469 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
470 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
471 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
472 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
473 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
474 u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
475 u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
476 u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
477 u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
478 u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
479 u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
480 u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
481 u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
482
483 // back to 16-bit and pack 8 integers into __m128i
484 in[0] = _mm_packs_epi32(u0, u1);
485 in[1] = _mm_packs_epi32(u2, u3);
486 in[2] = _mm_packs_epi32(u4, u5);
487 in[3] = _mm_packs_epi32(u6, u7);
488 in[4] = _mm_packs_epi32(u8, u9);
489 in[5] = _mm_packs_epi32(u10, u11);
490 in[6] = _mm_packs_epi32(u12, u13);
491 in[7] = _mm_packs_epi32(u14, u15);
492
493 // stage 2
494 s0 = _mm_add_epi16(in[0], in[2]);
495 s1 = _mm_add_epi16(in[1], in[3]);
496 s2 = _mm_sub_epi16(in[0], in[2]);
497 s3 = _mm_sub_epi16(in[1], in[3]);
498 u0 = _mm_unpacklo_epi16(in[4], in[5]);
499 u1 = _mm_unpackhi_epi16(in[4], in[5]);
500 u2 = _mm_unpacklo_epi16(in[6], in[7]);
501 u3 = _mm_unpackhi_epi16(in[6], in[7]);
502
503 v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
504 v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
505 v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
506 v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
507 v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
508 v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
509 v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
510 v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
511
512 w0 = _mm_add_epi32(v0, v4);
513 w1 = _mm_add_epi32(v1, v5);
514 w2 = _mm_add_epi32(v2, v6);
515 w3 = _mm_add_epi32(v3, v7);
516 w4 = _mm_sub_epi32(v0, v4);
517 w5 = _mm_sub_epi32(v1, v5);
518 w6 = _mm_sub_epi32(v2, v6);
519 w7 = _mm_sub_epi32(v3, v7);
520
521 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
522 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
523 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
524 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
525 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
526 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
527 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
528 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
529
530 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
531 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
532 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
533 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
534 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
535 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
536 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
537 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
538
539 // back to 16-bit intergers
540 s4 = _mm_packs_epi32(u0, u1);
541 s5 = _mm_packs_epi32(u2, u3);
542 s6 = _mm_packs_epi32(u4, u5);
543 s7 = _mm_packs_epi32(u6, u7);
544
545 // stage 3
546 u0 = _mm_unpacklo_epi16(s2, s3);
547 u1 = _mm_unpackhi_epi16(s2, s3);
548 u2 = _mm_unpacklo_epi16(s6, s7);
549 u3 = _mm_unpackhi_epi16(s6, s7);
550
551 v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
552 v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
553 v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
554 v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
555 v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
556 v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
557 v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
558 v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
559
560 u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
561 u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
562 u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
563 u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
564 u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
565 u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
566 u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
567 u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
568
569 v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
570 v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
571 v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
572 v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
573 v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
574 v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
575 v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
576 v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
577
578 s2 = _mm_packs_epi32(v0, v1);
579 s3 = _mm_packs_epi32(v2, v3);
580 s6 = _mm_packs_epi32(v4, v5);
581 s7 = _mm_packs_epi32(v6, v7);
582
583 in[0] = s0;
584 in[1] = _mm_sub_epi16(k__const_0, s4);
585 in[2] = s6;
586 in[3] = _mm_sub_epi16(k__const_0, s2);
587 in[4] = s3;
588 in[5] = _mm_sub_epi16(k__const_0, s7);
589 in[6] = s5;
590 in[7] = _mm_sub_epi16(k__const_0, s1);
591 }
592
vpx_idct8x8_12_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)593 void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
594 int stride) {
595 const __m128i zero = _mm_setzero_si128();
596 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
597 const __m128i final_rounding = _mm_set1_epi16(1 << 4);
598 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
599 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
600 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
601 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
602 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
603 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
604 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
605 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
606 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
607
608 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
609 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
610 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
611 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
612
613 // Rows. Load 4-row input data.
614 in0 = load_input_data(input);
615 in1 = load_input_data(input + 8 * 1);
616 in2 = load_input_data(input + 8 * 2);
617 in3 = load_input_data(input + 8 * 3);
618
619 // 8x4 Transpose
620 TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
621 // Stage1
622 {
623 const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
624 const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
625
626 tmp0 = _mm_madd_epi16(lo_17, stg1_0);
627 tmp2 = _mm_madd_epi16(lo_17, stg1_1);
628 tmp4 = _mm_madd_epi16(lo_35, stg1_2);
629 tmp6 = _mm_madd_epi16(lo_35, stg1_3);
630
631 tmp0 = _mm_add_epi32(tmp0, rounding);
632 tmp2 = _mm_add_epi32(tmp2, rounding);
633 tmp4 = _mm_add_epi32(tmp4, rounding);
634 tmp6 = _mm_add_epi32(tmp6, rounding);
635 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
636 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
637 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
638 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
639
640 stp1_4 = _mm_packs_epi32(tmp0, tmp2);
641 stp1_5 = _mm_packs_epi32(tmp4, tmp6);
642 }
643
644 // Stage2
645 {
646 const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
647 const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
648
649 tmp0 = _mm_madd_epi16(lo_04, stg2_0);
650 tmp2 = _mm_madd_epi16(lo_04, stg2_1);
651 tmp4 = _mm_madd_epi16(lo_26, stg2_2);
652 tmp6 = _mm_madd_epi16(lo_26, stg2_3);
653
654 tmp0 = _mm_add_epi32(tmp0, rounding);
655 tmp2 = _mm_add_epi32(tmp2, rounding);
656 tmp4 = _mm_add_epi32(tmp4, rounding);
657 tmp6 = _mm_add_epi32(tmp6, rounding);
658 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
659 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
660 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
661 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
662
663 stp2_0 = _mm_packs_epi32(tmp0, tmp2);
664 stp2_2 = _mm_packs_epi32(tmp6, tmp4);
665
666 tmp0 = _mm_add_epi16(stp1_4, stp1_5);
667 tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
668
669 stp2_4 = tmp0;
670 stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
671 stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
672 }
673
674 // Stage3
675 {
676 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
677
678 tmp4 = _mm_add_epi16(stp2_0, stp2_2);
679 tmp6 = _mm_sub_epi16(stp2_0, stp2_2);
680
681 stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
682 stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
683
684 tmp0 = _mm_madd_epi16(lo_56, stg3_0);
685 tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
686
687 tmp0 = _mm_add_epi32(tmp0, rounding);
688 tmp2 = _mm_add_epi32(tmp2, rounding);
689 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
690 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
691
692 stp1_5 = _mm_packs_epi32(tmp0, tmp2);
693 }
694
695 // Stage4
696 tmp0 = _mm_add_epi16(stp1_3, stp2_4);
697 tmp1 = _mm_add_epi16(stp1_2, stp1_5);
698 tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
699 tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
700
701 TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
702
703 IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
704 in5, in6, in7);
705 // Final rounding and shift
706 in0 = _mm_adds_epi16(in0, final_rounding);
707 in1 = _mm_adds_epi16(in1, final_rounding);
708 in2 = _mm_adds_epi16(in2, final_rounding);
709 in3 = _mm_adds_epi16(in3, final_rounding);
710 in4 = _mm_adds_epi16(in4, final_rounding);
711 in5 = _mm_adds_epi16(in5, final_rounding);
712 in6 = _mm_adds_epi16(in6, final_rounding);
713 in7 = _mm_adds_epi16(in7, final_rounding);
714
715 in0 = _mm_srai_epi16(in0, 5);
716 in1 = _mm_srai_epi16(in1, 5);
717 in2 = _mm_srai_epi16(in2, 5);
718 in3 = _mm_srai_epi16(in3, 5);
719 in4 = _mm_srai_epi16(in4, 5);
720 in5 = _mm_srai_epi16(in5, 5);
721 in6 = _mm_srai_epi16(in6, 5);
722 in7 = _mm_srai_epi16(in7, 5);
723
724 RECON_AND_STORE(dest + 0 * stride, in0);
725 RECON_AND_STORE(dest + 1 * stride, in1);
726 RECON_AND_STORE(dest + 2 * stride, in2);
727 RECON_AND_STORE(dest + 3 * stride, in3);
728 RECON_AND_STORE(dest + 4 * stride, in4);
729 RECON_AND_STORE(dest + 5 * stride, in5);
730 RECON_AND_STORE(dest + 6 * stride, in6);
731 RECON_AND_STORE(dest + 7 * stride, in7);
732 }
733
734 #define IDCT16 \
735 /* Stage2 */ \
736 { \
737 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
738 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
739 const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
740 const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
741 const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
742 const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
743 const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
744 const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
745 \
746 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \
747 stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \
748 \
749 MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
750 stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
751 } \
752 \
753 /* Stage3 */ \
754 { \
755 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
756 const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
757 const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
758 const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
759 \
760 MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
761 stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \
762 \
763 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
764 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
765 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
766 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
767 \
768 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
769 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
770 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
771 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
772 } \
773 \
774 /* Stage4 */ \
775 { \
776 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
777 const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
778 const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
779 const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
780 \
781 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
782 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
783 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
784 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
785 \
786 MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \
787 stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
788 \
789 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
790 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
791 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
792 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
793 \
794 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
795 stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
796 stp2_13) \
797 } \
798 \
799 /* Stage5 */ \
800 { \
801 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
802 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
803 \
804 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
805 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
806 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
807 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
808 \
809 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
810 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
811 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
812 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
813 \
814 tmp0 = _mm_add_epi32(tmp0, rounding); \
815 tmp1 = _mm_add_epi32(tmp1, rounding); \
816 tmp2 = _mm_add_epi32(tmp2, rounding); \
817 tmp3 = _mm_add_epi32(tmp3, rounding); \
818 \
819 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
820 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
821 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
822 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
823 \
824 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
825 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
826 \
827 stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
828 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
829 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
830 stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
831 \
832 stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
833 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
834 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
835 stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
836 } \
837 \
838 /* Stage6 */ \
839 { \
840 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
841 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
842 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
843 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
844 \
845 stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
846 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
847 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
848 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
849 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
850 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
851 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
852 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
853 \
854 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
855 stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
856 stp2_12) \
857 }
858
859 #define IDCT16_10 \
860 /* Stage2 */ \
861 { \
862 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
863 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
864 const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
865 const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
866 \
867 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
868 stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \
869 stp1_12_0) \
870 } \
871 \
872 /* Stage3 */ \
873 { \
874 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
875 const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
876 \
877 MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
878 \
879 stp1_9 = stp1_8_0; \
880 stp1_10 = stp1_11; \
881 \
882 stp1_13 = stp1_12_0; \
883 stp1_14 = stp1_15; \
884 } \
885 \
886 /* Stage4 */ \
887 { \
888 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
889 const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
890 \
891 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
892 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
893 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
894 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
895 \
896 MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \
897 stp2_5 = stp2_4; \
898 stp2_6 = stp2_7; \
899 \
900 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
901 stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
902 stp2_13) \
903 } \
904 \
905 /* Stage5 */ \
906 { \
907 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
908 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
909 \
910 stp1_2 = stp1_1; \
911 stp1_3 = stp1_0; \
912 \
913 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
914 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
915 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
916 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
917 \
918 tmp0 = _mm_add_epi32(tmp0, rounding); \
919 tmp1 = _mm_add_epi32(tmp1, rounding); \
920 tmp2 = _mm_add_epi32(tmp2, rounding); \
921 tmp3 = _mm_add_epi32(tmp3, rounding); \
922 \
923 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
924 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
925 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
926 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
927 \
928 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
929 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
930 \
931 stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
932 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
933 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
934 stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
935 \
936 stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
937 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
938 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
939 stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
940 } \
941 \
942 /* Stage6 */ \
943 { \
944 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
945 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
946 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
947 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
948 \
949 stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
950 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
951 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
952 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
953 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
954 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
955 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
956 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
957 \
958 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
959 stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
960 stp2_12) \
961 }
962
vpx_idct16x16_256_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)963 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
964 int stride) {
965 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
966 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
967 const __m128i zero = _mm_setzero_si128();
968
969 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
970 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
971 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
972 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
973 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
974 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
975 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
976 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
977
978 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
979 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
980 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
981 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
982
983 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
984 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
985 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
986 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
987 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
988 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
989 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
990 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
991
992 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
993
994 __m128i in[16], l[16], r[16], *curr1;
995 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
996 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
997 stp1_8_0, stp1_12_0;
998 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
999 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1000 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1001 int i;
1002
1003 curr1 = l;
1004 for (i = 0; i < 2; i++) {
1005 // 1-D idct
1006
1007 // Load input data.
1008 in[0] = load_input_data(input);
1009 in[8] = load_input_data(input + 8 * 1);
1010 in[1] = load_input_data(input + 8 * 2);
1011 in[9] = load_input_data(input + 8 * 3);
1012 in[2] = load_input_data(input + 8 * 4);
1013 in[10] = load_input_data(input + 8 * 5);
1014 in[3] = load_input_data(input + 8 * 6);
1015 in[11] = load_input_data(input + 8 * 7);
1016 in[4] = load_input_data(input + 8 * 8);
1017 in[12] = load_input_data(input + 8 * 9);
1018 in[5] = load_input_data(input + 8 * 10);
1019 in[13] = load_input_data(input + 8 * 11);
1020 in[6] = load_input_data(input + 8 * 12);
1021 in[14] = load_input_data(input + 8 * 13);
1022 in[7] = load_input_data(input + 8 * 14);
1023 in[15] = load_input_data(input + 8 * 15);
1024
1025 array_transpose_8x8(in, in);
1026 array_transpose_8x8(in + 8, in + 8);
1027
1028 IDCT16
1029
1030 // Stage7
1031 curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1032 curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1033 curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1034 curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1035 curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1036 curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1037 curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1038 curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1039 curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1040 curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1041 curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1042 curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1043 curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1044 curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1045 curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1046 curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
1047
1048 curr1 = r;
1049 input += 128;
1050 }
1051 for (i = 0; i < 2; i++) {
1052 int j;
1053 // 1-D idct
1054 array_transpose_8x8(l + i * 8, in);
1055 array_transpose_8x8(r + i * 8, in + 8);
1056
1057 IDCT16
1058
1059 // 2-D
1060 in[0] = _mm_add_epi16(stp2_0, stp1_15);
1061 in[1] = _mm_add_epi16(stp2_1, stp1_14);
1062 in[2] = _mm_add_epi16(stp2_2, stp2_13);
1063 in[3] = _mm_add_epi16(stp2_3, stp2_12);
1064 in[4] = _mm_add_epi16(stp2_4, stp2_11);
1065 in[5] = _mm_add_epi16(stp2_5, stp2_10);
1066 in[6] = _mm_add_epi16(stp2_6, stp1_9);
1067 in[7] = _mm_add_epi16(stp2_7, stp1_8);
1068 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1069 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1070 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1071 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1072 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1073 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1074 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1075 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1076
1077 for (j = 0; j < 16; ++j) {
1078 // Final rounding and shift
1079 in[j] = _mm_adds_epi16(in[j], final_rounding);
1080 in[j] = _mm_srai_epi16(in[j], 6);
1081 RECON_AND_STORE(dest + j * stride, in[j]);
1082 }
1083
1084 dest += 8;
1085 }
1086 }
1087
vpx_idct16x16_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)1088 void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
1089 int stride) {
1090 __m128i dc_value;
1091 const __m128i zero = _mm_setzero_si128();
1092 int a, i;
1093
1094 a = (int)dct_const_round_shift(input[0] * cospi_16_64);
1095 a = (int)dct_const_round_shift(a * cospi_16_64);
1096 a = ROUND_POWER_OF_TWO(a, 6);
1097
1098 dc_value = _mm_set1_epi16(a);
1099
1100 for (i = 0; i < 16; ++i) {
1101 RECON_AND_STORE(dest + 0, dc_value);
1102 RECON_AND_STORE(dest + 8, dc_value);
1103 dest += stride;
1104 }
1105 }
1106
iadst16_8col(__m128i * in)1107 static void iadst16_8col(__m128i *in) {
1108 // perform 16x16 1-D ADST for 8 columns
1109 __m128i s[16], x[16], u[32], v[32];
1110 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1111 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1112 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1113 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1114 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1115 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1116 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1117 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1118 const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1119 const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1120 const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1121 const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1122 const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1123 const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1124 const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1125 const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1126 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1127 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1128 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1129 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1130 const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1131 const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1132 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1133 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1134 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1135 const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
1136 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1137 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1138 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1139 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1140 const __m128i kZero = _mm_set1_epi16(0);
1141
1142 u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1143 u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1144 u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1145 u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1146 u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1147 u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1148 u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1149 u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1150 u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1151 u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1152 u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1153 u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1154 u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1155 u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1156 u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1157 u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1158
1159 v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1160 v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1161 v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1162 v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1163 v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1164 v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1165 v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1166 v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1167 v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1168 v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1169 v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1170 v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1171 v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1172 v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1173 v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1174 v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1175 v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1176 v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1177 v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1178 v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1179 v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1180 v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1181 v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1182 v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1183 v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1184 v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1185 v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1186 v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1187 v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1188 v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1189 v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1190 v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1191
1192 u[0] = _mm_add_epi32(v[0], v[16]);
1193 u[1] = _mm_add_epi32(v[1], v[17]);
1194 u[2] = _mm_add_epi32(v[2], v[18]);
1195 u[3] = _mm_add_epi32(v[3], v[19]);
1196 u[4] = _mm_add_epi32(v[4], v[20]);
1197 u[5] = _mm_add_epi32(v[5], v[21]);
1198 u[6] = _mm_add_epi32(v[6], v[22]);
1199 u[7] = _mm_add_epi32(v[7], v[23]);
1200 u[8] = _mm_add_epi32(v[8], v[24]);
1201 u[9] = _mm_add_epi32(v[9], v[25]);
1202 u[10] = _mm_add_epi32(v[10], v[26]);
1203 u[11] = _mm_add_epi32(v[11], v[27]);
1204 u[12] = _mm_add_epi32(v[12], v[28]);
1205 u[13] = _mm_add_epi32(v[13], v[29]);
1206 u[14] = _mm_add_epi32(v[14], v[30]);
1207 u[15] = _mm_add_epi32(v[15], v[31]);
1208 u[16] = _mm_sub_epi32(v[0], v[16]);
1209 u[17] = _mm_sub_epi32(v[1], v[17]);
1210 u[18] = _mm_sub_epi32(v[2], v[18]);
1211 u[19] = _mm_sub_epi32(v[3], v[19]);
1212 u[20] = _mm_sub_epi32(v[4], v[20]);
1213 u[21] = _mm_sub_epi32(v[5], v[21]);
1214 u[22] = _mm_sub_epi32(v[6], v[22]);
1215 u[23] = _mm_sub_epi32(v[7], v[23]);
1216 u[24] = _mm_sub_epi32(v[8], v[24]);
1217 u[25] = _mm_sub_epi32(v[9], v[25]);
1218 u[26] = _mm_sub_epi32(v[10], v[26]);
1219 u[27] = _mm_sub_epi32(v[11], v[27]);
1220 u[28] = _mm_sub_epi32(v[12], v[28]);
1221 u[29] = _mm_sub_epi32(v[13], v[29]);
1222 u[30] = _mm_sub_epi32(v[14], v[30]);
1223 u[31] = _mm_sub_epi32(v[15], v[31]);
1224
1225 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1226 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1227 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1228 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1229 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1230 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1231 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1232 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1233 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1234 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1235 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1236 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1237 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1238 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1239 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1240 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1241 v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1242 v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1243 v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1244 v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1245 v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1246 v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1247 v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1248 v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1249 v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1250 v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1251 v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1252 v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1253 v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1254 v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1255 v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1256 v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1257
1258 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1259 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1260 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1261 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1262 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1263 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1264 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1265 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1266 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1267 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1268 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1269 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1270 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1271 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1272 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1273 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1274 u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1275 u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1276 u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1277 u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1278 u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1279 u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1280 u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1281 u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1282 u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1283 u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1284 u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1285 u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1286 u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1287 u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1288 u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1289 u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1290
1291 s[0] = _mm_packs_epi32(u[0], u[1]);
1292 s[1] = _mm_packs_epi32(u[2], u[3]);
1293 s[2] = _mm_packs_epi32(u[4], u[5]);
1294 s[3] = _mm_packs_epi32(u[6], u[7]);
1295 s[4] = _mm_packs_epi32(u[8], u[9]);
1296 s[5] = _mm_packs_epi32(u[10], u[11]);
1297 s[6] = _mm_packs_epi32(u[12], u[13]);
1298 s[7] = _mm_packs_epi32(u[14], u[15]);
1299 s[8] = _mm_packs_epi32(u[16], u[17]);
1300 s[9] = _mm_packs_epi32(u[18], u[19]);
1301 s[10] = _mm_packs_epi32(u[20], u[21]);
1302 s[11] = _mm_packs_epi32(u[22], u[23]);
1303 s[12] = _mm_packs_epi32(u[24], u[25]);
1304 s[13] = _mm_packs_epi32(u[26], u[27]);
1305 s[14] = _mm_packs_epi32(u[28], u[29]);
1306 s[15] = _mm_packs_epi32(u[30], u[31]);
1307
1308 // stage 2
1309 u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1310 u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1311 u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1312 u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1313 u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1314 u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1315 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1316 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1317
1318 v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1319 v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1320 v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1321 v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1322 v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1323 v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1324 v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1325 v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1326 v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1327 v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1328 v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1329 v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1330 v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1331 v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1332 v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1333 v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1334
1335 u[0] = _mm_add_epi32(v[0], v[8]);
1336 u[1] = _mm_add_epi32(v[1], v[9]);
1337 u[2] = _mm_add_epi32(v[2], v[10]);
1338 u[3] = _mm_add_epi32(v[3], v[11]);
1339 u[4] = _mm_add_epi32(v[4], v[12]);
1340 u[5] = _mm_add_epi32(v[5], v[13]);
1341 u[6] = _mm_add_epi32(v[6], v[14]);
1342 u[7] = _mm_add_epi32(v[7], v[15]);
1343 u[8] = _mm_sub_epi32(v[0], v[8]);
1344 u[9] = _mm_sub_epi32(v[1], v[9]);
1345 u[10] = _mm_sub_epi32(v[2], v[10]);
1346 u[11] = _mm_sub_epi32(v[3], v[11]);
1347 u[12] = _mm_sub_epi32(v[4], v[12]);
1348 u[13] = _mm_sub_epi32(v[5], v[13]);
1349 u[14] = _mm_sub_epi32(v[6], v[14]);
1350 u[15] = _mm_sub_epi32(v[7], v[15]);
1351
1352 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1353 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1354 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1355 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1356 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1357 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1358 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1359 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1360 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1361 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1362 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1363 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1364 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1365 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1366 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1367 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1368
1369 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1370 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1371 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1372 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1373 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1374 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1375 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1376 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1377 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1378 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1379 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1380 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1381 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1382 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1383 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1384 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1385
1386 x[0] = _mm_add_epi16(s[0], s[4]);
1387 x[1] = _mm_add_epi16(s[1], s[5]);
1388 x[2] = _mm_add_epi16(s[2], s[6]);
1389 x[3] = _mm_add_epi16(s[3], s[7]);
1390 x[4] = _mm_sub_epi16(s[0], s[4]);
1391 x[5] = _mm_sub_epi16(s[1], s[5]);
1392 x[6] = _mm_sub_epi16(s[2], s[6]);
1393 x[7] = _mm_sub_epi16(s[3], s[7]);
1394 x[8] = _mm_packs_epi32(u[0], u[1]);
1395 x[9] = _mm_packs_epi32(u[2], u[3]);
1396 x[10] = _mm_packs_epi32(u[4], u[5]);
1397 x[11] = _mm_packs_epi32(u[6], u[7]);
1398 x[12] = _mm_packs_epi32(u[8], u[9]);
1399 x[13] = _mm_packs_epi32(u[10], u[11]);
1400 x[14] = _mm_packs_epi32(u[12], u[13]);
1401 x[15] = _mm_packs_epi32(u[14], u[15]);
1402
1403 // stage 3
1404 u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1405 u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1406 u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1407 u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1408 u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1409 u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1410 u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1411 u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1412
1413 v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1414 v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1415 v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1416 v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1417 v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1418 v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1419 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1420 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1421 v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1422 v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1423 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1424 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1425 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1426 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1427 v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1428 v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1429
1430 u[0] = _mm_add_epi32(v[0], v[4]);
1431 u[1] = _mm_add_epi32(v[1], v[5]);
1432 u[2] = _mm_add_epi32(v[2], v[6]);
1433 u[3] = _mm_add_epi32(v[3], v[7]);
1434 u[4] = _mm_sub_epi32(v[0], v[4]);
1435 u[5] = _mm_sub_epi32(v[1], v[5]);
1436 u[6] = _mm_sub_epi32(v[2], v[6]);
1437 u[7] = _mm_sub_epi32(v[3], v[7]);
1438 u[8] = _mm_add_epi32(v[8], v[12]);
1439 u[9] = _mm_add_epi32(v[9], v[13]);
1440 u[10] = _mm_add_epi32(v[10], v[14]);
1441 u[11] = _mm_add_epi32(v[11], v[15]);
1442 u[12] = _mm_sub_epi32(v[8], v[12]);
1443 u[13] = _mm_sub_epi32(v[9], v[13]);
1444 u[14] = _mm_sub_epi32(v[10], v[14]);
1445 u[15] = _mm_sub_epi32(v[11], v[15]);
1446
1447 u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1448 u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1449 u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1450 u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1451 u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1452 u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1453 u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1454 u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1455 u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1456 u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1457 u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1458 u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1459 u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1460 u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1461 u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1462 u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1463
1464 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1465 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1466 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1467 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1468 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1469 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1470 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1471 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1472 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1473 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1474 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1475 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1476 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1477 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1478 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1479 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1480
1481 s[0] = _mm_add_epi16(x[0], x[2]);
1482 s[1] = _mm_add_epi16(x[1], x[3]);
1483 s[2] = _mm_sub_epi16(x[0], x[2]);
1484 s[3] = _mm_sub_epi16(x[1], x[3]);
1485 s[4] = _mm_packs_epi32(v[0], v[1]);
1486 s[5] = _mm_packs_epi32(v[2], v[3]);
1487 s[6] = _mm_packs_epi32(v[4], v[5]);
1488 s[7] = _mm_packs_epi32(v[6], v[7]);
1489 s[8] = _mm_add_epi16(x[8], x[10]);
1490 s[9] = _mm_add_epi16(x[9], x[11]);
1491 s[10] = _mm_sub_epi16(x[8], x[10]);
1492 s[11] = _mm_sub_epi16(x[9], x[11]);
1493 s[12] = _mm_packs_epi32(v[8], v[9]);
1494 s[13] = _mm_packs_epi32(v[10], v[11]);
1495 s[14] = _mm_packs_epi32(v[12], v[13]);
1496 s[15] = _mm_packs_epi32(v[14], v[15]);
1497
1498 // stage 4
1499 u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1500 u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1501 u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1502 u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1503 u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1504 u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1505 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1506 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1507
1508 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1509 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1510 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1511 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1512 v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1513 v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1514 v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1515 v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1516 v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1517 v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1518 v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1519 v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1520 v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1521 v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1522 v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1523 v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1524
1525 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1526 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1527 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1528 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1529 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1530 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1531 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1532 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1533 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1534 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1535 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1536 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1537 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1538 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1539 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1540 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1541
1542 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1543 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1544 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1545 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1546 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1547 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1548 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1549 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1550 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1551 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1552 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1553 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1554 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1555 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1556 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1557 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1558
1559 in[0] = s[0];
1560 in[1] = _mm_sub_epi16(kZero, s[8]);
1561 in[2] = s[12];
1562 in[3] = _mm_sub_epi16(kZero, s[4]);
1563 in[4] = _mm_packs_epi32(v[4], v[5]);
1564 in[5] = _mm_packs_epi32(v[12], v[13]);
1565 in[6] = _mm_packs_epi32(v[8], v[9]);
1566 in[7] = _mm_packs_epi32(v[0], v[1]);
1567 in[8] = _mm_packs_epi32(v[2], v[3]);
1568 in[9] = _mm_packs_epi32(v[10], v[11]);
1569 in[10] = _mm_packs_epi32(v[14], v[15]);
1570 in[11] = _mm_packs_epi32(v[6], v[7]);
1571 in[12] = s[5];
1572 in[13] = _mm_sub_epi16(kZero, s[13]);
1573 in[14] = s[9];
1574 in[15] = _mm_sub_epi16(kZero, s[1]);
1575 }
1576
idct16_8col(__m128i * in)1577 static void idct16_8col(__m128i *in) {
1578 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1579 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
1580 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1581 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
1582 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1583 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
1584 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1585 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
1586 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1587 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1588 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1589 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1590 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1591 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1592 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1593 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1594 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1595 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1596 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1597 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1598 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1599 __m128i v[16], u[16], s[16], t[16];
1600
1601 // stage 1
1602 s[0] = in[0];
1603 s[1] = in[8];
1604 s[2] = in[4];
1605 s[3] = in[12];
1606 s[4] = in[2];
1607 s[5] = in[10];
1608 s[6] = in[6];
1609 s[7] = in[14];
1610 s[8] = in[1];
1611 s[9] = in[9];
1612 s[10] = in[5];
1613 s[11] = in[13];
1614 s[12] = in[3];
1615 s[13] = in[11];
1616 s[14] = in[7];
1617 s[15] = in[15];
1618
1619 // stage 2
1620 u[0] = _mm_unpacklo_epi16(s[8], s[15]);
1621 u[1] = _mm_unpackhi_epi16(s[8], s[15]);
1622 u[2] = _mm_unpacklo_epi16(s[9], s[14]);
1623 u[3] = _mm_unpackhi_epi16(s[9], s[14]);
1624 u[4] = _mm_unpacklo_epi16(s[10], s[13]);
1625 u[5] = _mm_unpackhi_epi16(s[10], s[13]);
1626 u[6] = _mm_unpacklo_epi16(s[11], s[12]);
1627 u[7] = _mm_unpackhi_epi16(s[11], s[12]);
1628
1629 v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
1630 v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
1631 v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
1632 v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
1633 v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
1634 v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
1635 v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
1636 v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
1637 v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
1638 v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
1639 v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
1640 v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
1641 v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
1642 v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
1643 v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
1644 v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
1645
1646 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1647 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1648 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1649 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1650 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1651 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1652 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1653 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1654 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1655 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1656 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1657 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1658 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1659 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1660 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1661 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1662
1663 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1664 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1665 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1666 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1667 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1668 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1669 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1670 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1671 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1672 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1673 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1674 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1675 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1676 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1677 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1678 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1679
1680 s[8] = _mm_packs_epi32(u[0], u[1]);
1681 s[15] = _mm_packs_epi32(u[2], u[3]);
1682 s[9] = _mm_packs_epi32(u[4], u[5]);
1683 s[14] = _mm_packs_epi32(u[6], u[7]);
1684 s[10] = _mm_packs_epi32(u[8], u[9]);
1685 s[13] = _mm_packs_epi32(u[10], u[11]);
1686 s[11] = _mm_packs_epi32(u[12], u[13]);
1687 s[12] = _mm_packs_epi32(u[14], u[15]);
1688
1689 // stage 3
1690 t[0] = s[0];
1691 t[1] = s[1];
1692 t[2] = s[2];
1693 t[3] = s[3];
1694 u[0] = _mm_unpacklo_epi16(s[4], s[7]);
1695 u[1] = _mm_unpackhi_epi16(s[4], s[7]);
1696 u[2] = _mm_unpacklo_epi16(s[5], s[6]);
1697 u[3] = _mm_unpackhi_epi16(s[5], s[6]);
1698
1699 v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1700 v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1701 v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1702 v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1703 v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1704 v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1705 v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1706 v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1707
1708 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1709 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1710 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1711 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1712 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1713 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1714 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1715 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1716
1717 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1718 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1719 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1720 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1721 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1722 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1723 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1724 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1725
1726 t[4] = _mm_packs_epi32(u[0], u[1]);
1727 t[7] = _mm_packs_epi32(u[2], u[3]);
1728 t[5] = _mm_packs_epi32(u[4], u[5]);
1729 t[6] = _mm_packs_epi32(u[6], u[7]);
1730 t[8] = _mm_add_epi16(s[8], s[9]);
1731 t[9] = _mm_sub_epi16(s[8], s[9]);
1732 t[10] = _mm_sub_epi16(s[11], s[10]);
1733 t[11] = _mm_add_epi16(s[10], s[11]);
1734 t[12] = _mm_add_epi16(s[12], s[13]);
1735 t[13] = _mm_sub_epi16(s[12], s[13]);
1736 t[14] = _mm_sub_epi16(s[15], s[14]);
1737 t[15] = _mm_add_epi16(s[14], s[15]);
1738
1739 // stage 4
1740 u[0] = _mm_unpacklo_epi16(t[0], t[1]);
1741 u[1] = _mm_unpackhi_epi16(t[0], t[1]);
1742 u[2] = _mm_unpacklo_epi16(t[2], t[3]);
1743 u[3] = _mm_unpackhi_epi16(t[2], t[3]);
1744 u[4] = _mm_unpacklo_epi16(t[9], t[14]);
1745 u[5] = _mm_unpackhi_epi16(t[9], t[14]);
1746 u[6] = _mm_unpacklo_epi16(t[10], t[13]);
1747 u[7] = _mm_unpackhi_epi16(t[10], t[13]);
1748
1749 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1750 v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1751 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1752 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1753 v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
1754 v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
1755 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1756 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1757 v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
1758 v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
1759 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
1760 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
1761 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
1762 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
1763 v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
1764 v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
1765
1766 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1767 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1768 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1769 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1770 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1771 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1772 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1773 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1774 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1775 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1776 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1777 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1778 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1779 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1780 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1781 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1782
1783 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1784 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1785 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1786 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1787 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1788 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1789 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1790 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1791 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1792 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1793 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1794 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1795 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1796 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1797 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1798 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1799
1800 s[0] = _mm_packs_epi32(u[0], u[1]);
1801 s[1] = _mm_packs_epi32(u[2], u[3]);
1802 s[2] = _mm_packs_epi32(u[4], u[5]);
1803 s[3] = _mm_packs_epi32(u[6], u[7]);
1804 s[4] = _mm_add_epi16(t[4], t[5]);
1805 s[5] = _mm_sub_epi16(t[4], t[5]);
1806 s[6] = _mm_sub_epi16(t[7], t[6]);
1807 s[7] = _mm_add_epi16(t[6], t[7]);
1808 s[8] = t[8];
1809 s[15] = t[15];
1810 s[9] = _mm_packs_epi32(u[8], u[9]);
1811 s[14] = _mm_packs_epi32(u[10], u[11]);
1812 s[10] = _mm_packs_epi32(u[12], u[13]);
1813 s[13] = _mm_packs_epi32(u[14], u[15]);
1814 s[11] = t[11];
1815 s[12] = t[12];
1816
1817 // stage 5
1818 t[0] = _mm_add_epi16(s[0], s[3]);
1819 t[1] = _mm_add_epi16(s[1], s[2]);
1820 t[2] = _mm_sub_epi16(s[1], s[2]);
1821 t[3] = _mm_sub_epi16(s[0], s[3]);
1822 t[4] = s[4];
1823 t[7] = s[7];
1824
1825 u[0] = _mm_unpacklo_epi16(s[5], s[6]);
1826 u[1] = _mm_unpackhi_epi16(s[5], s[6]);
1827 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
1828 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
1829 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1830 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1831 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1832 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1833 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1834 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1835 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1836 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1837 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1838 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1839 t[5] = _mm_packs_epi32(u[0], u[1]);
1840 t[6] = _mm_packs_epi32(u[2], u[3]);
1841
1842 t[8] = _mm_add_epi16(s[8], s[11]);
1843 t[9] = _mm_add_epi16(s[9], s[10]);
1844 t[10] = _mm_sub_epi16(s[9], s[10]);
1845 t[11] = _mm_sub_epi16(s[8], s[11]);
1846 t[12] = _mm_sub_epi16(s[15], s[12]);
1847 t[13] = _mm_sub_epi16(s[14], s[13]);
1848 t[14] = _mm_add_epi16(s[13], s[14]);
1849 t[15] = _mm_add_epi16(s[12], s[15]);
1850
1851 // stage 6
1852 s[0] = _mm_add_epi16(t[0], t[7]);
1853 s[1] = _mm_add_epi16(t[1], t[6]);
1854 s[2] = _mm_add_epi16(t[2], t[5]);
1855 s[3] = _mm_add_epi16(t[3], t[4]);
1856 s[4] = _mm_sub_epi16(t[3], t[4]);
1857 s[5] = _mm_sub_epi16(t[2], t[5]);
1858 s[6] = _mm_sub_epi16(t[1], t[6]);
1859 s[7] = _mm_sub_epi16(t[0], t[7]);
1860 s[8] = t[8];
1861 s[9] = t[9];
1862
1863 u[0] = _mm_unpacklo_epi16(t[10], t[13]);
1864 u[1] = _mm_unpackhi_epi16(t[10], t[13]);
1865 u[2] = _mm_unpacklo_epi16(t[11], t[12]);
1866 u[3] = _mm_unpackhi_epi16(t[11], t[12]);
1867
1868 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
1869 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
1870 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1871 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1872 v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1873 v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1874 v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1875 v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1876
1877 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1878 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1879 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1880 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1881 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1882 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1883 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1884 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1885
1886 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1887 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1888 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1889 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1890 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1891 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1892 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1893 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1894
1895 s[10] = _mm_packs_epi32(u[0], u[1]);
1896 s[13] = _mm_packs_epi32(u[2], u[3]);
1897 s[11] = _mm_packs_epi32(u[4], u[5]);
1898 s[12] = _mm_packs_epi32(u[6], u[7]);
1899 s[14] = t[14];
1900 s[15] = t[15];
1901
1902 // stage 7
1903 in[0] = _mm_add_epi16(s[0], s[15]);
1904 in[1] = _mm_add_epi16(s[1], s[14]);
1905 in[2] = _mm_add_epi16(s[2], s[13]);
1906 in[3] = _mm_add_epi16(s[3], s[12]);
1907 in[4] = _mm_add_epi16(s[4], s[11]);
1908 in[5] = _mm_add_epi16(s[5], s[10]);
1909 in[6] = _mm_add_epi16(s[6], s[9]);
1910 in[7] = _mm_add_epi16(s[7], s[8]);
1911 in[8] = _mm_sub_epi16(s[7], s[8]);
1912 in[9] = _mm_sub_epi16(s[6], s[9]);
1913 in[10] = _mm_sub_epi16(s[5], s[10]);
1914 in[11] = _mm_sub_epi16(s[4], s[11]);
1915 in[12] = _mm_sub_epi16(s[3], s[12]);
1916 in[13] = _mm_sub_epi16(s[2], s[13]);
1917 in[14] = _mm_sub_epi16(s[1], s[14]);
1918 in[15] = _mm_sub_epi16(s[0], s[15]);
1919 }
1920
idct16_sse2(__m128i * in0,__m128i * in1)1921 void idct16_sse2(__m128i *in0, __m128i *in1) {
1922 array_transpose_16x16(in0, in1);
1923 idct16_8col(in0);
1924 idct16_8col(in1);
1925 }
1926
iadst16_sse2(__m128i * in0,__m128i * in1)1927 void iadst16_sse2(__m128i *in0, __m128i *in1) {
1928 array_transpose_16x16(in0, in1);
1929 iadst16_8col(in0);
1930 iadst16_8col(in1);
1931 }
1932
vpx_idct16x16_10_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)1933 void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
1934 int stride) {
1935 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1936 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
1937 const __m128i zero = _mm_setzero_si128();
1938
1939 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1940 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1941 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1942 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1943
1944 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1945 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1946
1947 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1948 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1949 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1950 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1951 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1952 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1953
1954 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1955 __m128i in[16], l[16];
1956 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
1957 stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
1958 stp1_12_0;
1959 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1960 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
1961 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1962 int i;
1963 // First 1-D inverse DCT
1964 // Load input data.
1965 in[0] = load_input_data(input);
1966 in[1] = load_input_data(input + 8 * 2);
1967 in[2] = load_input_data(input + 8 * 4);
1968 in[3] = load_input_data(input + 8 * 6);
1969
1970 TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
1971
1972 // Stage2
1973 {
1974 const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
1975 const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
1976
1977 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
1978 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
1979 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
1980 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
1981
1982 tmp0 = _mm_add_epi32(tmp0, rounding);
1983 tmp2 = _mm_add_epi32(tmp2, rounding);
1984 tmp5 = _mm_add_epi32(tmp5, rounding);
1985 tmp7 = _mm_add_epi32(tmp7, rounding);
1986
1987 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1988 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1989 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
1990 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
1991
1992 stp2_8 = _mm_packs_epi32(tmp0, tmp2);
1993 stp2_11 = _mm_packs_epi32(tmp5, tmp7);
1994 }
1995
1996 // Stage3
1997 {
1998 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
1999
2000 tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2001 tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2002
2003 tmp0 = _mm_add_epi32(tmp0, rounding);
2004 tmp2 = _mm_add_epi32(tmp2, rounding);
2005 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2006 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2007
2008 stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
2009 stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
2010
2011 stp1_4 = _mm_packs_epi32(tmp0, tmp2);
2012 }
2013
2014 // Stage4
2015 {
2016 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
2017 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
2018 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
2019
2020 tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2021 tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2022 tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2023 tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2024 tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2025 tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2026
2027 tmp0 = _mm_add_epi32(tmp0, rounding);
2028 tmp2 = _mm_add_epi32(tmp2, rounding);
2029 tmp1 = _mm_add_epi32(tmp1, rounding);
2030 tmp3 = _mm_add_epi32(tmp3, rounding);
2031 tmp5 = _mm_add_epi32(tmp5, rounding);
2032 tmp7 = _mm_add_epi32(tmp7, rounding);
2033
2034 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2035 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2036 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2037 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2038 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2039 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2040
2041 stp1_0 = _mm_packs_epi32(tmp0, tmp0);
2042 stp1_1 = _mm_packs_epi32(tmp2, tmp2);
2043 stp2_9 = _mm_packs_epi32(tmp1, tmp3);
2044 stp2_10 = _mm_packs_epi32(tmp5, tmp7);
2045
2046 stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
2047 }
2048
2049 // Stage5 and Stage6
2050 {
2051 tmp0 = _mm_add_epi16(stp2_8, stp2_11);
2052 tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
2053 tmp2 = _mm_add_epi16(stp2_9, stp2_10);
2054 tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
2055
2056 stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
2057 stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
2058 stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
2059 stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
2060
2061 stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
2062 stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
2063 stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
2064 stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
2065 }
2066
2067 // Stage6
2068 {
2069 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
2070 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2071 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2072
2073 tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2074 tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2075 tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2076 tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2077 tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2078 tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2079
2080 tmp1 = _mm_add_epi32(tmp1, rounding);
2081 tmp3 = _mm_add_epi32(tmp3, rounding);
2082 tmp0 = _mm_add_epi32(tmp0, rounding);
2083 tmp2 = _mm_add_epi32(tmp2, rounding);
2084 tmp4 = _mm_add_epi32(tmp4, rounding);
2085 tmp6 = _mm_add_epi32(tmp6, rounding);
2086
2087 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2088 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2089 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2090 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2091 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2092 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2093
2094 stp1_6 = _mm_packs_epi32(tmp3, tmp1);
2095
2096 stp2_10 = _mm_packs_epi32(tmp0, zero);
2097 stp2_13 = _mm_packs_epi32(tmp2, zero);
2098 stp2_11 = _mm_packs_epi32(tmp4, zero);
2099 stp2_12 = _mm_packs_epi32(tmp6, zero);
2100
2101 tmp0 = _mm_add_epi16(stp1_0, stp1_4);
2102 tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
2103 tmp2 = _mm_add_epi16(stp1_1, stp1_6);
2104 tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
2105
2106 stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
2107 stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
2108 stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
2109 stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
2110 stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
2111 stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
2112 stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
2113 stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
2114 }
2115
2116 // Stage7. Left 8x16 only.
2117 l[0] = _mm_add_epi16(stp2_0, stp1_15);
2118 l[1] = _mm_add_epi16(stp2_1, stp1_14);
2119 l[2] = _mm_add_epi16(stp2_2, stp2_13);
2120 l[3] = _mm_add_epi16(stp2_3, stp2_12);
2121 l[4] = _mm_add_epi16(stp2_4, stp2_11);
2122 l[5] = _mm_add_epi16(stp2_5, stp2_10);
2123 l[6] = _mm_add_epi16(stp2_6, stp1_9);
2124 l[7] = _mm_add_epi16(stp2_7, stp1_8);
2125 l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2126 l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2127 l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2128 l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2129 l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2130 l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2131 l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2132 l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2133
2134 // Second 1-D inverse transform, performed per 8x16 block
2135 for (i = 0; i < 2; i++) {
2136 int j;
2137 array_transpose_4X8(l + 8 * i, in);
2138
2139 IDCT16_10
2140
2141 // Stage7
2142 in[0] = _mm_add_epi16(stp2_0, stp1_15);
2143 in[1] = _mm_add_epi16(stp2_1, stp1_14);
2144 in[2] = _mm_add_epi16(stp2_2, stp2_13);
2145 in[3] = _mm_add_epi16(stp2_3, stp2_12);
2146 in[4] = _mm_add_epi16(stp2_4, stp2_11);
2147 in[5] = _mm_add_epi16(stp2_5, stp2_10);
2148 in[6] = _mm_add_epi16(stp2_6, stp1_9);
2149 in[7] = _mm_add_epi16(stp2_7, stp1_8);
2150 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2151 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2152 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2153 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2154 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2155 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2156 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2157 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2158
2159 for (j = 0; j < 16; ++j) {
2160 // Final rounding and shift
2161 in[j] = _mm_adds_epi16(in[j], final_rounding);
2162 in[j] = _mm_srai_epi16(in[j], 6);
2163 RECON_AND_STORE(dest + j * stride, in[j]);
2164 }
2165
2166 dest += 8;
2167 }
2168 }
2169
2170 #define LOAD_DQCOEFF(reg, input) \
2171 { \
2172 reg = load_input_data(input); \
2173 input += 8; \
2174 }
2175
2176 #define IDCT32_34 \
2177 /* Stage1 */ \
2178 { \
2179 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2180 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2181 \
2182 const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
2183 const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2184 \
2185 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2186 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2187 \
2188 const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2189 const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2190 \
2191 MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \
2192 stp1_31); \
2193 MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \
2194 stp1_28); \
2195 MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \
2196 stp1_27); \
2197 MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \
2198 stp1_24); \
2199 } \
2200 \
2201 /* Stage2 */ \
2202 { \
2203 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2204 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2205 \
2206 const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2207 const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2208 \
2209 MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \
2210 stp2_15); \
2211 MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \
2212 stp2_12); \
2213 \
2214 stp2_16 = stp1_16; \
2215 stp2_19 = stp1_19; \
2216 \
2217 stp2_20 = stp1_20; \
2218 stp2_23 = stp1_23; \
2219 \
2220 stp2_24 = stp1_24; \
2221 stp2_27 = stp1_27; \
2222 \
2223 stp2_28 = stp1_28; \
2224 stp2_31 = stp1_31; \
2225 } \
2226 \
2227 /* Stage3 */ \
2228 { \
2229 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2230 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2231 \
2232 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2233 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2234 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2235 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2236 \
2237 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2238 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2239 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2240 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
2241 \
2242 MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \
2243 stp1_7); \
2244 \
2245 stp1_8 = stp2_8; \
2246 stp1_11 = stp2_11; \
2247 stp1_12 = stp2_12; \
2248 stp1_15 = stp2_15; \
2249 \
2250 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2251 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
2252 stp1_29) \
2253 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2254 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
2255 stp1_25) \
2256 \
2257 stp1_16 = stp2_16; \
2258 stp1_31 = stp2_31; \
2259 stp1_19 = stp2_19; \
2260 stp1_20 = stp2_20; \
2261 stp1_23 = stp2_23; \
2262 stp1_24 = stp2_24; \
2263 stp1_27 = stp2_27; \
2264 stp1_28 = stp2_28; \
2265 } \
2266 \
2267 /* Stage4 */ \
2268 { \
2269 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2270 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2271 \
2272 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2273 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2274 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2275 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2276 \
2277 MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \
2278 stp2_1); \
2279 \
2280 stp2_4 = stp1_4; \
2281 stp2_5 = stp1_4; \
2282 stp2_6 = stp1_7; \
2283 stp2_7 = stp1_7; \
2284 \
2285 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2286 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
2287 stp2_13) \
2288 \
2289 stp2_8 = stp1_8; \
2290 stp2_15 = stp1_15; \
2291 stp2_11 = stp1_11; \
2292 stp2_12 = stp1_12; \
2293 \
2294 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2295 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2296 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2297 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2298 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2299 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2300 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2301 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2302 \
2303 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2304 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2305 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2306 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2307 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2308 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2309 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2310 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2311 } \
2312 \
2313 /* Stage5 */ \
2314 { \
2315 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2316 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2317 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2318 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2319 \
2320 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2321 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2322 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2323 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2324 \
2325 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2326 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2327 \
2328 stp1_0 = stp2_0; \
2329 stp1_1 = stp2_1; \
2330 stp1_2 = stp2_1; \
2331 stp1_3 = stp2_0; \
2332 \
2333 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2334 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2335 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2336 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2337 \
2338 tmp0 = _mm_add_epi32(tmp0, rounding); \
2339 tmp1 = _mm_add_epi32(tmp1, rounding); \
2340 tmp2 = _mm_add_epi32(tmp2, rounding); \
2341 tmp3 = _mm_add_epi32(tmp3, rounding); \
2342 \
2343 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2344 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2345 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2346 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2347 \
2348 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2349 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2350 \
2351 stp1_4 = stp2_4; \
2352 stp1_7 = stp2_7; \
2353 \
2354 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2355 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2356 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2357 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2358 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2359 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2360 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2361 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2362 \
2363 stp1_16 = stp2_16; \
2364 stp1_17 = stp2_17; \
2365 \
2366 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2367 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
2368 stp1_28) \
2369 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2370 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
2371 stp1_26) \
2372 \
2373 stp1_22 = stp2_22; \
2374 stp1_23 = stp2_23; \
2375 stp1_24 = stp2_24; \
2376 stp1_25 = stp2_25; \
2377 stp1_30 = stp2_30; \
2378 stp1_31 = stp2_31; \
2379 } \
2380 \
2381 /* Stage6 */ \
2382 { \
2383 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2384 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2385 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2386 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2387 \
2388 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2389 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2390 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2391 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2392 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2393 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2394 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2395 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2396 \
2397 stp2_8 = stp1_8; \
2398 stp2_9 = stp1_9; \
2399 stp2_14 = stp1_14; \
2400 stp2_15 = stp1_15; \
2401 \
2402 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
2403 stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
2404 stp2_12) \
2405 \
2406 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2407 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2408 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2409 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2410 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2411 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2412 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2413 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2414 \
2415 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2416 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2417 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2418 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2419 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2420 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2421 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2422 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2423 } \
2424 \
2425 /* Stage7 */ \
2426 { \
2427 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2428 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2429 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2430 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2431 \
2432 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2433 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2434 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2435 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2436 \
2437 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2438 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2439 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2440 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2441 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2442 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2443 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2444 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2445 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2446 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2447 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2448 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
2449 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
2450 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
2451 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
2452 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
2453 \
2454 stp1_16 = stp2_16; \
2455 stp1_17 = stp2_17; \
2456 stp1_18 = stp2_18; \
2457 stp1_19 = stp2_19; \
2458 \
2459 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
2460 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
2461 stp1_26) \
2462 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
2463 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
2464 stp1_24) \
2465 \
2466 stp1_28 = stp2_28; \
2467 stp1_29 = stp2_29; \
2468 stp1_30 = stp2_30; \
2469 stp1_31 = stp2_31; \
2470 }
2471
2472 #define IDCT32 \
2473 /* Stage1 */ \
2474 { \
2475 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
2476 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
2477 const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
2478 const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
2479 \
2480 const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
2481 const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
2482 const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
2483 const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
2484 \
2485 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
2486 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
2487 const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
2488 const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
2489 \
2490 const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
2491 const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
2492 const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
2493 const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
2494 \
2495 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
2496 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
2497 stp1_30) \
2498 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
2499 stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
2500 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
2501 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
2502 stp1_21, stp1_26) \
2503 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
2504 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
2505 stp1_23, stp1_24) \
2506 } \
2507 \
2508 /* Stage2 */ \
2509 { \
2510 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
2511 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
2512 const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
2513 const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
2514 \
2515 const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
2516 const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
2517 const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
2518 const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
2519 \
2520 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
2521 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
2522 stp2_14) \
2523 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
2524 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \
2525 stp2_12) \
2526 \
2527 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
2528 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
2529 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
2530 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
2531 \
2532 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
2533 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
2534 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
2535 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
2536 \
2537 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
2538 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
2539 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
2540 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
2541 \
2542 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
2543 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
2544 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
2545 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
2546 } \
2547 \
2548 /* Stage3 */ \
2549 { \
2550 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
2551 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
2552 const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
2553 const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
2554 \
2555 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
2556 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
2557 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2558 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2559 \
2560 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2561 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2562 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2563 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2564 \
2565 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
2566 stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
2567 stp1_6) \
2568 \
2569 stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
2570 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
2571 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
2572 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
2573 stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
2574 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
2575 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
2576 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
2577 \
2578 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2579 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
2580 stp1_29) \
2581 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2582 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
2583 stp1_25) \
2584 \
2585 stp1_16 = stp2_16; \
2586 stp1_31 = stp2_31; \
2587 stp1_19 = stp2_19; \
2588 stp1_20 = stp2_20; \
2589 stp1_23 = stp2_23; \
2590 stp1_24 = stp2_24; \
2591 stp1_27 = stp2_27; \
2592 stp1_28 = stp2_28; \
2593 } \
2594 \
2595 /* Stage4 */ \
2596 { \
2597 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
2598 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
2599 const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
2600 const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
2601 \
2602 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
2603 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
2604 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2605 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2606 \
2607 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
2608 stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
2609 \
2610 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
2611 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
2612 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
2613 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
2614 \
2615 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2616 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
2617 stp2_13) \
2618 \
2619 stp2_8 = stp1_8; \
2620 stp2_15 = stp1_15; \
2621 stp2_11 = stp1_11; \
2622 stp2_12 = stp1_12; \
2623 \
2624 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2625 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2626 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2627 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2628 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2629 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2630 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2631 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2632 \
2633 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2634 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2635 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2636 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2637 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2638 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2639 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2640 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2641 } \
2642 \
2643 /* Stage5 */ \
2644 { \
2645 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2646 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2647 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2648 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2649 \
2650 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2651 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2652 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2653 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2654 \
2655 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2656 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2657 \
2658 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
2659 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
2660 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
2661 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
2662 \
2663 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2664 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2665 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2666 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2667 \
2668 tmp0 = _mm_add_epi32(tmp0, rounding); \
2669 tmp1 = _mm_add_epi32(tmp1, rounding); \
2670 tmp2 = _mm_add_epi32(tmp2, rounding); \
2671 tmp3 = _mm_add_epi32(tmp3, rounding); \
2672 \
2673 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2674 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2675 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2676 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2677 \
2678 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2679 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2680 \
2681 stp1_4 = stp2_4; \
2682 stp1_7 = stp2_7; \
2683 \
2684 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2685 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2686 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2687 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2688 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2689 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2690 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2691 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2692 \
2693 stp1_16 = stp2_16; \
2694 stp1_17 = stp2_17; \
2695 \
2696 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2697 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
2698 stp1_28) \
2699 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2700 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
2701 stp1_26) \
2702 \
2703 stp1_22 = stp2_22; \
2704 stp1_23 = stp2_23; \
2705 stp1_24 = stp2_24; \
2706 stp1_25 = stp2_25; \
2707 stp1_30 = stp2_30; \
2708 stp1_31 = stp2_31; \
2709 } \
2710 \
2711 /* Stage6 */ \
2712 { \
2713 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2714 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2715 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2716 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2717 \
2718 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2719 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2720 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2721 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2722 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2723 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2724 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2725 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2726 \
2727 stp2_8 = stp1_8; \
2728 stp2_9 = stp1_9; \
2729 stp2_14 = stp1_14; \
2730 stp2_15 = stp1_15; \
2731 \
2732 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
2733 stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
2734 stp2_12) \
2735 \
2736 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2737 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2738 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2739 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2740 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2741 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2742 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2743 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2744 \
2745 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2746 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2747 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2748 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2749 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2750 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2751 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2752 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2753 } \
2754 \
2755 /* Stage7 */ \
2756 { \
2757 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2758 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2759 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2760 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2761 \
2762 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2763 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2764 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2765 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2766 \
2767 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2768 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2769 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2770 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2771 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2772 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2773 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2774 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2775 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2776 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2777 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2778 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
2779 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
2780 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
2781 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
2782 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
2783 \
2784 stp1_16 = stp2_16; \
2785 stp1_17 = stp2_17; \
2786 stp1_18 = stp2_18; \
2787 stp1_19 = stp2_19; \
2788 \
2789 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
2790 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
2791 stp1_26) \
2792 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
2793 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
2794 stp1_24) \
2795 \
2796 stp1_28 = stp2_28; \
2797 stp1_29 = stp2_29; \
2798 stp1_30 = stp2_30; \
2799 stp1_31 = stp2_31; \
2800 }
2801
2802 // Only upper-left 8x8 has non-zero coeff
vpx_idct32x32_34_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)2803 void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
2804 int stride) {
2805 const __m128i zero = _mm_setzero_si128();
2806 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2807 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
2808
2809 // idct constants for each stage
2810 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
2811 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
2812 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
2813 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
2814 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
2815 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
2816 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
2817 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
2818
2819 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2820 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2821 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2822 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2823
2824 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2825 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2826 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
2827 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
2828 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
2829 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
2830 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
2831 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
2832
2833 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2834 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2835 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2836 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2837 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2838
2839 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2840
2841 __m128i in[32], col[32];
2842 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
2843 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2844 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
2845 stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
2846 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2847 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
2848 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
2849 stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
2850 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2851 int i;
2852
2853 // Load input data. Only need to load the top left 8x8 block.
2854 in[0] = load_input_data(input);
2855 in[1] = load_input_data(input + 32);
2856 in[2] = load_input_data(input + 64);
2857 in[3] = load_input_data(input + 96);
2858 in[4] = load_input_data(input + 128);
2859 in[5] = load_input_data(input + 160);
2860 in[6] = load_input_data(input + 192);
2861 in[7] = load_input_data(input + 224);
2862
2863 array_transpose_8x8(in, in);
2864 IDCT32_34
2865
2866 // 1_D: Store 32 intermediate results for each 8x32 block.
2867 col[0] = _mm_add_epi16(stp1_0, stp1_31);
2868 col[1] = _mm_add_epi16(stp1_1, stp1_30);
2869 col[2] = _mm_add_epi16(stp1_2, stp1_29);
2870 col[3] = _mm_add_epi16(stp1_3, stp1_28);
2871 col[4] = _mm_add_epi16(stp1_4, stp1_27);
2872 col[5] = _mm_add_epi16(stp1_5, stp1_26);
2873 col[6] = _mm_add_epi16(stp1_6, stp1_25);
2874 col[7] = _mm_add_epi16(stp1_7, stp1_24);
2875 col[8] = _mm_add_epi16(stp1_8, stp1_23);
2876 col[9] = _mm_add_epi16(stp1_9, stp1_22);
2877 col[10] = _mm_add_epi16(stp1_10, stp1_21);
2878 col[11] = _mm_add_epi16(stp1_11, stp1_20);
2879 col[12] = _mm_add_epi16(stp1_12, stp1_19);
2880 col[13] = _mm_add_epi16(stp1_13, stp1_18);
2881 col[14] = _mm_add_epi16(stp1_14, stp1_17);
2882 col[15] = _mm_add_epi16(stp1_15, stp1_16);
2883 col[16] = _mm_sub_epi16(stp1_15, stp1_16);
2884 col[17] = _mm_sub_epi16(stp1_14, stp1_17);
2885 col[18] = _mm_sub_epi16(stp1_13, stp1_18);
2886 col[19] = _mm_sub_epi16(stp1_12, stp1_19);
2887 col[20] = _mm_sub_epi16(stp1_11, stp1_20);
2888 col[21] = _mm_sub_epi16(stp1_10, stp1_21);
2889 col[22] = _mm_sub_epi16(stp1_9, stp1_22);
2890 col[23] = _mm_sub_epi16(stp1_8, stp1_23);
2891 col[24] = _mm_sub_epi16(stp1_7, stp1_24);
2892 col[25] = _mm_sub_epi16(stp1_6, stp1_25);
2893 col[26] = _mm_sub_epi16(stp1_5, stp1_26);
2894 col[27] = _mm_sub_epi16(stp1_4, stp1_27);
2895 col[28] = _mm_sub_epi16(stp1_3, stp1_28);
2896 col[29] = _mm_sub_epi16(stp1_2, stp1_29);
2897 col[30] = _mm_sub_epi16(stp1_1, stp1_30);
2898 col[31] = _mm_sub_epi16(stp1_0, stp1_31);
2899 for (i = 0; i < 4; i++) {
2900 int j;
2901 // Transpose 32x8 block to 8x32 block
2902 array_transpose_8x8(col + i * 8, in);
2903 IDCT32_34
2904
2905 // 2_D: Calculate the results and store them to destination.
2906 in[0] = _mm_add_epi16(stp1_0, stp1_31);
2907 in[1] = _mm_add_epi16(stp1_1, stp1_30);
2908 in[2] = _mm_add_epi16(stp1_2, stp1_29);
2909 in[3] = _mm_add_epi16(stp1_3, stp1_28);
2910 in[4] = _mm_add_epi16(stp1_4, stp1_27);
2911 in[5] = _mm_add_epi16(stp1_5, stp1_26);
2912 in[6] = _mm_add_epi16(stp1_6, stp1_25);
2913 in[7] = _mm_add_epi16(stp1_7, stp1_24);
2914 in[8] = _mm_add_epi16(stp1_8, stp1_23);
2915 in[9] = _mm_add_epi16(stp1_9, stp1_22);
2916 in[10] = _mm_add_epi16(stp1_10, stp1_21);
2917 in[11] = _mm_add_epi16(stp1_11, stp1_20);
2918 in[12] = _mm_add_epi16(stp1_12, stp1_19);
2919 in[13] = _mm_add_epi16(stp1_13, stp1_18);
2920 in[14] = _mm_add_epi16(stp1_14, stp1_17);
2921 in[15] = _mm_add_epi16(stp1_15, stp1_16);
2922 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
2923 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
2924 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
2925 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
2926 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
2927 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
2928 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
2929 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
2930 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
2931 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
2932 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
2933 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
2934 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
2935 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
2936 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
2937 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
2938
2939 for (j = 0; j < 32; ++j) {
2940 // Final rounding and shift
2941 in[j] = _mm_adds_epi16(in[j], final_rounding);
2942 in[j] = _mm_srai_epi16(in[j], 6);
2943 RECON_AND_STORE(dest + j * stride, in[j]);
2944 }
2945
2946 dest += 8;
2947 }
2948 }
2949
vpx_idct32x32_1024_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)2950 void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
2951 int stride) {
2952 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2953 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
2954 const __m128i zero = _mm_setzero_si128();
2955
2956 // idct constants for each stage
2957 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
2958 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
2959 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
2960 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
2961 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
2962 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
2963 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
2964 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
2965 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
2966 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
2967 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
2968 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
2969 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
2970 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
2971 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
2972 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
2973
2974 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2975 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2976 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
2977 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
2978 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
2979 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
2980 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2981 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2982
2983 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2984 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2985 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
2986 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
2987 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
2988 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
2989 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
2990 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
2991 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
2992 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
2993
2994 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2995 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2996 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
2997 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
2998 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2999 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3000 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3001
3002 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3003
3004 __m128i in[32], col[128], zero_idx[16];
3005 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3006 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3007 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
3008 stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
3009 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3010 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3011 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
3012 stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
3013 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3014 int i, j, i32;
3015
3016 for (i = 0; i < 4; i++) {
3017 i32 = (i << 5);
3018 // First 1-D idct
3019 // Load input data.
3020 LOAD_DQCOEFF(in[0], input);
3021 LOAD_DQCOEFF(in[8], input);
3022 LOAD_DQCOEFF(in[16], input);
3023 LOAD_DQCOEFF(in[24], input);
3024 LOAD_DQCOEFF(in[1], input);
3025 LOAD_DQCOEFF(in[9], input);
3026 LOAD_DQCOEFF(in[17], input);
3027 LOAD_DQCOEFF(in[25], input);
3028 LOAD_DQCOEFF(in[2], input);
3029 LOAD_DQCOEFF(in[10], input);
3030 LOAD_DQCOEFF(in[18], input);
3031 LOAD_DQCOEFF(in[26], input);
3032 LOAD_DQCOEFF(in[3], input);
3033 LOAD_DQCOEFF(in[11], input);
3034 LOAD_DQCOEFF(in[19], input);
3035 LOAD_DQCOEFF(in[27], input);
3036
3037 LOAD_DQCOEFF(in[4], input);
3038 LOAD_DQCOEFF(in[12], input);
3039 LOAD_DQCOEFF(in[20], input);
3040 LOAD_DQCOEFF(in[28], input);
3041 LOAD_DQCOEFF(in[5], input);
3042 LOAD_DQCOEFF(in[13], input);
3043 LOAD_DQCOEFF(in[21], input);
3044 LOAD_DQCOEFF(in[29], input);
3045 LOAD_DQCOEFF(in[6], input);
3046 LOAD_DQCOEFF(in[14], input);
3047 LOAD_DQCOEFF(in[22], input);
3048 LOAD_DQCOEFF(in[30], input);
3049 LOAD_DQCOEFF(in[7], input);
3050 LOAD_DQCOEFF(in[15], input);
3051 LOAD_DQCOEFF(in[23], input);
3052 LOAD_DQCOEFF(in[31], input);
3053
3054 // checking if all entries are zero
3055 zero_idx[0] = _mm_or_si128(in[0], in[1]);
3056 zero_idx[1] = _mm_or_si128(in[2], in[3]);
3057 zero_idx[2] = _mm_or_si128(in[4], in[5]);
3058 zero_idx[3] = _mm_or_si128(in[6], in[7]);
3059 zero_idx[4] = _mm_or_si128(in[8], in[9]);
3060 zero_idx[5] = _mm_or_si128(in[10], in[11]);
3061 zero_idx[6] = _mm_or_si128(in[12], in[13]);
3062 zero_idx[7] = _mm_or_si128(in[14], in[15]);
3063 zero_idx[8] = _mm_or_si128(in[16], in[17]);
3064 zero_idx[9] = _mm_or_si128(in[18], in[19]);
3065 zero_idx[10] = _mm_or_si128(in[20], in[21]);
3066 zero_idx[11] = _mm_or_si128(in[22], in[23]);
3067 zero_idx[12] = _mm_or_si128(in[24], in[25]);
3068 zero_idx[13] = _mm_or_si128(in[26], in[27]);
3069 zero_idx[14] = _mm_or_si128(in[28], in[29]);
3070 zero_idx[15] = _mm_or_si128(in[30], in[31]);
3071
3072 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3073 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3074 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3075 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3076 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3077 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3078 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3079 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3080
3081 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3082 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3083 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3084 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3085 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3086 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3087 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3088
3089 if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
3090 col[i32 + 0] = _mm_setzero_si128();
3091 col[i32 + 1] = _mm_setzero_si128();
3092 col[i32 + 2] = _mm_setzero_si128();
3093 col[i32 + 3] = _mm_setzero_si128();
3094 col[i32 + 4] = _mm_setzero_si128();
3095 col[i32 + 5] = _mm_setzero_si128();
3096 col[i32 + 6] = _mm_setzero_si128();
3097 col[i32 + 7] = _mm_setzero_si128();
3098 col[i32 + 8] = _mm_setzero_si128();
3099 col[i32 + 9] = _mm_setzero_si128();
3100 col[i32 + 10] = _mm_setzero_si128();
3101 col[i32 + 11] = _mm_setzero_si128();
3102 col[i32 + 12] = _mm_setzero_si128();
3103 col[i32 + 13] = _mm_setzero_si128();
3104 col[i32 + 14] = _mm_setzero_si128();
3105 col[i32 + 15] = _mm_setzero_si128();
3106 col[i32 + 16] = _mm_setzero_si128();
3107 col[i32 + 17] = _mm_setzero_si128();
3108 col[i32 + 18] = _mm_setzero_si128();
3109 col[i32 + 19] = _mm_setzero_si128();
3110 col[i32 + 20] = _mm_setzero_si128();
3111 col[i32 + 21] = _mm_setzero_si128();
3112 col[i32 + 22] = _mm_setzero_si128();
3113 col[i32 + 23] = _mm_setzero_si128();
3114 col[i32 + 24] = _mm_setzero_si128();
3115 col[i32 + 25] = _mm_setzero_si128();
3116 col[i32 + 26] = _mm_setzero_si128();
3117 col[i32 + 27] = _mm_setzero_si128();
3118 col[i32 + 28] = _mm_setzero_si128();
3119 col[i32 + 29] = _mm_setzero_si128();
3120 col[i32 + 30] = _mm_setzero_si128();
3121 col[i32 + 31] = _mm_setzero_si128();
3122 continue;
3123 }
3124
3125 // Transpose 32x8 block to 8x32 block
3126 array_transpose_8x8(in, in);
3127 array_transpose_8x8(in + 8, in + 8);
3128 array_transpose_8x8(in + 16, in + 16);
3129 array_transpose_8x8(in + 24, in + 24);
3130
3131 IDCT32
3132
3133 // 1_D: Store 32 intermediate results for each 8x32 block.
3134 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3135 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3136 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3137 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3138 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3139 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3140 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3141 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3142 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3143 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3144 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3145 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3146 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3147 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3148 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3149 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3150 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3151 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3152 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3153 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3154 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3155 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3156 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3157 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3158 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3159 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3160 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3161 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3162 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3163 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3164 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3165 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3166 }
3167 for (i = 0; i < 4; i++) {
3168 // Second 1-D idct
3169 j = i << 3;
3170
3171 // Transpose 32x8 block to 8x32 block
3172 array_transpose_8x8(col + j, in);
3173 array_transpose_8x8(col + j + 32, in + 8);
3174 array_transpose_8x8(col + j + 64, in + 16);
3175 array_transpose_8x8(col + j + 96, in + 24);
3176
3177 IDCT32
3178
3179 // 2_D: Calculate the results and store them to destination.
3180 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3181 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3182 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3183 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3184 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3185 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3186 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3187 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3188 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3189 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3190 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3191 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3192 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3193 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3194 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3195 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3196 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3197 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3198 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3199 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3200 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3201 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3202 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3203 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3204 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3205 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3206 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3207 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3208 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3209 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3210 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3211 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3212
3213 for (j = 0; j < 32; ++j) {
3214 // Final rounding and shift
3215 in[j] = _mm_adds_epi16(in[j], final_rounding);
3216 in[j] = _mm_srai_epi16(in[j], 6);
3217 RECON_AND_STORE(dest + j * stride, in[j]);
3218 }
3219
3220 dest += 8;
3221 }
3222 }
3223
vpx_idct32x32_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)3224 void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
3225 int stride) {
3226 __m128i dc_value;
3227 const __m128i zero = _mm_setzero_si128();
3228 int a, j;
3229
3230 a = (int)dct_const_round_shift(input[0] * cospi_16_64);
3231 a = (int)dct_const_round_shift(a * cospi_16_64);
3232 a = ROUND_POWER_OF_TWO(a, 6);
3233
3234 dc_value = _mm_set1_epi16(a);
3235
3236 for (j = 0; j < 32; ++j) {
3237 RECON_AND_STORE(dest + 0 + j * stride, dc_value);
3238 RECON_AND_STORE(dest + 8 + j * stride, dc_value);
3239 RECON_AND_STORE(dest + 16 + j * stride, dc_value);
3240 RECON_AND_STORE(dest + 24 + j * stride, dc_value);
3241 }
3242 }
3243