1 /*
2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <emmintrin.h>  // SSE2
13 #include "./vpx_config.h"
14 #include "vpx/vpx_integer.h"
15 #include "vp9/common/vp9_common.h"
16 #include "vp9/common/vp9_idct.h"
17 
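// Reconstruct one 4-pixel row: load 4 bytes of dest, widen to 16 bits, add the
// inverse-transform residual in in_x, saturate back to 8 bits, store the 4
// bytes, and advance dest by one stride.  Relies on `zero` and `stride` being
// in scope at the call site.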
18 #define RECON_AND_STORE4X4(dest, in_x) \
19 {                                                     \
20   __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
21   d0 = _mm_unpacklo_epi8(d0, zero); \
22   d0 = _mm_add_epi16(in_x, d0); \
23   d0 = _mm_packus_epi16(d0, d0); \
24   *(int *)dest = _mm_cvtsi128_si32(d0); \
25   dest += stride; \
26 }
27 
28 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
29   const __m128i zero = _mm_setzero_si128();
30   const __m128i eight = _mm_set1_epi16(8);
31   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
32                                     (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
33                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
34                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
35   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
36   __m128i input0, input1, input2, input3;
37 
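  // The madd/add/srai sequences below are the SIMD form of
  // dct_const_round_shift(): _mm_madd_epi16 multiplies the interleaved 16-bit
  // coefficients by the packed cosine constants and sums adjacent products
  // into 32-bit lanes, then DCT_CONST_ROUNDING and the DCT_CONST_BITS shift
  // (14, per vp9_idct.h) bring the result back to the 16-bit fixed-point scale.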
38   // Rows
39   input0 = _mm_load_si128((const __m128i *)input);
40   input2 = _mm_load_si128((const __m128i *)(input + 8));
41 
42   // Construct i3, i1, i3, i1, i2, i0, i2, i0
43   input0 = _mm_shufflelo_epi16(input0, 0xd8);
44   input0 = _mm_shufflehi_epi16(input0, 0xd8);
45   input2 = _mm_shufflelo_epi16(input2, 0xd8);
46   input2 = _mm_shufflehi_epi16(input2, 0xd8);
47 
48   input1 = _mm_unpackhi_epi32(input0, input0);
49   input0 = _mm_unpacklo_epi32(input0, input0);
50   input3 = _mm_unpackhi_epi32(input2, input2);
51   input2 = _mm_unpacklo_epi32(input2, input2);
52 
53   // Stage 1
54   input0 = _mm_madd_epi16(input0, cst);
55   input1 = _mm_madd_epi16(input1, cst);
56   input2 = _mm_madd_epi16(input2, cst);
57   input3 = _mm_madd_epi16(input3, cst);
58 
59   input0 = _mm_add_epi32(input0, rounding);
60   input1 = _mm_add_epi32(input1, rounding);
61   input2 = _mm_add_epi32(input2, rounding);
62   input3 = _mm_add_epi32(input3, rounding);
63 
64   input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
65   input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
66   input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
67   input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
68 
69   // Stage 2
70   input0 = _mm_packs_epi32(input0, input1);
71   input1 = _mm_packs_epi32(input2, input3);
72 
73   // Transpose
74   input2 = _mm_unpacklo_epi16(input0, input1);
75   input3 = _mm_unpackhi_epi16(input0, input1);
76   input0 = _mm_unpacklo_epi32(input2, input3);
77   input1 = _mm_unpackhi_epi32(input2, input3);
78 
79   // Switch column 2 and column 3, and then we get:
80   // input2: column 1, column 0;  input3: column 2, column 3.
81   input1 = _mm_shuffle_epi32(input1, 0x4e);
82   input2 = _mm_add_epi16(input0, input1);
83   input3 = _mm_sub_epi16(input0, input1);
84 
85   // Columns
86   // Construct i3, i1, i3, i1, i2, i0, i2, i0
87   input0 = _mm_unpacklo_epi32(input2, input2);
88   input1 = _mm_unpackhi_epi32(input2, input2);
89   input2 = _mm_unpackhi_epi32(input3, input3);
90   input3 = _mm_unpacklo_epi32(input3, input3);
91 
92   // Stage 1
93   input0 = _mm_madd_epi16(input0, cst);
94   input1 = _mm_madd_epi16(input1, cst);
95   input2 = _mm_madd_epi16(input2, cst);
96   input3 = _mm_madd_epi16(input3, cst);
97 
98   input0 = _mm_add_epi32(input0, rounding);
99   input1 = _mm_add_epi32(input1, rounding);
100   input2 = _mm_add_epi32(input2, rounding);
101   input3 = _mm_add_epi32(input3, rounding);
102 
103   input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
104   input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
105   input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
106   input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
107 
108   // Stage 2
109   input0 = _mm_packs_epi32(input0, input2);
110   input1 = _mm_packs_epi32(input1, input3);
111 
112   // Transpose
113   input2 = _mm_unpacklo_epi16(input0, input1);
114   input3 = _mm_unpackhi_epi16(input0, input1);
115   input0 = _mm_unpacklo_epi32(input2, input3);
116   input1 = _mm_unpackhi_epi32(input2, input3);
117 
118   // Switch column 2 and column 3, and then we get:
119   // input2: column 1, column 0;  input3: column 2, column 3.
120   input1 = _mm_shuffle_epi32(input1, 0x4e);
121   input2 = _mm_add_epi16(input0, input1);
122   input3 = _mm_sub_epi16(input0, input1);
123 
124   // Final round and shift
125   input2 = _mm_add_epi16(input2, eight);
126   input3 = _mm_add_epi16(input3, eight);
127 
128   input2 = _mm_srai_epi16(input2, 4);
129   input3 = _mm_srai_epi16(input3, 4);
130 
131   // Reconstruction and Store
132   {
133      __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
134      __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
135      d0 = _mm_unpacklo_epi32(d0,
136           _mm_cvtsi32_si128(*(const int *) (dest + stride)));
137      d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
138                     *(const int *) (dest + stride * 3)), d2);
139      d0 = _mm_unpacklo_epi8(d0, zero);
140      d2 = _mm_unpacklo_epi8(d2, zero);
141      d0 = _mm_add_epi16(d0, input2);
142      d2 = _mm_add_epi16(d2, input3);
143      d0 = _mm_packus_epi16(d0, d2);
144      // store input0
145      *(int *)dest = _mm_cvtsi128_si32(d0);
146      // store input1
147      d0 = _mm_srli_si128(d0, 4);
148      *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
149      // store input2
150      d0 = _mm_srli_si128(d0, 4);
151      *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
152      // store input3
153      d0 = _mm_srli_si128(d0, 4);
154      *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
155   }
156 }
157 
158 void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
159   __m128i dc_value;
160   const __m128i zero = _mm_setzero_si128();
161   int a;
162 
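  // Only the DC coefficient is used here, so the 2-D output is a constant:
  // run input[0] through the row and column scaling (two dct_const_round_shift
  // passes with cospi_16_64) plus the final >> 4, then add the resulting value
  // to every pixel of the 4x4 block.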
163   a = dct_const_round_shift(input[0] * cospi_16_64);
164   a = dct_const_round_shift(a * cospi_16_64);
165   a = ROUND_POWER_OF_TWO(a, 4);
166 
167   dc_value = _mm_set1_epi16(a);
168 
169   RECON_AND_STORE4X4(dest, dc_value);
170   RECON_AND_STORE4X4(dest, dc_value);
171   RECON_AND_STORE4X4(dest, dc_value);
172   RECON_AND_STORE4X4(dest, dc_value);
173 }
174 
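// 4x4 transpose of 16-bit values: on entry each res[i] carries one row in its
// low 64 bits, and on exit it carries the corresponding transposed row.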
175 static INLINE void transpose_4x4(__m128i *res) {
176   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
177   const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
178   res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
179   res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
180 
181   res[1] = _mm_unpackhi_epi64(res[0], res[0]);
182   res[3] = _mm_unpackhi_epi64(res[2], res[2]);
183 }
184 
185 static void idct4_1d_sse2(__m128i *in) {
186   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
187   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
188   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
189   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
190   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
191   __m128i u[8], v[8];
192 
193   transpose_4x4(in);
194   // stage 1
195   u[0] = _mm_unpacklo_epi16(in[0], in[2]);
196   u[1] = _mm_unpacklo_epi16(in[1], in[3]);
197   v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
198   v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
199   v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
200   v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
201 
202   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
203   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
204   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
205   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
206 
207   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
208   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
209   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
210   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
211 
212   u[0] = _mm_packs_epi32(v[0], v[2]);
213   u[1] = _mm_packs_epi32(v[1], v[3]);
214   u[2] = _mm_unpackhi_epi64(u[0], u[0]);
215   u[3] = _mm_unpackhi_epi64(u[1], u[1]);
216 
217   // stage 2
218   in[0] = _mm_add_epi16(u[0], u[3]);
219   in[1] = _mm_add_epi16(u[1], u[2]);
220   in[2] = _mm_sub_epi16(u[1], u[2]);
221   in[3] = _mm_sub_epi16(u[0], u[3]);
222 }
223 
224 static void iadst4_1d_sse2(__m128i *in) {
225   const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
226   const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
227   const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
228   const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
229   const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
230   const __m128i kZero = _mm_set1_epi16(0);
231   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
232   __m128i u[8], v[8], in7;
233 
234   transpose_4x4(in);
235   in7 = _mm_add_epi16(in[0], in[3]);
236   in7 = _mm_sub_epi16(in7, in[2]);
237 
238   u[0] = _mm_unpacklo_epi16(in[0], in[2]);
239   u[1] = _mm_unpacklo_epi16(in[1], in[3]);
240   u[2] = _mm_unpacklo_epi16(in7, kZero);
241   u[3] = _mm_unpacklo_epi16(in[1], kZero);
242 
243   v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
244   v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
245   v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
246   v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
247   v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
248   v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
249 
250   u[0] = _mm_add_epi32(v[0], v[1]);
251   u[1] = _mm_add_epi32(v[3], v[4]);
252   u[2] = v[2];
253   u[3] = _mm_add_epi32(u[0], u[1]);
254   u[4] = _mm_slli_epi32(v[5], 2);
255   u[5] = _mm_add_epi32(u[3], v[5]);
256   u[6] = _mm_sub_epi32(u[5], u[4]);
257 
258   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
259   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
260   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
261   v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
262 
263   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
264   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
265   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
266   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
267 
268   in[0] = _mm_packs_epi32(u[0], u[2]);
269   in[1] = _mm_packs_epi32(u[1], u[3]);
270   in[2] = _mm_unpackhi_epi64(in[0], in[0]);
271   in[3] = _mm_unpackhi_epi64(in[1], in[1]);
272 }
273 
274 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
275                             int tx_type) {
276   __m128i in[4];
277   const __m128i zero = _mm_setzero_si128();
278   const __m128i eight = _mm_set1_epi16(8);
279 
280   in[0] = _mm_loadl_epi64((const __m128i *)input);
281   in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
282   in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
283   in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));
284 
285   switch (tx_type) {
286     case 0:  // DCT_DCT
287       idct4_1d_sse2(in);
288       idct4_1d_sse2(in);
289       break;
290     case 1:  // ADST_DCT
291       idct4_1d_sse2(in);
292       iadst4_1d_sse2(in);
293       break;
294     case 2:  // DCT_ADST
295       iadst4_1d_sse2(in);
296       idct4_1d_sse2(in);
297       break;
298     case 3:  // ADST_ADST
299       iadst4_1d_sse2(in);
300       iadst4_1d_sse2(in);
301       break;
302     default:
303       assert(0);
304       break;
305   }
306 
307   // Final round and shift
308   in[0] = _mm_add_epi16(in[0], eight);
309   in[1] = _mm_add_epi16(in[1], eight);
310   in[2] = _mm_add_epi16(in[2], eight);
311   in[3] = _mm_add_epi16(in[3], eight);
312 
313   in[0] = _mm_srai_epi16(in[0], 4);
314   in[1] = _mm_srai_epi16(in[1], 4);
315   in[2] = _mm_srai_epi16(in[2], 4);
316   in[3] = _mm_srai_epi16(in[3], 4);
317 
318   RECON_AND_STORE4X4(dest, in[0]);
319   RECON_AND_STORE4X4(dest, in[1]);
320   RECON_AND_STORE4X4(dest, in[2]);
321   RECON_AND_STORE4X4(dest, in[3]);
322 }
323 
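// Transpose an 8x8 block of 16-bit elements with three levels of interleaves
// (16-bit, 32-bit, then 64-bit unpacks).  The out registers may alias the in
// registers, which is how the in-place uses below work.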
324 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
325                       out0, out1, out2, out3, out4, out5, out6, out7) \
326   {                                                     \
327     const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
328     const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
329     const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
330     const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
331     const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
332     const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
333     const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
334     const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
335                                                         \
336     const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
337     const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
338     const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
339     const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
340     const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
341     const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
342     const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
343     const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
344                                                             \
345     out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
346     out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
347     out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
348     out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
349     out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
350     out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
351     out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
352     out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
353   }
354 
355 #define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \
356                       out0, out1, out2, out3, out4, out5, out6, out7) \
357   {                                                     \
358     const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
359     const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
360     const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
361     const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
362                                                         \
363     const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
364     const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
365     const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
366     const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
367                                                             \
368     out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
369     out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
370     out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
371     out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
372     out4 = out5 = out6 = out7 = zero; \
373   }
374 
375 #define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
376   {                                                     \
377     const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
378     const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
379     const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
380     const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
381                                                         \
382     in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \
383     in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \
384     in2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */  \
385     in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */  \
386   }
387 
388 // Define Macro for multiplying elements by constants and adding them together.
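// Each resN becomes dct_const_round_shift(interleaved inputs * cstN): the madd
// forms the two-term dot product per 32-bit lane, and the rounding constant
// plus the DCT_CONST_BITS shift rescale it.  tmp0-tmp7 and `rounding` must be
// declared by the caller.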
389 #define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
390                                cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
391   {   \
392       tmp0 = _mm_madd_epi16(lo_0, cst0); \
393       tmp1 = _mm_madd_epi16(hi_0, cst0); \
394       tmp2 = _mm_madd_epi16(lo_0, cst1); \
395       tmp3 = _mm_madd_epi16(hi_0, cst1); \
396       tmp4 = _mm_madd_epi16(lo_1, cst2); \
397       tmp5 = _mm_madd_epi16(hi_1, cst2); \
398       tmp6 = _mm_madd_epi16(lo_1, cst3); \
399       tmp7 = _mm_madd_epi16(hi_1, cst3); \
400       \
401       tmp0 = _mm_add_epi32(tmp0, rounding); \
402       tmp1 = _mm_add_epi32(tmp1, rounding); \
403       tmp2 = _mm_add_epi32(tmp2, rounding); \
404       tmp3 = _mm_add_epi32(tmp3, rounding); \
405       tmp4 = _mm_add_epi32(tmp4, rounding); \
406       tmp5 = _mm_add_epi32(tmp5, rounding); \
407       tmp6 = _mm_add_epi32(tmp6, rounding); \
408       tmp7 = _mm_add_epi32(tmp7, rounding); \
409       \
410       tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
411       tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
412       tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
413       tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
414       tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
415       tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
416       tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
417       tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
418       \
419       res0 = _mm_packs_epi32(tmp0, tmp1); \
420       res1 = _mm_packs_epi32(tmp2, tmp3); \
421       res2 = _mm_packs_epi32(tmp4, tmp5); \
422       res3 = _mm_packs_epi32(tmp6, tmp7); \
423   }
424 
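// One four-stage 8-point inverse DCT pass over the eight vectors in0-in7,
// overwriting them with the transform output.  Expects the stg*, stp1_*,
// stp2_*, tmp* and `rounding` locals of the enclosing function.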
425 #define IDCT8_1D  \
426   /* Stage1 */      \
427   { \
428     const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
429     const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
430     const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
431     const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
432     \
433     MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
434                           stg1_1, stg1_2, stg1_3, stp1_4,      \
435                           stp1_7, stp1_5, stp1_6)              \
436   } \
437     \
438   /* Stage2 */ \
439   { \
440     const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
441     const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
442     const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
443     const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
444     \
445     MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
446                            stg2_1, stg2_2, stg2_3, stp2_0,     \
447                            stp2_1, stp2_2, stp2_3)             \
448     \
449     stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
450     stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
451     stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
452     stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
453   } \
454     \
455   /* Stage3 */ \
456   { \
457     const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
458     const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
459     \
460     stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
461     stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
462     stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
463     stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
464     \
465     tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
466     tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
467     tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
468     tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
469     \
470     tmp0 = _mm_add_epi32(tmp0, rounding); \
471     tmp1 = _mm_add_epi32(tmp1, rounding); \
472     tmp2 = _mm_add_epi32(tmp2, rounding); \
473     tmp3 = _mm_add_epi32(tmp3, rounding); \
474     \
475     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
476     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
477     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
478     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
479     \
480     stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
481     stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
482   } \
483   \
484   /* Stage4  */ \
485   in0 = _mm_adds_epi16(stp1_0, stp2_7); \
486   in1 = _mm_adds_epi16(stp1_1, stp1_6); \
487   in2 = _mm_adds_epi16(stp1_2, stp1_5); \
488   in3 = _mm_adds_epi16(stp1_3, stp2_4); \
489   in4 = _mm_subs_epi16(stp1_3, stp2_4); \
490   in5 = _mm_subs_epi16(stp1_2, stp1_5); \
491   in6 = _mm_subs_epi16(stp1_1, stp1_6); \
492   in7 = _mm_subs_epi16(stp1_0, stp2_7);
493 
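// 8-pixel-wide variant of RECON_AND_STORE4X4: 64-bit load of dest, widen to
// 16 bits, add the residual row, saturate back to 8 bits, 64-bit store, and
// advance dest by one stride.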
494 #define RECON_AND_STORE(dest, in_x) \
495   {                                                     \
496      __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
497       d0 = _mm_unpacklo_epi8(d0, zero); \
498       d0 = _mm_add_epi16(in_x, d0); \
499       d0 = _mm_packus_epi16(d0, d0); \
500       _mm_storel_epi64((__m128i *)(dest), d0); \
501       dest += stride; \
502   }
503 
504 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
505   const __m128i zero = _mm_setzero_si128();
506   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
507   const __m128i final_rounding = _mm_set1_epi16(1<<4);
508   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
509   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
510   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
511   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
512   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
513   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
514   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
515   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
516 
517   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
518   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
519   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
520   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
521   int i;
522 
523   // Load input data.
524   in0 = _mm_load_si128((const __m128i *)input);
525   in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
526   in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
527   in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
528   in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
529   in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
530   in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
531   in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
532 
533   // 2-D
534   for (i = 0; i < 2; i++) {
535     // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
536     TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
537                   in4, in5, in6, in7);
538 
539     // 4-stage 1D idct8x8
540     IDCT8_1D
541   }
542 
543   // Final rounding and shift
544   in0 = _mm_adds_epi16(in0, final_rounding);
545   in1 = _mm_adds_epi16(in1, final_rounding);
546   in2 = _mm_adds_epi16(in2, final_rounding);
547   in3 = _mm_adds_epi16(in3, final_rounding);
548   in4 = _mm_adds_epi16(in4, final_rounding);
549   in5 = _mm_adds_epi16(in5, final_rounding);
550   in6 = _mm_adds_epi16(in6, final_rounding);
551   in7 = _mm_adds_epi16(in7, final_rounding);
552 
553   in0 = _mm_srai_epi16(in0, 5);
554   in1 = _mm_srai_epi16(in1, 5);
555   in2 = _mm_srai_epi16(in2, 5);
556   in3 = _mm_srai_epi16(in3, 5);
557   in4 = _mm_srai_epi16(in4, 5);
558   in5 = _mm_srai_epi16(in5, 5);
559   in6 = _mm_srai_epi16(in6, 5);
560   in7 = _mm_srai_epi16(in7, 5);
561 
562   RECON_AND_STORE(dest, in0);
563   RECON_AND_STORE(dest, in1);
564   RECON_AND_STORE(dest, in2);
565   RECON_AND_STORE(dest, in3);
566   RECON_AND_STORE(dest, in4);
567   RECON_AND_STORE(dest, in5);
568   RECON_AND_STORE(dest, in6);
569   RECON_AND_STORE(dest, in7);
570 }
571 
572 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
573   __m128i dc_value;
574   const __m128i zero = _mm_setzero_si128();
575   int a;
576 
577   a = dct_const_round_shift(input[0] * cospi_16_64);
578   a = dct_const_round_shift(a * cospi_16_64);
579   a = ROUND_POWER_OF_TWO(a, 5);
580 
581   dc_value = _mm_set1_epi16(a);
582 
583   RECON_AND_STORE(dest, dc_value);
584   RECON_AND_STORE(dest, dc_value);
585   RECON_AND_STORE(dest, dc_value);
586   RECON_AND_STORE(dest, dc_value);
587   RECON_AND_STORE(dest, dc_value);
588   RECON_AND_STORE(dest, dc_value);
589   RECON_AND_STORE(dest, dc_value);
590   RECON_AND_STORE(dest, dc_value);
591 }
592 
593 // perform 8x8 transpose
594 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
595   const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
596   const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
597   const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
598   const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
599   const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
600   const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
601   const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
602   const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
603 
604   const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
605   const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
606   const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
607   const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
608   const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
609   const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
610   const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
611   const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
612 
613   res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
614   res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
615   res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
616   res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
617   res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
618   res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
619   res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
620   res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
621 }
622 
623 static void idct8_1d_sse2(__m128i *in) {
624   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
625   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
626   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
627   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
628   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
629   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
630   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
631   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
632   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
633 
634   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
635   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
636   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
637   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
638 
639   in0 = in[0];
640   in1 = in[1];
641   in2 = in[2];
642   in3 = in[3];
643   in4 = in[4];
644   in5 = in[5];
645   in6 = in[6];
646   in7 = in[7];
647 
648   // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
649   TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
650                 in4, in5, in6, in7);
651 
652   // 4-stage 1D idct8x8
653   IDCT8_1D
654   in[0] = in0;
655   in[1] = in1;
656   in[2] = in2;
657   in[3] = in3;
658   in[4] = in4;
659   in[5] = in5;
660   in[6] = in6;
661   in[7] = in7;
662 }
663 
664 static void iadst8_1d_sse2(__m128i *in) {
665   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
666   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
667   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
668   const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
669   const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
670   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
671   const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
672   const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
673   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
674   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
675   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
676   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
677   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
678   const __m128i k__const_0 = _mm_set1_epi16(0);
679   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
680 
681   __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
682   __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
683   __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
684   __m128i s0, s1, s2, s3, s4, s5, s6, s7;
685   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
686 
687   // transpose
688   array_transpose_8x8(in, in);
689 
690   // properly aligned for butterfly input
691   in0  = in[7];
692   in1  = in[0];
693   in2  = in[5];
694   in3  = in[2];
695   in4  = in[3];
696   in5  = in[4];
697   in6  = in[1];
698   in7  = in[6];
699 
700   // column transformation
701   // stage 1
702   // interleave and multiply/add into 32-bit integer
703   s0 = _mm_unpacklo_epi16(in0, in1);
704   s1 = _mm_unpackhi_epi16(in0, in1);
705   s2 = _mm_unpacklo_epi16(in2, in3);
706   s3 = _mm_unpackhi_epi16(in2, in3);
707   s4 = _mm_unpacklo_epi16(in4, in5);
708   s5 = _mm_unpackhi_epi16(in4, in5);
709   s6 = _mm_unpacklo_epi16(in6, in7);
710   s7 = _mm_unpackhi_epi16(in6, in7);
711 
712   u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
713   u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
714   u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
715   u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
716   u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
717   u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
718   u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
719   u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
720   u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
721   u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
722   u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
723   u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
724   u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
725   u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
726   u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
727   u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
728 
729   // addition
730   w0 = _mm_add_epi32(u0, u8);
731   w1 = _mm_add_epi32(u1, u9);
732   w2 = _mm_add_epi32(u2, u10);
733   w3 = _mm_add_epi32(u3, u11);
734   w4 = _mm_add_epi32(u4, u12);
735   w5 = _mm_add_epi32(u5, u13);
736   w6 = _mm_add_epi32(u6, u14);
737   w7 = _mm_add_epi32(u7, u15);
738   w8 = _mm_sub_epi32(u0, u8);
739   w9 = _mm_sub_epi32(u1, u9);
740   w10 = _mm_sub_epi32(u2, u10);
741   w11 = _mm_sub_epi32(u3, u11);
742   w12 = _mm_sub_epi32(u4, u12);
743   w13 = _mm_sub_epi32(u5, u13);
744   w14 = _mm_sub_epi32(u6, u14);
745   w15 = _mm_sub_epi32(u7, u15);
746 
747   // shift and rounding
748   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
749   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
750   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
751   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
752   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
753   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
754   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
755   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
756   v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
757   v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
758   v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
759   v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
760   v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
761   v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
762   v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
763   v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
764 
765   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
766   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
767   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
768   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
769   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
770   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
771   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
772   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
773   u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
774   u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
775   u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
776   u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
777   u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
778   u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
779   u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
780   u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
781 
782   // back to 16-bit and pack 8 integers into __m128i
783   in[0] = _mm_packs_epi32(u0, u1);
784   in[1] = _mm_packs_epi32(u2, u3);
785   in[2] = _mm_packs_epi32(u4, u5);
786   in[3] = _mm_packs_epi32(u6, u7);
787   in[4] = _mm_packs_epi32(u8, u9);
788   in[5] = _mm_packs_epi32(u10, u11);
789   in[6] = _mm_packs_epi32(u12, u13);
790   in[7] = _mm_packs_epi32(u14, u15);
791 
792   // stage 2
793   s0 = _mm_add_epi16(in[0], in[2]);
794   s1 = _mm_add_epi16(in[1], in[3]);
795   s2 = _mm_sub_epi16(in[0], in[2]);
796   s3 = _mm_sub_epi16(in[1], in[3]);
797   u0 = _mm_unpacklo_epi16(in[4], in[5]);
798   u1 = _mm_unpackhi_epi16(in[4], in[5]);
799   u2 = _mm_unpacklo_epi16(in[6], in[7]);
800   u3 = _mm_unpackhi_epi16(in[6], in[7]);
801 
802   v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
803   v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
804   v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
805   v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
806   v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
807   v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
808   v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
809   v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
810 
811   w0 = _mm_add_epi32(v0, v4);
812   w1 = _mm_add_epi32(v1, v5);
813   w2 = _mm_add_epi32(v2, v6);
814   w3 = _mm_add_epi32(v3, v7);
815   w4 = _mm_sub_epi32(v0, v4);
816   w5 = _mm_sub_epi32(v1, v5);
817   w6 = _mm_sub_epi32(v2, v6);
818   w7 = _mm_sub_epi32(v3, v7);
819 
820   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
821   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
822   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
823   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
824   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
825   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
826   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
827   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
828 
829   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
830   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
831   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
832   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
833   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
834   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
835   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
836   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
837 
838   // back to 16-bit integers
839   s4 = _mm_packs_epi32(u0, u1);
840   s5 = _mm_packs_epi32(u2, u3);
841   s6 = _mm_packs_epi32(u4, u5);
842   s7 = _mm_packs_epi32(u6, u7);
843 
844   // stage 3
845   u0 = _mm_unpacklo_epi16(s2, s3);
846   u1 = _mm_unpackhi_epi16(s2, s3);
847   u2 = _mm_unpacklo_epi16(s6, s7);
848   u3 = _mm_unpackhi_epi16(s6, s7);
849 
850   v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
851   v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
852   v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
853   v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
854   v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
855   v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
856   v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
857   v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
858 
859   u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
860   u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
861   u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
862   u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
863   u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
864   u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
865   u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
866   u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
867 
868   v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
869   v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
870   v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
871   v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
872   v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
873   v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
874   v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
875   v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
876 
877   s2 = _mm_packs_epi32(v0, v1);
878   s3 = _mm_packs_epi32(v2, v3);
879   s6 = _mm_packs_epi32(v4, v5);
880   s7 = _mm_packs_epi32(v6, v7);
881 
882   in[0] = s0;
883   in[1] = _mm_sub_epi16(k__const_0, s4);
884   in[2] = s6;
885   in[3] = _mm_sub_epi16(k__const_0, s2);
886   in[4] = s3;
887   in[5] = _mm_sub_epi16(k__const_0, s7);
888   in[6] = s5;
889   in[7] = _mm_sub_epi16(k__const_0, s1);
890 }
891 
892 
893 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
894                             int tx_type) {
895   __m128i in[8];
896   const __m128i zero = _mm_setzero_si128();
897   const __m128i final_rounding = _mm_set1_epi16(1<<4);
898 
899   // load input data
900   in[0] = _mm_load_si128((const __m128i *)input);
901   in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
902   in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
903   in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
904   in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
905   in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
906   in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
907   in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
908 
909   switch (tx_type) {
910     case 0:  // DCT_DCT
911       idct8_1d_sse2(in);
912       idct8_1d_sse2(in);
913       break;
914     case 1:  // ADST_DCT
915       idct8_1d_sse2(in);
916       iadst8_1d_sse2(in);
917       break;
918     case 2:  // DCT_ADST
919       iadst8_1d_sse2(in);
920       idct8_1d_sse2(in);
921       break;
922     case 3:  // ADST_ADST
923       iadst8_1d_sse2(in);
924       iadst8_1d_sse2(in);
925       break;
926     default:
927       assert(0);
928       break;
929   }
930 
931   // Final rounding and shift
932   in[0] = _mm_adds_epi16(in[0], final_rounding);
933   in[1] = _mm_adds_epi16(in[1], final_rounding);
934   in[2] = _mm_adds_epi16(in[2], final_rounding);
935   in[3] = _mm_adds_epi16(in[3], final_rounding);
936   in[4] = _mm_adds_epi16(in[4], final_rounding);
937   in[5] = _mm_adds_epi16(in[5], final_rounding);
938   in[6] = _mm_adds_epi16(in[6], final_rounding);
939   in[7] = _mm_adds_epi16(in[7], final_rounding);
940 
941   in[0] = _mm_srai_epi16(in[0], 5);
942   in[1] = _mm_srai_epi16(in[1], 5);
943   in[2] = _mm_srai_epi16(in[2], 5);
944   in[3] = _mm_srai_epi16(in[3], 5);
945   in[4] = _mm_srai_epi16(in[4], 5);
946   in[5] = _mm_srai_epi16(in[5], 5);
947   in[6] = _mm_srai_epi16(in[6], 5);
948   in[7] = _mm_srai_epi16(in[7], 5);
949 
950   RECON_AND_STORE(dest, in[0]);
951   RECON_AND_STORE(dest, in[1]);
952   RECON_AND_STORE(dest, in[2]);
953   RECON_AND_STORE(dest, in[3]);
954   RECON_AND_STORE(dest, in[4]);
955   RECON_AND_STORE(dest, in[5]);
956   RECON_AND_STORE(dest, in[6]);
957   RECON_AND_STORE(dest, in[7]);
958 }
959 
960 void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
961   const __m128i zero = _mm_setzero_si128();
962   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
963   const __m128i final_rounding = _mm_set1_epi16(1<<4);
964   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
965   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
966   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
967   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
968   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
969   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
970   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
971   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
972   const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
973 
974   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
975   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
976   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
977   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
978 
979   // Rows. Load 4-row input data.
980   in0 = _mm_load_si128((const __m128i *)input);
981   in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
982   in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
983   in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
984 
985   // 8x4 Transpose
986   TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
987 
988   // Stage1
989   { //NOLINT
990     const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
991     const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);
992 
993     tmp0 = _mm_madd_epi16(lo_17, stg1_0);
994     tmp2 = _mm_madd_epi16(lo_17, stg1_1);
995     tmp4 = _mm_madd_epi16(lo_35, stg1_2);
996     tmp6 = _mm_madd_epi16(lo_35, stg1_3);
997 
998     tmp0 = _mm_add_epi32(tmp0, rounding);
999     tmp2 = _mm_add_epi32(tmp2, rounding);
1000     tmp4 = _mm_add_epi32(tmp4, rounding);
1001     tmp6 = _mm_add_epi32(tmp6, rounding);
1002     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1003     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1004     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
1005     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
1006 
1007     stp1_4 = _mm_packs_epi32(tmp0, zero);
1008     stp1_7 = _mm_packs_epi32(tmp2, zero);
1009     stp1_5 = _mm_packs_epi32(tmp4, zero);
1010     stp1_6 = _mm_packs_epi32(tmp6, zero);
1011   }
1012 
1013   // Stage2
1014   { //NOLINT
1015     const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
1016     const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);
1017 
1018     tmp0 = _mm_madd_epi16(lo_04, stg2_0);
1019     tmp2 = _mm_madd_epi16(lo_04, stg2_1);
1020     tmp4 = _mm_madd_epi16(lo_26, stg2_2);
1021     tmp6 = _mm_madd_epi16(lo_26, stg2_3);
1022 
1023     tmp0 = _mm_add_epi32(tmp0, rounding);
1024     tmp2 = _mm_add_epi32(tmp2, rounding);
1025     tmp4 = _mm_add_epi32(tmp4, rounding);
1026     tmp6 = _mm_add_epi32(tmp6, rounding);
1027     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1028     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1029     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
1030     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
1031 
1032     stp2_0 = _mm_packs_epi32(tmp0, zero);
1033     stp2_1 = _mm_packs_epi32(tmp2, zero);
1034     stp2_2 = _mm_packs_epi32(tmp4, zero);
1035     stp2_3 = _mm_packs_epi32(tmp6, zero);
1036 
1037     stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
1038     stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
1039     stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
1040     stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
1041   }
1042 
1043   // Stage3
1044   { //NOLINT
1045     const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
1046     stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
1047     stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
1048     stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
1049     stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);
1050 
1051     tmp0 = _mm_madd_epi16(lo_56, stg3_0);
1052     tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
1053 
1054     tmp0 = _mm_add_epi32(tmp0, rounding);
1055     tmp2 = _mm_add_epi32(tmp2, rounding);
1056     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
1057     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
1058 
1059     stp1_5 = _mm_packs_epi32(tmp0, zero);
1060     stp1_6 = _mm_packs_epi32(tmp2, zero);
1061   }
1062 
1063   // Stage4
1064   in0 = _mm_adds_epi16(stp1_0, stp2_7);
1065   in1 = _mm_adds_epi16(stp1_1, stp1_6);
1066   in2 = _mm_adds_epi16(stp1_2, stp1_5);
1067   in3 = _mm_adds_epi16(stp1_3, stp2_4);
1068   in4 = _mm_subs_epi16(stp1_3, stp2_4);
1069   in5 = _mm_subs_epi16(stp1_2, stp1_5);
1070   in6 = _mm_subs_epi16(stp1_1, stp1_6);
1071   in7 = _mm_subs_epi16(stp1_0, stp2_7);
1072 
1073   // Columns. 4x8 Transpose
1074   TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
1075                 in4, in5, in6, in7)
1076 
1077   // 1D idct8x8
1078   IDCT8_1D
1079 
1080   // Final rounding and shift
1081   in0 = _mm_adds_epi16(in0, final_rounding);
1082   in1 = _mm_adds_epi16(in1, final_rounding);
1083   in2 = _mm_adds_epi16(in2, final_rounding);
1084   in3 = _mm_adds_epi16(in3, final_rounding);
1085   in4 = _mm_adds_epi16(in4, final_rounding);
1086   in5 = _mm_adds_epi16(in5, final_rounding);
1087   in6 = _mm_adds_epi16(in6, final_rounding);
1088   in7 = _mm_adds_epi16(in7, final_rounding);
1089 
1090   in0 = _mm_srai_epi16(in0, 5);
1091   in1 = _mm_srai_epi16(in1, 5);
1092   in2 = _mm_srai_epi16(in2, 5);
1093   in3 = _mm_srai_epi16(in3, 5);
1094   in4 = _mm_srai_epi16(in4, 5);
1095   in5 = _mm_srai_epi16(in5, 5);
1096   in6 = _mm_srai_epi16(in6, 5);
1097   in7 = _mm_srai_epi16(in7, 5);
1098 
1099   RECON_AND_STORE(dest, in0);
1100   RECON_AND_STORE(dest, in1);
1101   RECON_AND_STORE(dest, in2);
1102   RECON_AND_STORE(dest, in3);
1103   RECON_AND_STORE(dest, in4);
1104   RECON_AND_STORE(dest, in5);
1105   RECON_AND_STORE(dest, in6);
1106   RECON_AND_STORE(dest, in7);
1107 }
1108 
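// Stages 2-6 of the 16-point inverse DCT on the vectors in0-in15.  The final
// stage-7 add/sub that produces the output ordering is left to the caller,
// so the results stay in the stp1_*/stp2_* registers.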
1109 #define IDCT16_1D \
1110   /* Stage2 */ \
1111   { \
1112     const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
1113     const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
1114     const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7);   \
1115     const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7);   \
1116     const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
1117     const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
1118     const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
1119     const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
1120     \
1121     MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
1122                            stg2_0, stg2_1, stg2_2, stg2_3, \
1123                            stp2_8, stp2_15, stp2_9, stp2_14) \
1124     \
1125     MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
1126                            stg2_4, stg2_5, stg2_6, stg2_7, \
1127                            stp2_10, stp2_13, stp2_11, stp2_12) \
1128   } \
1129     \
1130   /* Stage3 */ \
1131   { \
1132     const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
1133     const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
1134     const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
1135     const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
1136     \
1137     MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
1138                            stg3_0, stg3_1, stg3_2, stg3_3, \
1139                            stp1_4, stp1_7, stp1_5, stp1_6) \
1140     \
1141     stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
1142     stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
1143     stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
1144     stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
1145     \
1146     stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
1147     stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
1148     stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
1149     stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
1150   } \
1151   \
1152   /* Stage4 */ \
1153   { \
1154     const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
1155     const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
1156     const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
1157     const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
1158     \
1159     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
1160     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
1161     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1162     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1163     \
1164     MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
1165                            stg4_0, stg4_1, stg4_2, stg4_3, \
1166                            stp2_0, stp2_1, stp2_2, stp2_3) \
1167     \
1168     stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
1169     stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
1170     stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
1171     stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
1172     \
1173     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1174                            stg4_4, stg4_5, stg4_6, stg4_7, \
1175                            stp2_9, stp2_14, stp2_10, stp2_13) \
1176   } \
1177     \
1178   /* Stage5 */ \
1179   { \
1180     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1181     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1182     \
1183     stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
1184     stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
1185     stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
1186     stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
1187     \
1188     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1189     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1190     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1191     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1192     \
1193     tmp0 = _mm_add_epi32(tmp0, rounding); \
1194     tmp1 = _mm_add_epi32(tmp1, rounding); \
1195     tmp2 = _mm_add_epi32(tmp2, rounding); \
1196     tmp3 = _mm_add_epi32(tmp3, rounding); \
1197     \
1198     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1199     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1200     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1201     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1202     \
1203     stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1204     stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1205     \
1206     stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
1207     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
1208     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
1209     stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1210     \
1211     stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1212     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
1213     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
1214     stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1215   } \
1216     \
1217   /* Stage6 */ \
1218   { \
1219     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1220     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1221     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1222     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1223     \
1224     stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1225     stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1226     stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1227     stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1228     stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1229     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1230     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1231     stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1232     \
1233     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1234                            stg6_0, stg4_0, stg6_0, stg4_0, \
1235                            stp2_10, stp2_13, stp2_11, stp2_12) \
1236   }
1237 
1238 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
1239                                 int stride) {
1240   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1241   const __m128i final_rounding = _mm_set1_epi16(1<<5);
1242   const __m128i zero = _mm_setzero_si128();
1243 
1244   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1245   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1246   const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1247   const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1248   const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1249   const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1250   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1251   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1252 
1253   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1254   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1255   const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1256   const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1257 
1258   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1259   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1260   const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1261   const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1262   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1263   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1264   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1265   const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1266 
1267   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1268 
1269   __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
1270           in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
1271           in10 = zero, in11 = zero, in12 = zero, in13 = zero,
1272           in14 = zero, in15 = zero;
1273   __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
1274           l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
1275           l12 = zero, l13 = zero, l14 = zero, l15 = zero;
1276   __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
1277           r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
1278           r12 = zero, r13 = zero, r14 = zero, r15 = zero;
1279   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1280           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1281           stp1_8_0, stp1_12_0;
1282   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1283           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1284   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1285   int i;
1286 
1287   // We work on an 8x16 block each time, and loop 4 times for the 2-D 16x16 idct.
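  // Passes 0 and 1 run the first 1-D transform on the two halves of the input
  // and keep the intermediates in l0-l15 and r0-r15; passes 2 and 3 transpose
  // those intermediates and run the second 1-D transform.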
1288   for (i = 0; i < 4; i++) {
1289     // 1-D idct
1290     if (i < 2) {
1291       if (i == 1) input += 128;
1292 
1293       // Load input data.
1294       in0 = _mm_load_si128((const __m128i *)input);
1295       in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
1296       in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
1297       in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
1298       in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
1299       in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
1300       in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
1301       in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
1302       in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
1303       in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
1304       in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
1305       in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
1306       in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
1307       in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
1308       in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
1309       in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
1310 
1311       TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
1312                     in4, in5, in6, in7);
1313       TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
1314                     in10, in11, in12, in13, in14, in15);
1315     }
1316 
1317     if (i == 2) {
1318       TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
1319                     in5, in6, in7);
1320       TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
1321                     in13, in14, in15);
1322     }
1323 
1324     if (i == 3) {
1325       TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
1326                     in4, in5, in6, in7);
1327       TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
1328                     in12, in13, in14, in15);
1329     }
1330 
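    // IDCT16_1D (defined earlier in this file) expands to stages 1-6 of the
    // 16-point idct on in0..in15; stage 7 is applied explicitly below.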
1331     IDCT16_1D
1332 
1333     // Stage7
1334     if (i == 0) {
1335       // Left 8x16
1336       l0 = _mm_add_epi16(stp2_0, stp1_15);
1337       l1 = _mm_add_epi16(stp2_1, stp1_14);
1338       l2 = _mm_add_epi16(stp2_2, stp2_13);
1339       l3 = _mm_add_epi16(stp2_3, stp2_12);
1340       l4 = _mm_add_epi16(stp2_4, stp2_11);
1341       l5 = _mm_add_epi16(stp2_5, stp2_10);
1342       l6 = _mm_add_epi16(stp2_6, stp1_9);
1343       l7 = _mm_add_epi16(stp2_7, stp1_8);
1344       l8 = _mm_sub_epi16(stp2_7, stp1_8);
1345       l9 = _mm_sub_epi16(stp2_6, stp1_9);
1346       l10 = _mm_sub_epi16(stp2_5, stp2_10);
1347       l11 = _mm_sub_epi16(stp2_4, stp2_11);
1348       l12 = _mm_sub_epi16(stp2_3, stp2_12);
1349       l13 = _mm_sub_epi16(stp2_2, stp2_13);
1350       l14 = _mm_sub_epi16(stp2_1, stp1_14);
1351       l15 = _mm_sub_epi16(stp2_0, stp1_15);
1352     } else if (i == 1) {
1353       // Right 8x16
1354       r0 = _mm_add_epi16(stp2_0, stp1_15);
1355       r1 = _mm_add_epi16(stp2_1, stp1_14);
1356       r2 = _mm_add_epi16(stp2_2, stp2_13);
1357       r3 = _mm_add_epi16(stp2_3, stp2_12);
1358       r4 = _mm_add_epi16(stp2_4, stp2_11);
1359       r5 = _mm_add_epi16(stp2_5, stp2_10);
1360       r6 = _mm_add_epi16(stp2_6, stp1_9);
1361       r7 = _mm_add_epi16(stp2_7, stp1_8);
1362       r8 = _mm_sub_epi16(stp2_7, stp1_8);
1363       r9 = _mm_sub_epi16(stp2_6, stp1_9);
1364       r10 = _mm_sub_epi16(stp2_5, stp2_10);
1365       r11 = _mm_sub_epi16(stp2_4, stp2_11);
1366       r12 = _mm_sub_epi16(stp2_3, stp2_12);
1367       r13 = _mm_sub_epi16(stp2_2, stp2_13);
1368       r14 = _mm_sub_epi16(stp2_1, stp1_14);
1369       r15 = _mm_sub_epi16(stp2_0, stp1_15);
1370     } else {
1371       // 2-D
1372       in0 = _mm_add_epi16(stp2_0, stp1_15);
1373       in1 = _mm_add_epi16(stp2_1, stp1_14);
1374       in2 = _mm_add_epi16(stp2_2, stp2_13);
1375       in3 = _mm_add_epi16(stp2_3, stp2_12);
1376       in4 = _mm_add_epi16(stp2_4, stp2_11);
1377       in5 = _mm_add_epi16(stp2_5, stp2_10);
1378       in6 = _mm_add_epi16(stp2_6, stp1_9);
1379       in7 = _mm_add_epi16(stp2_7, stp1_8);
1380       in8 = _mm_sub_epi16(stp2_7, stp1_8);
1381       in9 = _mm_sub_epi16(stp2_6, stp1_9);
1382       in10 = _mm_sub_epi16(stp2_5, stp2_10);
1383       in11 = _mm_sub_epi16(stp2_4, stp2_11);
1384       in12 = _mm_sub_epi16(stp2_3, stp2_12);
1385       in13 = _mm_sub_epi16(stp2_2, stp2_13);
1386       in14 = _mm_sub_epi16(stp2_1, stp1_14);
1387       in15 = _mm_sub_epi16(stp2_0, stp1_15);
1388 
1389       // Final rounding and shift
1390       in0 = _mm_adds_epi16(in0, final_rounding);
1391       in1 = _mm_adds_epi16(in1, final_rounding);
1392       in2 = _mm_adds_epi16(in2, final_rounding);
1393       in3 = _mm_adds_epi16(in3, final_rounding);
1394       in4 = _mm_adds_epi16(in4, final_rounding);
1395       in5 = _mm_adds_epi16(in5, final_rounding);
1396       in6 = _mm_adds_epi16(in6, final_rounding);
1397       in7 = _mm_adds_epi16(in7, final_rounding);
1398       in8 = _mm_adds_epi16(in8, final_rounding);
1399       in9 = _mm_adds_epi16(in9, final_rounding);
1400       in10 = _mm_adds_epi16(in10, final_rounding);
1401       in11 = _mm_adds_epi16(in11, final_rounding);
1402       in12 = _mm_adds_epi16(in12, final_rounding);
1403       in13 = _mm_adds_epi16(in13, final_rounding);
1404       in14 = _mm_adds_epi16(in14, final_rounding);
1405       in15 = _mm_adds_epi16(in15, final_rounding);
1406 
1407       in0 = _mm_srai_epi16(in0, 6);
1408       in1 = _mm_srai_epi16(in1, 6);
1409       in2 = _mm_srai_epi16(in2, 6);
1410       in3 = _mm_srai_epi16(in3, 6);
1411       in4 = _mm_srai_epi16(in4, 6);
1412       in5 = _mm_srai_epi16(in5, 6);
1413       in6 = _mm_srai_epi16(in6, 6);
1414       in7 = _mm_srai_epi16(in7, 6);
1415       in8 = _mm_srai_epi16(in8, 6);
1416       in9 = _mm_srai_epi16(in9, 6);
1417       in10 = _mm_srai_epi16(in10, 6);
1418       in11 = _mm_srai_epi16(in11, 6);
1419       in12 = _mm_srai_epi16(in12, 6);
1420       in13 = _mm_srai_epi16(in13, 6);
1421       in14 = _mm_srai_epi16(in14, 6);
1422       in15 = _mm_srai_epi16(in15, 6);
1423 
1424       RECON_AND_STORE(dest, in0);
1425       RECON_AND_STORE(dest, in1);
1426       RECON_AND_STORE(dest, in2);
1427       RECON_AND_STORE(dest, in3);
1428       RECON_AND_STORE(dest, in4);
1429       RECON_AND_STORE(dest, in5);
1430       RECON_AND_STORE(dest, in6);
1431       RECON_AND_STORE(dest, in7);
1432       RECON_AND_STORE(dest, in8);
1433       RECON_AND_STORE(dest, in9);
1434       RECON_AND_STORE(dest, in10);
1435       RECON_AND_STORE(dest, in11);
1436       RECON_AND_STORE(dest, in12);
1437       RECON_AND_STORE(dest, in13);
1438       RECON_AND_STORE(dest, in14);
1439       RECON_AND_STORE(dest, in15);
1440 
1441       dest += 8 - (stride * 16);
1442     }
1443   }
1444 }
1445 
1446 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
1447   __m128i dc_value;
1448   const __m128i zero = _mm_setzero_si128();
1449   int a, i;
1450 
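  // DC-only case: compute the single output value that every pixel of the
  // 16x16 block receives (two multiplications by cospi_16_64 with DCT
  // rounding, then the final rounding shift by 6) and add it to the
  // prediction with RECON_AND_STORE.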
1451   a = dct_const_round_shift(input[0] * cospi_16_64);
1452   a = dct_const_round_shift(a * cospi_16_64);
1453   a = ROUND_POWER_OF_TWO(a, 6);
1454 
1455   dc_value = _mm_set1_epi16(a);
1456 
1457   for (i = 0; i < 2; ++i) {
1458     RECON_AND_STORE(dest, dc_value);
1459     RECON_AND_STORE(dest, dc_value);
1460     RECON_AND_STORE(dest, dc_value);
1461     RECON_AND_STORE(dest, dc_value);
1462     RECON_AND_STORE(dest, dc_value);
1463     RECON_AND_STORE(dest, dc_value);
1464     RECON_AND_STORE(dest, dc_value);
1465     RECON_AND_STORE(dest, dc_value);
1466     RECON_AND_STORE(dest, dc_value);
1467     RECON_AND_STORE(dest, dc_value);
1468     RECON_AND_STORE(dest, dc_value);
1469     RECON_AND_STORE(dest, dc_value);
1470     RECON_AND_STORE(dest, dc_value);
1471     RECON_AND_STORE(dest, dc_value);
1472     RECON_AND_STORE(dest, dc_value);
1473     RECON_AND_STORE(dest, dc_value);
1474     dest += 8 - (stride * 16);
1475   }
1476 }
1477 
1478 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
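  // res0 holds the left 8 columns and res1 the right 8 columns of a 16x16
  // block, 16 rows each.  The two diagonal 8x8 quadrants are transposed in
  // place; the two off-diagonal quadrants are transposed and swapped, with
  // tbuf as scratch so res0[8..15] is not clobbered before it has been
  // transposed into res1[0..7].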
1479   __m128i tbuf[8];
1480   array_transpose_8x8(res0, res0);
1481   array_transpose_8x8(res1, tbuf);
1482   array_transpose_8x8(res0 + 8, res1);
1483   array_transpose_8x8(res1 + 8, res1 + 8);
1484 
1485   res0[8] = tbuf[0];
1486   res0[9] = tbuf[1];
1487   res0[10] = tbuf[2];
1488   res0[11] = tbuf[3];
1489   res0[12] = tbuf[4];
1490   res0[13] = tbuf[5];
1491   res0[14] = tbuf[6];
1492   res0[15] = tbuf[7];
1493 }
1494 
1495 static void iadst16_1d_8col(__m128i *in) {
1496   // Perform a 1-D 16-point ADST on 8 columns.
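  // Each butterfly stage below follows the same fixed-point pattern: pairs of
  // 16-bit inputs are interleaved with unpacklo/unpackhi, multiplied against a
  // (cos, sin) constant pair with _mm_madd_epi16 to get 32-bit sums, rounded
  // with DCT_CONST_ROUNDING, shifted right by DCT_CONST_BITS, and packed back
  // to 16 bits with _mm_packs_epi32.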
1497   __m128i s[16], x[16], u[32], v[32];
1498   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1499   const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1500   const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1501   const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1502   const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1503   const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1504   const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1505   const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1506   const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1507   const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1508   const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1509   const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1510   const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1511   const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1512   const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1513   const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1514   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1515   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1516   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1517   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1518   const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1519   const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1520   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1521   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1522   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1523   const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
1524   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
1525   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1526   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1527   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1528   const __m128i kZero = _mm_set1_epi16(0);
1529 
1530   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1531   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1532   u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1533   u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1534   u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1535   u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1536   u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1537   u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1538   u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1539   u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1540   u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1541   u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1542   u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1543   u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1544   u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1545   u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1546 
1547   v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1548   v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1549   v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1550   v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1551   v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1552   v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1553   v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1554   v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1555   v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1556   v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1557   v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1558   v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1559   v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1560   v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1561   v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1562   v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1563   v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1564   v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1565   v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1566   v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1567   v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1568   v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1569   v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1570   v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1571   v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1572   v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1573   v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1574   v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1575   v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1576   v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1577   v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1578   v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1579 
1580   u[0] = _mm_add_epi32(v[0], v[16]);
1581   u[1] = _mm_add_epi32(v[1], v[17]);
1582   u[2] = _mm_add_epi32(v[2], v[18]);
1583   u[3] = _mm_add_epi32(v[3], v[19]);
1584   u[4] = _mm_add_epi32(v[4], v[20]);
1585   u[5] = _mm_add_epi32(v[5], v[21]);
1586   u[6] = _mm_add_epi32(v[6], v[22]);
1587   u[7] = _mm_add_epi32(v[7], v[23]);
1588   u[8] = _mm_add_epi32(v[8], v[24]);
1589   u[9] = _mm_add_epi32(v[9], v[25]);
1590   u[10] = _mm_add_epi32(v[10], v[26]);
1591   u[11] = _mm_add_epi32(v[11], v[27]);
1592   u[12] = _mm_add_epi32(v[12], v[28]);
1593   u[13] = _mm_add_epi32(v[13], v[29]);
1594   u[14] = _mm_add_epi32(v[14], v[30]);
1595   u[15] = _mm_add_epi32(v[15], v[31]);
1596   u[16] = _mm_sub_epi32(v[0], v[16]);
1597   u[17] = _mm_sub_epi32(v[1], v[17]);
1598   u[18] = _mm_sub_epi32(v[2], v[18]);
1599   u[19] = _mm_sub_epi32(v[3], v[19]);
1600   u[20] = _mm_sub_epi32(v[4], v[20]);
1601   u[21] = _mm_sub_epi32(v[5], v[21]);
1602   u[22] = _mm_sub_epi32(v[6], v[22]);
1603   u[23] = _mm_sub_epi32(v[7], v[23]);
1604   u[24] = _mm_sub_epi32(v[8], v[24]);
1605   u[25] = _mm_sub_epi32(v[9], v[25]);
1606   u[26] = _mm_sub_epi32(v[10], v[26]);
1607   u[27] = _mm_sub_epi32(v[11], v[27]);
1608   u[28] = _mm_sub_epi32(v[12], v[28]);
1609   u[29] = _mm_sub_epi32(v[13], v[29]);
1610   u[30] = _mm_sub_epi32(v[14], v[30]);
1611   u[31] = _mm_sub_epi32(v[15], v[31]);
1612 
1613   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1614   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1615   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1616   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1617   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1618   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1619   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1620   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1621   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1622   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1623   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1624   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1625   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1626   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1627   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1628   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1629   v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1630   v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1631   v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1632   v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1633   v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1634   v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1635   v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1636   v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1637   v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1638   v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1639   v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1640   v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1641   v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1642   v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1643   v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1644   v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1645 
1646   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1647   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1648   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1649   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1650   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1651   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1652   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1653   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1654   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1655   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1656   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1657   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1658   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1659   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1660   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1661   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1662   u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1663   u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1664   u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1665   u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1666   u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1667   u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1668   u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1669   u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1670   u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1671   u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1672   u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1673   u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1674   u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1675   u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1676   u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1677   u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1678 
1679   s[0] = _mm_packs_epi32(u[0], u[1]);
1680   s[1] = _mm_packs_epi32(u[2], u[3]);
1681   s[2] = _mm_packs_epi32(u[4], u[5]);
1682   s[3] = _mm_packs_epi32(u[6], u[7]);
1683   s[4] = _mm_packs_epi32(u[8], u[9]);
1684   s[5] = _mm_packs_epi32(u[10], u[11]);
1685   s[6] = _mm_packs_epi32(u[12], u[13]);
1686   s[7] = _mm_packs_epi32(u[14], u[15]);
1687   s[8] = _mm_packs_epi32(u[16], u[17]);
1688   s[9] = _mm_packs_epi32(u[18], u[19]);
1689   s[10] = _mm_packs_epi32(u[20], u[21]);
1690   s[11] = _mm_packs_epi32(u[22], u[23]);
1691   s[12] = _mm_packs_epi32(u[24], u[25]);
1692   s[13] = _mm_packs_epi32(u[26], u[27]);
1693   s[14] = _mm_packs_epi32(u[28], u[29]);
1694   s[15] = _mm_packs_epi32(u[30], u[31]);
1695 
1696   // stage 2
1697   u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1698   u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1699   u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1700   u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1701   u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1702   u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1703   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1704   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1705 
1706   v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1707   v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1708   v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1709   v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1710   v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1711   v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1712   v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1713   v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1714   v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1715   v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1716   v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1717   v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1718   v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1719   v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1720   v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1721   v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1722 
1723   u[0] = _mm_add_epi32(v[0], v[8]);
1724   u[1] = _mm_add_epi32(v[1], v[9]);
1725   u[2] = _mm_add_epi32(v[2], v[10]);
1726   u[3] = _mm_add_epi32(v[3], v[11]);
1727   u[4] = _mm_add_epi32(v[4], v[12]);
1728   u[5] = _mm_add_epi32(v[5], v[13]);
1729   u[6] = _mm_add_epi32(v[6], v[14]);
1730   u[7] = _mm_add_epi32(v[7], v[15]);
1731   u[8] = _mm_sub_epi32(v[0], v[8]);
1732   u[9] = _mm_sub_epi32(v[1], v[9]);
1733   u[10] = _mm_sub_epi32(v[2], v[10]);
1734   u[11] = _mm_sub_epi32(v[3], v[11]);
1735   u[12] = _mm_sub_epi32(v[4], v[12]);
1736   u[13] = _mm_sub_epi32(v[5], v[13]);
1737   u[14] = _mm_sub_epi32(v[6], v[14]);
1738   u[15] = _mm_sub_epi32(v[7], v[15]);
1739 
1740   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1741   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1742   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1743   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1744   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1745   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1746   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1747   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1748   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1749   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1750   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1751   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1752   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1753   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1754   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1755   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1756 
1757   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1758   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1759   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1760   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1761   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1762   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1763   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1764   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1765   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1766   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1767   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1768   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1769   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1770   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1771   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1772   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1773 
1774   x[0] = _mm_add_epi16(s[0], s[4]);
1775   x[1] = _mm_add_epi16(s[1], s[5]);
1776   x[2] = _mm_add_epi16(s[2], s[6]);
1777   x[3] = _mm_add_epi16(s[3], s[7]);
1778   x[4] = _mm_sub_epi16(s[0], s[4]);
1779   x[5] = _mm_sub_epi16(s[1], s[5]);
1780   x[6] = _mm_sub_epi16(s[2], s[6]);
1781   x[7] = _mm_sub_epi16(s[3], s[7]);
1782   x[8] = _mm_packs_epi32(u[0], u[1]);
1783   x[9] = _mm_packs_epi32(u[2], u[3]);
1784   x[10] = _mm_packs_epi32(u[4], u[5]);
1785   x[11] = _mm_packs_epi32(u[6], u[7]);
1786   x[12] = _mm_packs_epi32(u[8], u[9]);
1787   x[13] = _mm_packs_epi32(u[10], u[11]);
1788   x[14] = _mm_packs_epi32(u[12], u[13]);
1789   x[15] = _mm_packs_epi32(u[14], u[15]);
1790 
1791   // stage 3
1792   u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1793   u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1794   u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1795   u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1796   u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1797   u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1798   u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1799   u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1800 
1801   v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1802   v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1803   v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1804   v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1805   v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1806   v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1807   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1808   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1809   v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1810   v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1811   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1812   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1813   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1814   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1815   v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1816   v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1817 
1818   u[0] = _mm_add_epi32(v[0], v[4]);
1819   u[1] = _mm_add_epi32(v[1], v[5]);
1820   u[2] = _mm_add_epi32(v[2], v[6]);
1821   u[3] = _mm_add_epi32(v[3], v[7]);
1822   u[4] = _mm_sub_epi32(v[0], v[4]);
1823   u[5] = _mm_sub_epi32(v[1], v[5]);
1824   u[6] = _mm_sub_epi32(v[2], v[6]);
1825   u[7] = _mm_sub_epi32(v[3], v[7]);
1826   u[8] = _mm_add_epi32(v[8], v[12]);
1827   u[9] = _mm_add_epi32(v[9], v[13]);
1828   u[10] = _mm_add_epi32(v[10], v[14]);
1829   u[11] = _mm_add_epi32(v[11], v[15]);
1830   u[12] = _mm_sub_epi32(v[8], v[12]);
1831   u[13] = _mm_sub_epi32(v[9], v[13]);
1832   u[14] = _mm_sub_epi32(v[10], v[14]);
1833   u[15] = _mm_sub_epi32(v[11], v[15]);
1834 
1835   u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1836   u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1837   u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1838   u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1839   u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1840   u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1841   u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1842   u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1843   u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1844   u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1845   u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1846   u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1847   u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1848   u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1849   u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1850   u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1851 
1852   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1853   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1854   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1855   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1856   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1857   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1858   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1859   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1860   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1861   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1862   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1863   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1864   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1865   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1866   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1867   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1868 
1869   s[0] = _mm_add_epi16(x[0], x[2]);
1870   s[1] = _mm_add_epi16(x[1], x[3]);
1871   s[2] = _mm_sub_epi16(x[0], x[2]);
1872   s[3] = _mm_sub_epi16(x[1], x[3]);
1873   s[4] = _mm_packs_epi32(v[0], v[1]);
1874   s[5] = _mm_packs_epi32(v[2], v[3]);
1875   s[6] = _mm_packs_epi32(v[4], v[5]);
1876   s[7] = _mm_packs_epi32(v[6], v[7]);
1877   s[8] = _mm_add_epi16(x[8], x[10]);
1878   s[9] = _mm_add_epi16(x[9], x[11]);
1879   s[10] = _mm_sub_epi16(x[8], x[10]);
1880   s[11] = _mm_sub_epi16(x[9], x[11]);
1881   s[12] = _mm_packs_epi32(v[8], v[9]);
1882   s[13] = _mm_packs_epi32(v[10], v[11]);
1883   s[14] = _mm_packs_epi32(v[12], v[13]);
1884   s[15] = _mm_packs_epi32(v[14], v[15]);
1885 
1886   // stage 4
1887   u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1888   u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1889   u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1890   u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1891   u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1892   u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1893   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1894   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1895 
1896   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1897   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1898   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1899   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1900   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1901   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1902   v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1903   v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1904   v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1905   v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1906   v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1907   v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1908   v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1909   v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1910   v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1911   v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1912 
1913   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1914   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1915   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1916   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1917   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1918   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1919   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1920   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1921   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1922   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1923   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1924   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1925   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1926   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1927   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1928   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1929 
1930   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1931   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1932   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1933   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1934   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1935   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1936   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1937   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1938   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1939   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1940   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1941   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1942   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1943   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1944   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1945   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1946 
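  // Write the ADST results back in natural output order; in[1], in[3], in[13]
  // and in[15] are negated as required by the transform definition.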
1947   in[0] = s[0];
1948   in[1] = _mm_sub_epi16(kZero, s[8]);
1949   in[2] = s[12];
1950   in[3] = _mm_sub_epi16(kZero, s[4]);
1951   in[4] = _mm_packs_epi32(v[4], v[5]);
1952   in[5] = _mm_packs_epi32(v[12], v[13]);
1953   in[6] = _mm_packs_epi32(v[8], v[9]);
1954   in[7] = _mm_packs_epi32(v[0], v[1]);
1955   in[8] = _mm_packs_epi32(v[2], v[3]);
1956   in[9] = _mm_packs_epi32(v[10], v[11]);
1957   in[10] = _mm_packs_epi32(v[14], v[15]);
1958   in[11] = _mm_packs_epi32(v[6], v[7]);
1959   in[12] = s[5];
1960   in[13] = _mm_sub_epi16(kZero, s[13]);
1961   in[14] = s[9];
1962   in[15] = _mm_sub_epi16(kZero, s[1]);
1963 }
1964 
1965 static void idct16_1d_8col(__m128i *in) {
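  // Perform a 1-D 16-point idct on 8 columns held in in[0..15], following the
  // seven-stage butterfly structure of the scalar 16-point idct.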
1966   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1967   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
1968   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1969   const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
1970   const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1971   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
1972   const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1973   const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
1974   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1975   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1976   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1977   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1978   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
1979   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1980   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1981   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1982   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1983   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1984   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1985   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1986   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1987   __m128i v[16], u[16], s[16], t[16];
1988 
1989   // stage 1
1990   s[0] = in[0];
1991   s[1] = in[8];
1992   s[2] = in[4];
1993   s[3] = in[12];
1994   s[4] = in[2];
1995   s[5] = in[10];
1996   s[6] = in[6];
1997   s[7] = in[14];
1998   s[8] = in[1];
1999   s[9] = in[9];
2000   s[10] = in[5];
2001   s[11] = in[13];
2002   s[12] = in[3];
2003   s[13] = in[11];
2004   s[14] = in[7];
2005   s[15] = in[15];
2006 
2007   // stage 2
2008   u[0] = _mm_unpacklo_epi16(s[8], s[15]);
2009   u[1] = _mm_unpackhi_epi16(s[8], s[15]);
2010   u[2] = _mm_unpacklo_epi16(s[9], s[14]);
2011   u[3] = _mm_unpackhi_epi16(s[9], s[14]);
2012   u[4] = _mm_unpacklo_epi16(s[10], s[13]);
2013   u[5] = _mm_unpackhi_epi16(s[10], s[13]);
2014   u[6] = _mm_unpacklo_epi16(s[11], s[12]);
2015   u[7] = _mm_unpackhi_epi16(s[11], s[12]);
2016 
2017   v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
2018   v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
2019   v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
2020   v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
2021   v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
2022   v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
2023   v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
2024   v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
2025   v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
2026   v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
2027   v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
2028   v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
2029   v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
2030   v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
2031   v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
2032   v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
2033 
2034   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2035   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2036   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2037   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2038   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2039   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2040   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2041   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2042   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2043   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2044   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2045   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2046   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2047   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2048   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2049   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2050 
2051   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2052   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2053   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2054   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2055   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2056   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2057   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2058   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2059   u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2060   u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2061   u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2062   u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2063   u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2064   u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2065   u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2066   u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2067 
2068   s[8]  = _mm_packs_epi32(u[0], u[1]);
2069   s[15] = _mm_packs_epi32(u[2], u[3]);
2070   s[9]  = _mm_packs_epi32(u[4], u[5]);
2071   s[14] = _mm_packs_epi32(u[6], u[7]);
2072   s[10] = _mm_packs_epi32(u[8], u[9]);
2073   s[13] = _mm_packs_epi32(u[10], u[11]);
2074   s[11] = _mm_packs_epi32(u[12], u[13]);
2075   s[12] = _mm_packs_epi32(u[14], u[15]);
2076 
2077   // stage 3
2078   t[0] = s[0];
2079   t[1] = s[1];
2080   t[2] = s[2];
2081   t[3] = s[3];
2082   u[0] = _mm_unpacklo_epi16(s[4], s[7]);
2083   u[1] = _mm_unpackhi_epi16(s[4], s[7]);
2084   u[2] = _mm_unpacklo_epi16(s[5], s[6]);
2085   u[3] = _mm_unpackhi_epi16(s[5], s[6]);
2086 
2087   v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
2088   v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
2089   v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
2090   v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
2091   v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
2092   v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
2093   v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
2094   v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
2095 
2096   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2097   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2098   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2099   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2100   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2101   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2102   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2103   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2104 
2105   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2106   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2107   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2108   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2109   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2110   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2111   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2112   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2113 
2114   t[4] = _mm_packs_epi32(u[0], u[1]);
2115   t[7] = _mm_packs_epi32(u[2], u[3]);
2116   t[5] = _mm_packs_epi32(u[4], u[5]);
2117   t[6] = _mm_packs_epi32(u[6], u[7]);
2118   t[8] = _mm_add_epi16(s[8], s[9]);
2119   t[9] = _mm_sub_epi16(s[8], s[9]);
2120   t[10] = _mm_sub_epi16(s[11], s[10]);
2121   t[11] = _mm_add_epi16(s[10], s[11]);
2122   t[12] = _mm_add_epi16(s[12], s[13]);
2123   t[13] = _mm_sub_epi16(s[12], s[13]);
2124   t[14] = _mm_sub_epi16(s[15], s[14]);
2125   t[15] = _mm_add_epi16(s[14], s[15]);
2126 
2127   // stage 4
2128   u[0] = _mm_unpacklo_epi16(t[0], t[1]);
2129   u[1] = _mm_unpackhi_epi16(t[0], t[1]);
2130   u[2] = _mm_unpacklo_epi16(t[2], t[3]);
2131   u[3] = _mm_unpackhi_epi16(t[2], t[3]);
2132   u[4] = _mm_unpacklo_epi16(t[9], t[14]);
2133   u[5] = _mm_unpackhi_epi16(t[9], t[14]);
2134   u[6] = _mm_unpacklo_epi16(t[10], t[13]);
2135   u[7] = _mm_unpackhi_epi16(t[10], t[13]);
2136 
2137   v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2138   v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2139   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
2140   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
2141   v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
2142   v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
2143   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
2144   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
2145   v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
2146   v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
2147   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
2148   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
2149   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
2150   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
2151   v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
2152   v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
2153 
2154   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2155   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2156   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2157   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2158   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2159   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2160   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2161   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2162   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
2163   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
2164   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
2165   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
2166   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
2167   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
2168   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
2169   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
2170 
2171   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2172   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2173   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2174   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2175   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2176   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2177   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2178   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2179   u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2180   u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2181   u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2182   u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2183   u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2184   u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2185   u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2186   u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2187 
2188   s[0] = _mm_packs_epi32(u[0], u[1]);
2189   s[1] = _mm_packs_epi32(u[2], u[3]);
2190   s[2] = _mm_packs_epi32(u[4], u[5]);
2191   s[3] = _mm_packs_epi32(u[6], u[7]);
2192   s[4] = _mm_add_epi16(t[4], t[5]);
2193   s[5] = _mm_sub_epi16(t[4], t[5]);
2194   s[6] = _mm_sub_epi16(t[7], t[6]);
2195   s[7] = _mm_add_epi16(t[6], t[7]);
2196   s[8] = t[8];
2197   s[15] = t[15];
2198   s[9]  = _mm_packs_epi32(u[8], u[9]);
2199   s[14] = _mm_packs_epi32(u[10], u[11]);
2200   s[10] = _mm_packs_epi32(u[12], u[13]);
2201   s[13] = _mm_packs_epi32(u[14], u[15]);
2202   s[11] = t[11];
2203   s[12] = t[12];
2204 
2205   // stage 5
2206   t[0] = _mm_add_epi16(s[0], s[3]);
2207   t[1] = _mm_add_epi16(s[1], s[2]);
2208   t[2] = _mm_sub_epi16(s[1], s[2]);
2209   t[3] = _mm_sub_epi16(s[0], s[3]);
2210   t[4] = s[4];
2211   t[7] = s[7];
2212 
2213   u[0] = _mm_unpacklo_epi16(s[5], s[6]);
2214   u[1] = _mm_unpackhi_epi16(s[5], s[6]);
2215   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2216   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2217   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2218   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2219   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2220   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2221   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2222   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2223   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2224   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2225   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2226   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2227   t[5] = _mm_packs_epi32(u[0], u[1]);
2228   t[6] = _mm_packs_epi32(u[2], u[3]);
2229 
2230   t[8] = _mm_add_epi16(s[8], s[11]);
2231   t[9] = _mm_add_epi16(s[9], s[10]);
2232   t[10] = _mm_sub_epi16(s[9], s[10]);
2233   t[11] = _mm_sub_epi16(s[8], s[11]);
2234   t[12] = _mm_sub_epi16(s[15], s[12]);
2235   t[13] = _mm_sub_epi16(s[14], s[13]);
2236   t[14] = _mm_add_epi16(s[13], s[14]);
2237   t[15] = _mm_add_epi16(s[12], s[15]);
2238 
2239   // stage 6
2240   s[0] = _mm_add_epi16(t[0], t[7]);
2241   s[1] = _mm_add_epi16(t[1], t[6]);
2242   s[2] = _mm_add_epi16(t[2], t[5]);
2243   s[3] = _mm_add_epi16(t[3], t[4]);
2244   s[4] = _mm_sub_epi16(t[3], t[4]);
2245   s[5] = _mm_sub_epi16(t[2], t[5]);
2246   s[6] = _mm_sub_epi16(t[1], t[6]);
2247   s[7] = _mm_sub_epi16(t[0], t[7]);
2248   s[8] = t[8];
2249   s[9] = t[9];
2250 
2251   u[0] = _mm_unpacklo_epi16(t[10], t[13]);
2252   u[1] = _mm_unpackhi_epi16(t[10], t[13]);
2253   u[2] = _mm_unpacklo_epi16(t[11], t[12]);
2254   u[3] = _mm_unpackhi_epi16(t[11], t[12]);
2255 
2256   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2257   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2258   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2259   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2260   v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2261   v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2262   v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
2263   v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
2264 
2265   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2266   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2267   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2268   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2269   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2270   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2271   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2272   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2273 
2274   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2275   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2276   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2277   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2278   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2279   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2280   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2281   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2282 
2283   s[10] = _mm_packs_epi32(u[0], u[1]);
2284   s[13] = _mm_packs_epi32(u[2], u[3]);
2285   s[11] = _mm_packs_epi32(u[4], u[5]);
2286   s[12] = _mm_packs_epi32(u[6], u[7]);
2287   s[14] = t[14];
2288   s[15] = t[15];
2289 
2290   // stage 7
2291   in[0] = _mm_add_epi16(s[0], s[15]);
2292   in[1] = _mm_add_epi16(s[1], s[14]);
2293   in[2] = _mm_add_epi16(s[2], s[13]);
2294   in[3] = _mm_add_epi16(s[3], s[12]);
2295   in[4] = _mm_add_epi16(s[4], s[11]);
2296   in[5] = _mm_add_epi16(s[5], s[10]);
2297   in[6] = _mm_add_epi16(s[6], s[9]);
2298   in[7] = _mm_add_epi16(s[7], s[8]);
2299   in[8] = _mm_sub_epi16(s[7], s[8]);
2300   in[9] = _mm_sub_epi16(s[6], s[9]);
2301   in[10] = _mm_sub_epi16(s[5], s[10]);
2302   in[11] = _mm_sub_epi16(s[4], s[11]);
2303   in[12] = _mm_sub_epi16(s[3], s[12]);
2304   in[13] = _mm_sub_epi16(s[2], s[13]);
2305   in[14] = _mm_sub_epi16(s[1], s[14]);
2306   in[15] = _mm_sub_epi16(s[0], s[15]);
2307 }
2308 
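// Each of the two 1-D helpers below transposes the 16x16 block first, so a
// single call transforms what were previously the rows; calling a helper
// twice therefore performs the full 2-D transform (rows, then columns).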
2309 static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
2310   array_transpose_16x16(in0, in1);
2311   idct16_1d_8col(in0);
2312   idct16_1d_8col(in1);
2313 }
2314 
2315 static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
2316   array_transpose_16x16(in0, in1);
2317   iadst16_1d_8col(in0);
2318   iadst16_1d_8col(in1);
2319 }
2320 
2321 static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
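  // Load an 8-column by 16-row half of the 16x16 coefficient block; the input
  // is stored in row-major order with a stride of 16 coefficients.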
2322   in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
2323   in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
2324   in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
2325   in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
2326   in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
2327   in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
2328   in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
2329   in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));
2330 
2331   in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
2332   in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
2333   in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
2334   in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
2335   in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
2336   in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
2337   in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
2338   in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
2339 }
2340 
2341 static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
2342   const __m128i final_rounding = _mm_set1_epi16(1<<5);
2343   const __m128i zero = _mm_setzero_si128();
2344   // Final rounding and shift
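  // Adding 1 << 5 and then arithmetic-shifting right by 6 implements
  // ROUND_POWER_OF_TWO(x, 6), the final descaling step of the inverse
  // transform.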
2345   in[0] = _mm_adds_epi16(in[0], final_rounding);
2346   in[1] = _mm_adds_epi16(in[1], final_rounding);
2347   in[2] = _mm_adds_epi16(in[2], final_rounding);
2348   in[3] = _mm_adds_epi16(in[3], final_rounding);
2349   in[4] = _mm_adds_epi16(in[4], final_rounding);
2350   in[5] = _mm_adds_epi16(in[5], final_rounding);
2351   in[6] = _mm_adds_epi16(in[6], final_rounding);
2352   in[7] = _mm_adds_epi16(in[7], final_rounding);
2353   in[8] = _mm_adds_epi16(in[8], final_rounding);
2354   in[9] = _mm_adds_epi16(in[9], final_rounding);
2355   in[10] = _mm_adds_epi16(in[10], final_rounding);
2356   in[11] = _mm_adds_epi16(in[11], final_rounding);
2357   in[12] = _mm_adds_epi16(in[12], final_rounding);
2358   in[13] = _mm_adds_epi16(in[13], final_rounding);
2359   in[14] = _mm_adds_epi16(in[14], final_rounding);
2360   in[15] = _mm_adds_epi16(in[15], final_rounding);
2361 
2362   in[0] = _mm_srai_epi16(in[0], 6);
2363   in[1] = _mm_srai_epi16(in[1], 6);
2364   in[2] = _mm_srai_epi16(in[2], 6);
2365   in[3] = _mm_srai_epi16(in[3], 6);
2366   in[4] = _mm_srai_epi16(in[4], 6);
2367   in[5] = _mm_srai_epi16(in[5], 6);
2368   in[6] = _mm_srai_epi16(in[6], 6);
2369   in[7] = _mm_srai_epi16(in[7], 6);
2370   in[8] = _mm_srai_epi16(in[8], 6);
2371   in[9] = _mm_srai_epi16(in[9], 6);
2372   in[10] = _mm_srai_epi16(in[10], 6);
2373   in[11] = _mm_srai_epi16(in[11], 6);
2374   in[12] = _mm_srai_epi16(in[12], 6);
2375   in[13] = _mm_srai_epi16(in[13], 6);
2376   in[14] = _mm_srai_epi16(in[14], 6);
2377   in[15] = _mm_srai_epi16(in[15], 6);
2378 
2379   RECON_AND_STORE(dest, in[0]);
2380   RECON_AND_STORE(dest, in[1]);
2381   RECON_AND_STORE(dest, in[2]);
2382   RECON_AND_STORE(dest, in[3]);
2383   RECON_AND_STORE(dest, in[4]);
2384   RECON_AND_STORE(dest, in[5]);
2385   RECON_AND_STORE(dest, in[6]);
2386   RECON_AND_STORE(dest, in[7]);
2387   RECON_AND_STORE(dest, in[8]);
2388   RECON_AND_STORE(dest, in[9]);
2389   RECON_AND_STORE(dest, in[10]);
2390   RECON_AND_STORE(dest, in[11]);
2391   RECON_AND_STORE(dest, in[12]);
2392   RECON_AND_STORE(dest, in[13]);
2393   RECON_AND_STORE(dest, in[14]);
2394   RECON_AND_STORE(dest, in[15]);
2395 }
2396 
2397 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
2398                                int tx_type) {
2399   __m128i in0[16], in1[16];
2400 
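  // tx_type selects which 1-D transform (idct or iadst) is applied in each of
  // the two passes; every *_1d_sse2 helper transposes the block before
  // transforming, so the two calls together form the 2-D transform.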
2401   load_buffer_8x16(input, in0);
2402   input += 8;
2403   load_buffer_8x16(input, in1);
2404 
2405   switch (tx_type) {
2406     case 0:  // DCT_DCT
2407       idct16_1d_sse2(in0, in1);
2408       idct16_1d_sse2(in0, in1);
2409       break;
2410     case 1:  // ADST_DCT
2411       idct16_1d_sse2(in0, in1);
2412       iadst16_1d_sse2(in0, in1);
2413       break;
2414     case 2:  // DCT_ADST
2415       iadst16_1d_sse2(in0, in1);
2416       idct16_1d_sse2(in0, in1);
2417       break;
2418     case 3:  // ADST_ADST
2419       iadst16_1d_sse2(in0, in1);
2420       iadst16_1d_sse2(in0, in1);
2421       break;
2422     default:
2423       assert(0);
2424       break;
2425   }
2426 
2427   write_buffer_8x16(dest, in0, stride);
2428   dest += 8;
2429   write_buffer_8x16(dest, in1, stride);
2430 }
2431 
2432 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
2433                                int stride) {
2434   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2435   const __m128i final_rounding = _mm_set1_epi16(1<<5);
2436   const __m128i zero = _mm_setzero_si128();
2437 
2438   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2439   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2440   const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
2441   const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
2442   const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
2443   const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
2444   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2445   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2446 
2447   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2448   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2449   const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
2450   const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
2451 
2452   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2453   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2454   const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
2455   const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
2456   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2457   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2458   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2459   const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2460 
2461   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2462 
2463   __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
2464           in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
2465           in10 = zero, in11 = zero, in12 = zero, in13 = zero,
2466           in14 = zero, in15 = zero;
2467   __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
2468           l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
2469           l12 = zero, l13 = zero, l14 = zero, l15 = zero;
2470 
2471   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
2472           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2473           stp1_8_0, stp1_12_0;
2474   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2475           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
2476   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2477   int i;
2478   // 1-D idct. Load input data.
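  // Reduced-coefficient path: only the first four rows of the 16x16 block are
  // loaded; all remaining coefficients are taken to be zero.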
2479   in0 = _mm_load_si128((const __m128i *)input);
2480   in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
2481   in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
2482   in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
2483   in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
2484   in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
2485   in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
2486   in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
2487 
2488   TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
2489   TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
2490 
2491   // Stage2
2492   {
2493     const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
2494     const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
2495     const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
2496     const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);
2497 
2498     tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2499     tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2500     tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
2501     tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
2502     tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
2503     tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
2504     tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2505     tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2506 
2507     tmp0 = _mm_add_epi32(tmp0, rounding);
2508     tmp2 = _mm_add_epi32(tmp2, rounding);
2509     tmp4 = _mm_add_epi32(tmp4, rounding);
2510     tmp6 = _mm_add_epi32(tmp6, rounding);
2511     tmp1 = _mm_add_epi32(tmp1, rounding);
2512     tmp3 = _mm_add_epi32(tmp3, rounding);
2513     tmp5 = _mm_add_epi32(tmp5, rounding);
2514     tmp7 = _mm_add_epi32(tmp7, rounding);
2515 
2516     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2517     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2518     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2519     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2520     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2521     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2522     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2523     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2524 
2525     stp2_8 = _mm_packs_epi32(tmp0, zero);
2526     stp2_15 = _mm_packs_epi32(tmp2, zero);
2527     stp2_9 = _mm_packs_epi32(tmp4, zero);
2528     stp2_14 = _mm_packs_epi32(tmp6, zero);
2529 
2530     stp2_10 = _mm_packs_epi32(tmp1, zero);
2531     stp2_13 = _mm_packs_epi32(tmp3, zero);
2532     stp2_11 = _mm_packs_epi32(tmp5, zero);
2533     stp2_12 = _mm_packs_epi32(tmp7, zero);
2534   }
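  // The rotation steps in each stage follow the same fixed-point pattern:
  // interleave two 16-bit inputs, multiply-accumulate them against a pair of
  // cosine constants with _mm_madd_epi16, then round and shift back down.
  // In scalar form (using the DCT_CONST_* values the code already relies on):
  //   out = (a * c0 + b * c1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
  // The _mm_packs_epi32 calls then saturate the results back to 16 bits.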
2535 
2536   // Stage3
2537   {
2538     const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
2539     const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);
2540 
2541     tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2542     tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2543     tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
2544     tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);
2545 
2546     tmp0 = _mm_add_epi32(tmp0, rounding);
2547     tmp2 = _mm_add_epi32(tmp2, rounding);
2548     tmp4 = _mm_add_epi32(tmp4, rounding);
2549     tmp6 = _mm_add_epi32(tmp6, rounding);
2550 
2551     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2552     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2553     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2554     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2555 
2556     stp1_4 = _mm_packs_epi32(tmp0, zero);
2557     stp1_7 = _mm_packs_epi32(tmp2, zero);
2558     stp1_5 = _mm_packs_epi32(tmp4, zero);
2559     stp1_6 = _mm_packs_epi32(tmp6, zero);
2560 
2561     stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
2562     stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
2563     stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
2564     stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
2565 
2566     stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
2567     stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
2568     stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
2569     stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
2570   }
2571 
2572   // Stage4
2573   {
2574     const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
2575     const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
2576     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
2577     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2578 
2579     tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2580     tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2581     tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
2582     tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
2583     tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2584     tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2585     tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2586     tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2587 
2588     tmp0 = _mm_add_epi32(tmp0, rounding);
2589     tmp2 = _mm_add_epi32(tmp2, rounding);
2590     tmp4 = _mm_add_epi32(tmp4, rounding);
2591     tmp6 = _mm_add_epi32(tmp6, rounding);
2592     tmp1 = _mm_add_epi32(tmp1, rounding);
2593     tmp3 = _mm_add_epi32(tmp3, rounding);
2594     tmp5 = _mm_add_epi32(tmp5, rounding);
2595     tmp7 = _mm_add_epi32(tmp7, rounding);
2596 
2597     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2598     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2599     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2600     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2601     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2602     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2603     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2604     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2605 
2606     stp2_0 = _mm_packs_epi32(tmp0, zero);
2607     stp2_1 = _mm_packs_epi32(tmp2, zero);
2608     stp2_2 = _mm_packs_epi32(tmp4, zero);
2609     stp2_3 = _mm_packs_epi32(tmp6, zero);
2610     stp2_9 = _mm_packs_epi32(tmp1, zero);
2611     stp2_14 = _mm_packs_epi32(tmp3, zero);
2612     stp2_10 = _mm_packs_epi32(tmp5, zero);
2613     stp2_13 = _mm_packs_epi32(tmp7, zero);
2614 
2615     stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
2616     stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
2617     stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
2618     stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
2619   }
2620 
2621   // Stage5 and Stage6
2622   {
2623     stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
2624     stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
2625     stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
2626     stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
2627 
2628     stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
2629     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
2630     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
2631     stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
2632 
2633     stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
2634     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
2635     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
2636     stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
2637   }
2638 
2639   // Stage6
2640   {
2641     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
2642     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2643     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2644 
2645     tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2646     tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2647     tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2648     tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2649     tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2650     tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2651 
2652     tmp1 = _mm_add_epi32(tmp1, rounding);
2653     tmp3 = _mm_add_epi32(tmp3, rounding);
2654     tmp0 = _mm_add_epi32(tmp0, rounding);
2655     tmp2 = _mm_add_epi32(tmp2, rounding);
2656     tmp4 = _mm_add_epi32(tmp4, rounding);
2657     tmp6 = _mm_add_epi32(tmp6, rounding);
2658 
2659     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2660     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2661     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2662     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2663     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2664     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2665 
2666     stp1_5 = _mm_packs_epi32(tmp1, zero);
2667     stp1_6 = _mm_packs_epi32(tmp3, zero);
2668     stp2_10 = _mm_packs_epi32(tmp0, zero);
2669     stp2_13 = _mm_packs_epi32(tmp2, zero);
2670     stp2_11 = _mm_packs_epi32(tmp4, zero);
2671     stp2_12 = _mm_packs_epi32(tmp6, zero);
2672 
2673     stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
2674     stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
2675     stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
2676     stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
2677     stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
2678     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
2679     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
2680     stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
2681   }
2682 
2683   // Stage7. Only the left 8x16 half is computed here.
2684   l0 = _mm_add_epi16(stp2_0, stp1_15);
2685   l1 = _mm_add_epi16(stp2_1, stp1_14);
2686   l2 = _mm_add_epi16(stp2_2, stp2_13);
2687   l3 = _mm_add_epi16(stp2_3, stp2_12);
2688   l4 = _mm_add_epi16(stp2_4, stp2_11);
2689   l5 = _mm_add_epi16(stp2_5, stp2_10);
2690   l6 = _mm_add_epi16(stp2_6, stp1_9);
2691   l7 = _mm_add_epi16(stp2_7, stp1_8);
2692   l8 = _mm_sub_epi16(stp2_7, stp1_8);
2693   l9 = _mm_sub_epi16(stp2_6, stp1_9);
2694   l10 = _mm_sub_epi16(stp2_5, stp2_10);
2695   l11 = _mm_sub_epi16(stp2_4, stp2_11);
2696   l12 = _mm_sub_epi16(stp2_3, stp2_12);
2697   l13 = _mm_sub_epi16(stp2_2, stp2_13);
2698   l14 = _mm_sub_epi16(stp2_1, stp1_14);
2699   l15 = _mm_sub_epi16(stp2_0, stp1_15);
2700 
2701   // 2-D idct: process the two 8x16 blocks.
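  // Each iteration transposes one half of the intermediate result (l0..l7,
  // then l8..l15) into in0..in7, zeroes in8..in15, runs the 16-point 1-D idct
  // via IDCT16_1D (assumed to be defined earlier in this file), applies the
  // stage-7 butterflies, and then rounds, shifts and reconstructs one 8x16
  // block of dest.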
2702   for (i = 0; i < 2; i++) {
2703     if (i == 0)
2704       TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
2705                     in5, in6, in7);
2706 
2707     if (i == 1)
2708       TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
2709                     in4, in5, in6, in7);
2710 
2711     in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
2712 
2713     IDCT16_1D
2714 
2715     // Stage7
2716     in0 = _mm_add_epi16(stp2_0, stp1_15);
2717     in1 = _mm_add_epi16(stp2_1, stp1_14);
2718     in2 = _mm_add_epi16(stp2_2, stp2_13);
2719     in3 = _mm_add_epi16(stp2_3, stp2_12);
2720     in4 = _mm_add_epi16(stp2_4, stp2_11);
2721     in5 = _mm_add_epi16(stp2_5, stp2_10);
2722     in6 = _mm_add_epi16(stp2_6, stp1_9);
2723     in7 = _mm_add_epi16(stp2_7, stp1_8);
2724     in8 = _mm_sub_epi16(stp2_7, stp1_8);
2725     in9 = _mm_sub_epi16(stp2_6, stp1_9);
2726     in10 = _mm_sub_epi16(stp2_5, stp2_10);
2727     in11 = _mm_sub_epi16(stp2_4, stp2_11);
2728     in12 = _mm_sub_epi16(stp2_3, stp2_12);
2729     in13 = _mm_sub_epi16(stp2_2, stp2_13);
2730     in14 = _mm_sub_epi16(stp2_1, stp1_14);
2731     in15 = _mm_sub_epi16(stp2_0, stp1_15);
2732 
2733     // Final rounding and shift
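    // Scalar equivalent of the output scaling below (assuming final_rounding
    // is 1 << 5 here, as it is in the 32x32 functions later in this file):
    //   out = (x + 32) >> 6;  // with a saturating 16-bit add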
2734     in0 = _mm_adds_epi16(in0, final_rounding);
2735     in1 = _mm_adds_epi16(in1, final_rounding);
2736     in2 = _mm_adds_epi16(in2, final_rounding);
2737     in3 = _mm_adds_epi16(in3, final_rounding);
2738     in4 = _mm_adds_epi16(in4, final_rounding);
2739     in5 = _mm_adds_epi16(in5, final_rounding);
2740     in6 = _mm_adds_epi16(in6, final_rounding);
2741     in7 = _mm_adds_epi16(in7, final_rounding);
2742     in8 = _mm_adds_epi16(in8, final_rounding);
2743     in9 = _mm_adds_epi16(in9, final_rounding);
2744     in10 = _mm_adds_epi16(in10, final_rounding);
2745     in11 = _mm_adds_epi16(in11, final_rounding);
2746     in12 = _mm_adds_epi16(in12, final_rounding);
2747     in13 = _mm_adds_epi16(in13, final_rounding);
2748     in14 = _mm_adds_epi16(in14, final_rounding);
2749     in15 = _mm_adds_epi16(in15, final_rounding);
2750 
2751     in0 = _mm_srai_epi16(in0, 6);
2752     in1 = _mm_srai_epi16(in1, 6);
2753     in2 = _mm_srai_epi16(in2, 6);
2754     in3 = _mm_srai_epi16(in3, 6);
2755     in4 = _mm_srai_epi16(in4, 6);
2756     in5 = _mm_srai_epi16(in5, 6);
2757     in6 = _mm_srai_epi16(in6, 6);
2758     in7 = _mm_srai_epi16(in7, 6);
2759     in8 = _mm_srai_epi16(in8, 6);
2760     in9 = _mm_srai_epi16(in9, 6);
2761     in10 = _mm_srai_epi16(in10, 6);
2762     in11 = _mm_srai_epi16(in11, 6);
2763     in12 = _mm_srai_epi16(in12, 6);
2764     in13 = _mm_srai_epi16(in13, 6);
2765     in14 = _mm_srai_epi16(in14, 6);
2766     in15 = _mm_srai_epi16(in15, 6);
2767 
2768     RECON_AND_STORE(dest, in0);
2769     RECON_AND_STORE(dest, in1);
2770     RECON_AND_STORE(dest, in2);
2771     RECON_AND_STORE(dest, in3);
2772     RECON_AND_STORE(dest, in4);
2773     RECON_AND_STORE(dest, in5);
2774     RECON_AND_STORE(dest, in6);
2775     RECON_AND_STORE(dest, in7);
2776     RECON_AND_STORE(dest, in8);
2777     RECON_AND_STORE(dest, in9);
2778     RECON_AND_STORE(dest, in10);
2779     RECON_AND_STORE(dest, in11);
2780     RECON_AND_STORE(dest, in12);
2781     RECON_AND_STORE(dest, in13);
2782     RECON_AND_STORE(dest, in14);
2783     RECON_AND_STORE(dest, in15);
2784 
2785     dest += 8 - (stride * 16);
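    // Assuming each RECON_AND_STORE advances dest by stride, the 16 stores
    // above moved dest down by stride * 16; step back up to the top row and
    // move 8 pixels to the right for the second 8x16 half.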
2786   }
2787 }
2788 
2789 #define LOAD_DQCOEFF(reg, input) \
2790   {  \
2791     reg = _mm_load_si128((const __m128i *) input); \
2792     input += 8; \
2793   }  \
2794 
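// LOAD_DQCOEFF reads eight 16-bit dequantized coefficients into reg and
// advances the input pointer past them.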
2795 #define IDCT32_1D \
2796 /* Stage1 */ \
2797 { \
2798   const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
2799   const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
2800   const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
2801   const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
2802   \
2803   const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
2804   const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
2805   const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7); \
2806   const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
2807   \
2808   const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
2809   const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
2810   const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
2811   const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
2812   \
2813   const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
2814   const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
2815   const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
2816   const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
2817   \
2818   MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
2819                          stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
2820                          stp1_17, stp1_30) \
2821   MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
2822                          stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
2823                          stp1_19, stp1_28) \
2824   MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
2825                          stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
2826                          stp1_21, stp1_26) \
2827   MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
2828                          stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
2829                          stp1_23, stp1_24) \
2830 } \
2831 \
2832 /* Stage2 */ \
2833 { \
2834   const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
2835   const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
2836   const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
2837   const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
2838   \
2839   const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
2840   const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
2841   const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
2842   const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
2843   \
2844   MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
2845                          stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
2846                          stp2_14) \
2847   MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
2848                          stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
2849                          stp2_11, stp2_12) \
2850   \
2851   stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
2852   stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
2853   stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
2854   stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
2855   \
2856   stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
2857   stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
2858   stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
2859   stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
2860   \
2861   stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
2862   stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
2863   stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
2864   stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
2865   \
2866   stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
2867   stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
2868   stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
2869   stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
2870 } \
2871 \
2872 /* Stage3 */ \
2873 { \
2874   const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
2875   const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
2876   const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
2877   const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
2878   \
2879   const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
2880   const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
2881   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2882   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2883   \
2884   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2885   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2886   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2887   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2888   \
2889   MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
2890                          stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
2891                          stp1_6) \
2892   \
2893   stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
2894   stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
2895   stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
2896   stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
2897   stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
2898   stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
2899   stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
2900   stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
2901   \
2902   MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2903                          stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2904                          stp1_18, stp1_29) \
2905   MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2906                          stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2907                          stp1_22, stp1_25) \
2908   \
2909   stp1_16 = stp2_16; \
2910   stp1_31 = stp2_31; \
2911   stp1_19 = stp2_19; \
2912   stp1_20 = stp2_20; \
2913   stp1_23 = stp2_23; \
2914   stp1_24 = stp2_24; \
2915   stp1_27 = stp2_27; \
2916   stp1_28 = stp2_28; \
2917 } \
2918 \
2919 /* Stage4 */ \
2920 { \
2921   const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
2922   const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
2923   const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
2924   const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
2925   \
2926   const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
2927   const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
2928   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2929   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2930   \
2931   MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
2932                          stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
2933                          stp2_2, stp2_3) \
2934   \
2935   stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
2936   stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
2937   stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
2938   stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
2939   \
2940   MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2941                          stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2942                          stp2_10, stp2_13) \
2943   \
2944   stp2_8 = stp1_8; \
2945   stp2_15 = stp1_15; \
2946   stp2_11 = stp1_11; \
2947   stp2_12 = stp1_12; \
2948   \
2949   stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2950   stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2951   stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2952   stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2953   stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2954   stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2955   stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2956   stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2957   \
2958   stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2959   stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2960   stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2961   stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2962   stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2963   stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2964   stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2965   stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2966 } \
2967 \
2968 /* Stage5 */ \
2969 { \
2970   const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2971   const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2972   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2973   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2974   \
2975   const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2976   const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2977   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2978   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2979   \
2980   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2981   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2982   \
2983   stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
2984   stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
2985   stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
2986   stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
2987   \
2988   tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2989   tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2990   tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2991   tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2992   \
2993   tmp0 = _mm_add_epi32(tmp0, rounding); \
2994   tmp1 = _mm_add_epi32(tmp1, rounding); \
2995   tmp2 = _mm_add_epi32(tmp2, rounding); \
2996   tmp3 = _mm_add_epi32(tmp3, rounding); \
2997   \
2998   tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2999   tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
3000   tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
3001   tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
3002   \
3003   stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
3004   stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
3005   \
3006   stp1_4 = stp2_4; \
3007   stp1_7 = stp2_7; \
3008   \
3009   stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
3010   stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
3011   stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
3012   stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
3013   stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
3014   stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
3015   stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
3016   stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
3017   \
3018   stp1_16 = stp2_16; \
3019   stp1_17 = stp2_17; \
3020   \
3021   MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
3022                          stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
3023                          stp1_19, stp1_28) \
3024   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
3025                          stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
3026                          stp1_21, stp1_26) \
3027   \
3028   stp1_22 = stp2_22; \
3029   stp1_23 = stp2_23; \
3030   stp1_24 = stp2_24; \
3031   stp1_25 = stp2_25; \
3032   stp1_30 = stp2_30; \
3033   stp1_31 = stp2_31; \
3034 } \
3035 \
3036 /* Stage6 */ \
3037 { \
3038   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
3039   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
3040   const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
3041   const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
3042   \
3043   stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
3044   stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
3045   stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
3046   stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
3047   stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
3048   stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
3049   stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
3050   stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
3051   \
3052   stp2_8 = stp1_8; \
3053   stp2_9 = stp1_9; \
3054   stp2_14 = stp1_14; \
3055   stp2_15 = stp1_15; \
3056   \
3057   MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
3058                          stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
3059                          stp2_13, stp2_11, stp2_12) \
3060   \
3061   stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
3062   stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
3063   stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
3064   stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
3065   stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
3066   stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
3067   stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
3068   stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
3069   \
3070   stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
3071   stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
3072   stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
3073   stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
3074   stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
3075   stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
3076   stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
3077   stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
3078 } \
3079 \
3080 /* Stage7 */ \
3081 { \
3082   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
3083   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
3084   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
3085   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
3086   \
3087   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
3088   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
3089   const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
3090   const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
3091   \
3092   stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
3093   stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
3094   stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
3095   stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
3096   stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
3097   stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
3098   stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
3099   stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
3100   stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
3101   stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
3102   stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
3103   stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3104   stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3105   stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3106   stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3107   stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3108   \
3109   stp1_16 = stp2_16; \
3110   stp1_17 = stp2_17; \
3111   stp1_18 = stp2_18; \
3112   stp1_19 = stp2_19; \
3113   \
3114   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3115                          stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3116                          stp1_21, stp1_26) \
3117   MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3118                          stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3119                          stp1_23, stp1_24) \
3120   \
3121   stp1_28 = stp2_28; \
3122   stp1_29 = stp2_29; \
3123   stp1_30 = stp2_30; \
3124   stp1_31 = stp2_31; \
3125 }
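// IDCT32_1D expands to the seven butterfly stages of a 32-point 1-D idct,
// operating on eight columns at a time (in0..in31 hold the 32 inputs, eight
// lanes per vector). stp1_* and stp2_* act as ping-pong buffers between
// stages, and MULTIPLICATION_AND_ADD (assumed to be defined earlier in this
// file) performs the same madd/round/shift/pack rotation pattern that is
// written out explicitly in the 16x16 code above.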
3126 
3127 // Only the upper-left 8x8 block has non-zero coefficients.
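// Strategy (assuming this is the eob <= 34 path): since every non-zero
// coefficient lies in the upper-left 8x8, only the first 8 rows need a real
// first 1-D pass; the remaining 24 rows of the intermediate buffer are simply
// zeroed.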
3128 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
3129                                  int stride) {
3130   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3131   const __m128i final_rounding = _mm_set1_epi16(1<<5);
3132 
3133   // idct constants for each stage
3134   const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3135   const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3136   const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3137   const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3138   const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3139   const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3140   const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3141   const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3142   const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3143   const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3144   const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3145   const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3146   const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3147   const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3148   const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3149   const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3150 
3151   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3152   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3153   const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3154   const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3155   const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3156   const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3157   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3158   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3159 
3160   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3161   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3162   const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3163   const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3164   const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3165   const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3166   const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3167   const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3168   const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3169   const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3170 
3171   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3172   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3173   const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3174   const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3175   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3176   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3177   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3178 
3179   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3180 
3181   __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
3182           in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
3183           in24, in25, in26, in27, in28, in29, in30, in31;
3184   __m128i col[128];
3185   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3186           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3187           stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3188           stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3189           stp1_30, stp1_31;
3190   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3191           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3192           stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3193           stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3194           stp2_30, stp2_31;
3195   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3196   int i, j, i32;
3197 
3198   // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct.
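  // col[] holds the output of the row pass as four 8x32 blocks (32 __m128i
  // each). Iterations 0..3 fill it; iterations 4..7 transpose it back and run
  // the column pass plus reconstruction.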
3199   for (i = 0; i < 8; i++) {
3200     i32 = (i << 5);
3201     if (i == 0) {
3202       // First 1-D idct: first 8 rows
3203       // Load input data.
3204       LOAD_DQCOEFF(in0, input);
3205       LOAD_DQCOEFF(in8, input);
3206       LOAD_DQCOEFF(in16, input);
3207       LOAD_DQCOEFF(in24, input);
3208       LOAD_DQCOEFF(in1, input);
3209       LOAD_DQCOEFF(in9, input);
3210       LOAD_DQCOEFF(in17, input);
3211       LOAD_DQCOEFF(in25, input);
3212       LOAD_DQCOEFF(in2, input);
3213       LOAD_DQCOEFF(in10, input);
3214       LOAD_DQCOEFF(in18, input);
3215       LOAD_DQCOEFF(in26, input);
3216       LOAD_DQCOEFF(in3, input);
3217       LOAD_DQCOEFF(in11, input);
3218       LOAD_DQCOEFF(in19, input);
3219       LOAD_DQCOEFF(in27, input);
3220 
3221       LOAD_DQCOEFF(in4, input);
3222       LOAD_DQCOEFF(in12, input);
3223       LOAD_DQCOEFF(in20, input);
3224       LOAD_DQCOEFF(in28, input);
3225       LOAD_DQCOEFF(in5, input);
3226       LOAD_DQCOEFF(in13, input);
3227       LOAD_DQCOEFF(in21, input);
3228       LOAD_DQCOEFF(in29, input);
3229       LOAD_DQCOEFF(in6, input);
3230       LOAD_DQCOEFF(in14, input);
3231       LOAD_DQCOEFF(in22, input);
3232       LOAD_DQCOEFF(in30, input);
3233       LOAD_DQCOEFF(in7, input);
3234       LOAD_DQCOEFF(in15, input);
3235       LOAD_DQCOEFF(in23, input);
3236       LOAD_DQCOEFF(in31, input);
3237 
3238       // Transpose 32x8 block to 8x32 block
3239       TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
3240                     in4, in5, in6, in7);
3241       TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
3242                     in10, in11, in12, in13, in14, in15);
3243       TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
3244                     in18, in19, in20, in21, in22, in23);
3245       TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
3246                     in26, in27, in28, in29, in30, in31);
3247     } else if (i < 4) {
3248       // First 1-D idct: next 24 zero-coeff rows
3249       col[i32 + 0] = _mm_setzero_si128();
3250       col[i32 + 1] = _mm_setzero_si128();
3251       col[i32 + 2] = _mm_setzero_si128();
3252       col[i32 + 3] = _mm_setzero_si128();
3253       col[i32 + 4] = _mm_setzero_si128();
3254       col[i32 + 5] = _mm_setzero_si128();
3255       col[i32 + 6] = _mm_setzero_si128();
3256       col[i32 + 7] = _mm_setzero_si128();
3257       col[i32 + 8] = _mm_setzero_si128();
3258       col[i32 + 9] = _mm_setzero_si128();
3259       col[i32 + 10] = _mm_setzero_si128();
3260       col[i32 + 11] = _mm_setzero_si128();
3261       col[i32 + 12] = _mm_setzero_si128();
3262       col[i32 + 13] = _mm_setzero_si128();
3263       col[i32 + 14] = _mm_setzero_si128();
3264       col[i32 + 15] = _mm_setzero_si128();
3265       col[i32 + 16] = _mm_setzero_si128();
3266       col[i32 + 17] = _mm_setzero_si128();
3267       col[i32 + 18] = _mm_setzero_si128();
3268       col[i32 + 19] = _mm_setzero_si128();
3269       col[i32 + 20] = _mm_setzero_si128();
3270       col[i32 + 21] = _mm_setzero_si128();
3271       col[i32 + 22] = _mm_setzero_si128();
3272       col[i32 + 23] = _mm_setzero_si128();
3273       col[i32 + 24] = _mm_setzero_si128();
3274       col[i32 + 25] = _mm_setzero_si128();
3275       col[i32 + 26] = _mm_setzero_si128();
3276       col[i32 + 27] = _mm_setzero_si128();
3277       col[i32 + 28] = _mm_setzero_si128();
3278       col[i32 + 29] = _mm_setzero_si128();
3279       col[i32 + 30] = _mm_setzero_si128();
3280       col[i32 + 31] = _mm_setzero_si128();
3281       continue;
3282     } else {
3283       // Second 1-D idct
3284       j = i - 4;
3285 
3286       // Transpose 32x8 block to 8x32 block
3287       TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3288                     col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3289                     col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
3290                     in5, in6, in7);
3291       j += 4;
3292       TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3293                     col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3294                     col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
3295                     in11, in12, in13, in14, in15);
3296       j += 4;
3297       TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3298                     col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3299                     col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
3300                     in19, in20, in21, in22, in23);
3301       j += 4;
3302       TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3303                     col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3304                     col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
3305                     in28, in29, in30, in31);
3306     }
3307 
3308     IDCT32_1D
3309 
3310     // final stage
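    // Final butterfly stage of the 32-point idct: output k is
    // stp1_k + stp1_(31-k) for k = 0..15 and stp1_(31-k) - stp1_k for
    // k = 16..31. In the row pass (i < 4) the results stay in col[]; in the
    // column pass they are rounded, shifted and added to the prediction.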
3311     if (i < 4) {
3312       // 1-D: Store 32 intermediate results for each 8x32 block.
3313       col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3314       col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3315       col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3316       col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3317       col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3318       col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3319       col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3320       col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3321       col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3322       col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3323       col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3324       col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3325       col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3326       col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3327       col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3328       col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3329       col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3330       col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3331       col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3332       col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3333       col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3334       col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3335       col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3336       col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3337       col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3338       col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3339       col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3340       col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3341       col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3342       col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3343       col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3344       col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3345     } else {
3346       const __m128i zero = _mm_setzero_si128();
3347 
3348       // 2-D: Calculate the results and store them to the destination.
3349       in0 = _mm_add_epi16(stp1_0, stp1_31);
3350       in1 = _mm_add_epi16(stp1_1, stp1_30);
3351       in2 = _mm_add_epi16(stp1_2, stp1_29);
3352       in3 = _mm_add_epi16(stp1_3, stp1_28);
3353       in4 = _mm_add_epi16(stp1_4, stp1_27);
3354       in5 = _mm_add_epi16(stp1_5, stp1_26);
3355       in6 = _mm_add_epi16(stp1_6, stp1_25);
3356       in7 = _mm_add_epi16(stp1_7, stp1_24);
3357       in8 = _mm_add_epi16(stp1_8, stp1_23);
3358       in9 = _mm_add_epi16(stp1_9, stp1_22);
3359       in10 = _mm_add_epi16(stp1_10, stp1_21);
3360       in11 = _mm_add_epi16(stp1_11, stp1_20);
3361       in12 = _mm_add_epi16(stp1_12, stp1_19);
3362       in13 = _mm_add_epi16(stp1_13, stp1_18);
3363       in14 = _mm_add_epi16(stp1_14, stp1_17);
3364       in15 = _mm_add_epi16(stp1_15, stp1_16);
3365       in16 = _mm_sub_epi16(stp1_15, stp1_16);
3366       in17 = _mm_sub_epi16(stp1_14, stp1_17);
3367       in18 = _mm_sub_epi16(stp1_13, stp1_18);
3368       in19 = _mm_sub_epi16(stp1_12, stp1_19);
3369       in20 = _mm_sub_epi16(stp1_11, stp1_20);
3370       in21 = _mm_sub_epi16(stp1_10, stp1_21);
3371       in22 = _mm_sub_epi16(stp1_9, stp1_22);
3372       in23 = _mm_sub_epi16(stp1_8, stp1_23);
3373       in24 = _mm_sub_epi16(stp1_7, stp1_24);
3374       in25 = _mm_sub_epi16(stp1_6, stp1_25);
3375       in26 = _mm_sub_epi16(stp1_5, stp1_26);
3376       in27 = _mm_sub_epi16(stp1_4, stp1_27);
3377       in28 = _mm_sub_epi16(stp1_3, stp1_28);
3378       in29 = _mm_sub_epi16(stp1_2, stp1_29);
3379       in30 = _mm_sub_epi16(stp1_1, stp1_30);
3380       in31 = _mm_sub_epi16(stp1_0, stp1_31);
3381 
3382       // Final rounding and shift
3383       in0 = _mm_adds_epi16(in0, final_rounding);
3384       in1 = _mm_adds_epi16(in1, final_rounding);
3385       in2 = _mm_adds_epi16(in2, final_rounding);
3386       in3 = _mm_adds_epi16(in3, final_rounding);
3387       in4 = _mm_adds_epi16(in4, final_rounding);
3388       in5 = _mm_adds_epi16(in5, final_rounding);
3389       in6 = _mm_adds_epi16(in6, final_rounding);
3390       in7 = _mm_adds_epi16(in7, final_rounding);
3391       in8 = _mm_adds_epi16(in8, final_rounding);
3392       in9 = _mm_adds_epi16(in9, final_rounding);
3393       in10 = _mm_adds_epi16(in10, final_rounding);
3394       in11 = _mm_adds_epi16(in11, final_rounding);
3395       in12 = _mm_adds_epi16(in12, final_rounding);
3396       in13 = _mm_adds_epi16(in13, final_rounding);
3397       in14 = _mm_adds_epi16(in14, final_rounding);
3398       in15 = _mm_adds_epi16(in15, final_rounding);
3399       in16 = _mm_adds_epi16(in16, final_rounding);
3400       in17 = _mm_adds_epi16(in17, final_rounding);
3401       in18 = _mm_adds_epi16(in18, final_rounding);
3402       in19 = _mm_adds_epi16(in19, final_rounding);
3403       in20 = _mm_adds_epi16(in20, final_rounding);
3404       in21 = _mm_adds_epi16(in21, final_rounding);
3405       in22 = _mm_adds_epi16(in22, final_rounding);
3406       in23 = _mm_adds_epi16(in23, final_rounding);
3407       in24 = _mm_adds_epi16(in24, final_rounding);
3408       in25 = _mm_adds_epi16(in25, final_rounding);
3409       in26 = _mm_adds_epi16(in26, final_rounding);
3410       in27 = _mm_adds_epi16(in27, final_rounding);
3411       in28 = _mm_adds_epi16(in28, final_rounding);
3412       in29 = _mm_adds_epi16(in29, final_rounding);
3413       in30 = _mm_adds_epi16(in30, final_rounding);
3414       in31 = _mm_adds_epi16(in31, final_rounding);
3415 
3416       in0 = _mm_srai_epi16(in0, 6);
3417       in1 = _mm_srai_epi16(in1, 6);
3418       in2 = _mm_srai_epi16(in2, 6);
3419       in3 = _mm_srai_epi16(in3, 6);
3420       in4 = _mm_srai_epi16(in4, 6);
3421       in5 = _mm_srai_epi16(in5, 6);
3422       in6 = _mm_srai_epi16(in6, 6);
3423       in7 = _mm_srai_epi16(in7, 6);
3424       in8 = _mm_srai_epi16(in8, 6);
3425       in9 = _mm_srai_epi16(in9, 6);
3426       in10 = _mm_srai_epi16(in10, 6);
3427       in11 = _mm_srai_epi16(in11, 6);
3428       in12 = _mm_srai_epi16(in12, 6);
3429       in13 = _mm_srai_epi16(in13, 6);
3430       in14 = _mm_srai_epi16(in14, 6);
3431       in15 = _mm_srai_epi16(in15, 6);
3432       in16 = _mm_srai_epi16(in16, 6);
3433       in17 = _mm_srai_epi16(in17, 6);
3434       in18 = _mm_srai_epi16(in18, 6);
3435       in19 = _mm_srai_epi16(in19, 6);
3436       in20 = _mm_srai_epi16(in20, 6);
3437       in21 = _mm_srai_epi16(in21, 6);
3438       in22 = _mm_srai_epi16(in22, 6);
3439       in23 = _mm_srai_epi16(in23, 6);
3440       in24 = _mm_srai_epi16(in24, 6);
3441       in25 = _mm_srai_epi16(in25, 6);
3442       in26 = _mm_srai_epi16(in26, 6);
3443       in27 = _mm_srai_epi16(in27, 6);
3444       in28 = _mm_srai_epi16(in28, 6);
3445       in29 = _mm_srai_epi16(in29, 6);
3446       in30 = _mm_srai_epi16(in30, 6);
3447       in31 = _mm_srai_epi16(in31, 6);
3448 
3449       RECON_AND_STORE(dest, in0);
3450       RECON_AND_STORE(dest, in1);
3451       RECON_AND_STORE(dest, in2);
3452       RECON_AND_STORE(dest, in3);
3453       RECON_AND_STORE(dest, in4);
3454       RECON_AND_STORE(dest, in5);
3455       RECON_AND_STORE(dest, in6);
3456       RECON_AND_STORE(dest, in7);
3457       RECON_AND_STORE(dest, in8);
3458       RECON_AND_STORE(dest, in9);
3459       RECON_AND_STORE(dest, in10);
3460       RECON_AND_STORE(dest, in11);
3461       RECON_AND_STORE(dest, in12);
3462       RECON_AND_STORE(dest, in13);
3463       RECON_AND_STORE(dest, in14);
3464       RECON_AND_STORE(dest, in15);
3465       RECON_AND_STORE(dest, in16);
3466       RECON_AND_STORE(dest, in17);
3467       RECON_AND_STORE(dest, in18);
3468       RECON_AND_STORE(dest, in19);
3469       RECON_AND_STORE(dest, in20);
3470       RECON_AND_STORE(dest, in21);
3471       RECON_AND_STORE(dest, in22);
3472       RECON_AND_STORE(dest, in23);
3473       RECON_AND_STORE(dest, in24);
3474       RECON_AND_STORE(dest, in25);
3475       RECON_AND_STORE(dest, in26);
3476       RECON_AND_STORE(dest, in27);
3477       RECON_AND_STORE(dest, in28);
3478       RECON_AND_STORE(dest, in29);
3479       RECON_AND_STORE(dest, in30);
3480       RECON_AND_STORE(dest, in31);
3481 
3482       dest += 8 - (stride * 32);
3483     }
3484   }
3485 }
3486 
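// Full 32x32 inverse transform (up to 1024 non-zero coefficients). Same
// structure as the _34 variant above, except that all four row-pass
// iterations load real data and an all-zero 8x32 block is detected and
// skipped on the fly.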
3487 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
3488                                  int stride) {
3489   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3490   const __m128i final_rounding = _mm_set1_epi16(1<<5);
3491 
3492   // idct constants for each stage
3493   const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3494   const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3495   const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3496   const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3497   const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3498   const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3499   const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3500   const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3501   const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3502   const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3503   const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3504   const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3505   const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3506   const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3507   const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3508   const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3509 
3510   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3511   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3512   const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3513   const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3514   const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3515   const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3516   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3517   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3518 
3519   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3520   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3521   const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3522   const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3523   const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3524   const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3525   const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3526   const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3527   const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3528   const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3529 
3530   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3531   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3532   const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3533   const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3534   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3535   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3536   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3537 
3538   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3539 
3540   __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
3541           in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
3542           in24, in25, in26, in27, in28, in29, in30, in31;
3543   __m128i col[128];
3544   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3545           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3546           stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3547           stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3548           stp1_30, stp1_31;
3549   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3550           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3551           stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3552           stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3553           stp2_30, stp2_31;
3554   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3555   int i, j, i32;
3556   __m128i zero_idx[16];
3557   int zero_flag[2];
3558 
3559   // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct.
3560   for (i = 0; i < 8; i++) {
3561     i32 = (i << 5);
3562     if (i < 4) {
3563       // First 1-D idct
3564       // Load input data.
3565       LOAD_DQCOEFF(in0, input);
3566       LOAD_DQCOEFF(in8, input);
3567       LOAD_DQCOEFF(in16, input);
3568       LOAD_DQCOEFF(in24, input);
3569       LOAD_DQCOEFF(in1, input);
3570       LOAD_DQCOEFF(in9, input);
3571       LOAD_DQCOEFF(in17, input);
3572       LOAD_DQCOEFF(in25, input);
3573       LOAD_DQCOEFF(in2, input);
3574       LOAD_DQCOEFF(in10, input);
3575       LOAD_DQCOEFF(in18, input);
3576       LOAD_DQCOEFF(in26, input);
3577       LOAD_DQCOEFF(in3, input);
3578       LOAD_DQCOEFF(in11, input);
3579       LOAD_DQCOEFF(in19, input);
3580       LOAD_DQCOEFF(in27, input);
3581 
3582       LOAD_DQCOEFF(in4, input);
3583       LOAD_DQCOEFF(in12, input);
3584       LOAD_DQCOEFF(in20, input);
3585       LOAD_DQCOEFF(in28, input);
3586       LOAD_DQCOEFF(in5, input);
3587       LOAD_DQCOEFF(in13, input);
3588       LOAD_DQCOEFF(in21, input);
3589       LOAD_DQCOEFF(in29, input);
3590       LOAD_DQCOEFF(in6, input);
3591       LOAD_DQCOEFF(in14, input);
3592       LOAD_DQCOEFF(in22, input);
3593       LOAD_DQCOEFF(in30, input);
3594       LOAD_DQCOEFF(in7, input);
3595       LOAD_DQCOEFF(in15, input);
3596       LOAD_DQCOEFF(in23, input);
3597       LOAD_DQCOEFF(in31, input);
3598 
3599       // Check whether all entries in this 8x32 block are zero.
3600       zero_idx[0] = _mm_or_si128(in0, in1);
3601       zero_idx[1] = _mm_or_si128(in2, in3);
3602       zero_idx[2] = _mm_or_si128(in4, in5);
3603       zero_idx[3] = _mm_or_si128(in6, in7);
3604       zero_idx[4] = _mm_or_si128(in8, in9);
3605       zero_idx[5] = _mm_or_si128(in10, in11);
3606       zero_idx[6] = _mm_or_si128(in12, in13);
3607       zero_idx[7] = _mm_or_si128(in14, in15);
3608       zero_idx[8] = _mm_or_si128(in16, in17);
3609       zero_idx[9] = _mm_or_si128(in18, in19);
3610       zero_idx[10] = _mm_or_si128(in20, in21);
3611       zero_idx[11] = _mm_or_si128(in22, in23);
3612       zero_idx[12] = _mm_or_si128(in24, in25);
3613       zero_idx[13] = _mm_or_si128(in26, in27);
3614       zero_idx[14] = _mm_or_si128(in28, in29);
3615       zero_idx[15] = _mm_or_si128(in30, in31);
3616 
3617       zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3618       zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3619       zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3620       zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3621       zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3622       zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3623       zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3624       zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3625 
3626       zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3627       zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3628       zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3629       zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3630       zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3631       zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3632       zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3633 
3634       zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
3635       zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
3636       zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
3637       zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
3638       zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
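      // The pairwise ORs above fold all 32 input vectors into zero_idx[14];
      // its high 64 bits are then folded into the low 64, and the two 32-bit
      // halves are read out. If both are zero, the whole 8x32 block is zero
      // and its transform can be skipped.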
3639 
3640       if (!zero_flag[0] && !zero_flag[1]) {
3641         col[i32 + 0] = _mm_setzero_si128();
3642         col[i32 + 1] = _mm_setzero_si128();
3643         col[i32 + 2] = _mm_setzero_si128();
3644         col[i32 + 3] = _mm_setzero_si128();
3645         col[i32 + 4] = _mm_setzero_si128();
3646         col[i32 + 5] = _mm_setzero_si128();
3647         col[i32 + 6] = _mm_setzero_si128();
3648         col[i32 + 7] = _mm_setzero_si128();
3649         col[i32 + 8] = _mm_setzero_si128();
3650         col[i32 + 9] = _mm_setzero_si128();
3651         col[i32 + 10] = _mm_setzero_si128();
3652         col[i32 + 11] = _mm_setzero_si128();
3653         col[i32 + 12] = _mm_setzero_si128();
3654         col[i32 + 13] = _mm_setzero_si128();
3655         col[i32 + 14] = _mm_setzero_si128();
3656         col[i32 + 15] = _mm_setzero_si128();
3657         col[i32 + 16] = _mm_setzero_si128();
3658         col[i32 + 17] = _mm_setzero_si128();
3659         col[i32 + 18] = _mm_setzero_si128();
3660         col[i32 + 19] = _mm_setzero_si128();
3661         col[i32 + 20] = _mm_setzero_si128();
3662         col[i32 + 21] = _mm_setzero_si128();
3663         col[i32 + 22] = _mm_setzero_si128();
3664         col[i32 + 23] = _mm_setzero_si128();
3665         col[i32 + 24] = _mm_setzero_si128();
3666         col[i32 + 25] = _mm_setzero_si128();
3667         col[i32 + 26] = _mm_setzero_si128();
3668         col[i32 + 27] = _mm_setzero_si128();
3669         col[i32 + 28] = _mm_setzero_si128();
3670         col[i32 + 29] = _mm_setzero_si128();
3671         col[i32 + 30] = _mm_setzero_si128();
3672         col[i32 + 31] = _mm_setzero_si128();
3673         continue;
3674       }
3675 
3676       // Transpose 32x8 block to 8x32 block
3677       TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
3678                     in4, in5, in6, in7);
3679       TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
3680                     in10, in11, in12, in13, in14, in15);
3681       TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
3682                     in18, in19, in20, in21, in22, in23);
3683       TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
3684                     in26, in27, in28, in29, in30, in31);
3685     } else {
3686       // Second 1-D idct
3687       j = i - 4;
3688 
3689       // Transpose 32x8 block to 8x32 block
3690       TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3691                     col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3692                     col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
3693                     in5, in6, in7);
3694       j += 4;
3695       TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3696                     col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3697                     col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
3698                     in11, in12, in13, in14, in15);
3699       j += 4;
3700       TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3701                     col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3702                     col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
3703                     in19, in20, in21, in22, in23);
3704       j += 4;
3705       TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
3706                     col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
3707                     col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
3708                     in28, in29, in30, in31);
3709     }
3710 
3711     IDCT32_1D
3712 
3713     // final stage
3714     if (i < 4) {
3715       // 1-D: Store 32 intermediate results for each 8x32 block.
3716       col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3717       col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3718       col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3719       col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3720       col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3721       col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3722       col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3723       col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3724       col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3725       col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3726       col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3727       col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3728       col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3729       col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3730       col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3731       col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3732       col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3733       col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3734       col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3735       col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3736       col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3737       col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3738       col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3739       col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3740       col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3741       col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3742       col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3743       col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3744       col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3745       col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3746       col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3747       col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3748     } else {
3749       const __m128i zero = _mm_setzero_si128();
3750 
3751       // 2-D pass: calculate the final results and store them to the destination.
3752       in0 = _mm_add_epi16(stp1_0, stp1_31);
3753       in1 = _mm_add_epi16(stp1_1, stp1_30);
3754       in2 = _mm_add_epi16(stp1_2, stp1_29);
3755       in3 = _mm_add_epi16(stp1_3, stp1_28);
3756       in4 = _mm_add_epi16(stp1_4, stp1_27);
3757       in5 = _mm_add_epi16(stp1_5, stp1_26);
3758       in6 = _mm_add_epi16(stp1_6, stp1_25);
3759       in7 = _mm_add_epi16(stp1_7, stp1_24);
3760       in8 = _mm_add_epi16(stp1_8, stp1_23);
3761       in9 = _mm_add_epi16(stp1_9, stp1_22);
3762       in10 = _mm_add_epi16(stp1_10, stp1_21);
3763       in11 = _mm_add_epi16(stp1_11, stp1_20);
3764       in12 = _mm_add_epi16(stp1_12, stp1_19);
3765       in13 = _mm_add_epi16(stp1_13, stp1_18);
3766       in14 = _mm_add_epi16(stp1_14, stp1_17);
3767       in15 = _mm_add_epi16(stp1_15, stp1_16);
3768       in16 = _mm_sub_epi16(stp1_15, stp1_16);
3769       in17 = _mm_sub_epi16(stp1_14, stp1_17);
3770       in18 = _mm_sub_epi16(stp1_13, stp1_18);
3771       in19 = _mm_sub_epi16(stp1_12, stp1_19);
3772       in20 = _mm_sub_epi16(stp1_11, stp1_20);
3773       in21 = _mm_sub_epi16(stp1_10, stp1_21);
3774       in22 = _mm_sub_epi16(stp1_9, stp1_22);
3775       in23 = _mm_sub_epi16(stp1_8, stp1_23);
3776       in24 = _mm_sub_epi16(stp1_7, stp1_24);
3777       in25 = _mm_sub_epi16(stp1_6, stp1_25);
3778       in26 = _mm_sub_epi16(stp1_5, stp1_26);
3779       in27 = _mm_sub_epi16(stp1_4, stp1_27);
3780       in28 = _mm_sub_epi16(stp1_3, stp1_28);
3781       in29 = _mm_sub_epi16(stp1_2, stp1_29);
3782       in30 = _mm_sub_epi16(stp1_1, stp1_30);
3783       in31 = _mm_sub_epi16(stp1_0, stp1_31);
3784 
3785       // Final rounding and shift
3786       in0 = _mm_adds_epi16(in0, final_rounding);
3787       in1 = _mm_adds_epi16(in1, final_rounding);
3788       in2 = _mm_adds_epi16(in2, final_rounding);
3789       in3 = _mm_adds_epi16(in3, final_rounding);
3790       in4 = _mm_adds_epi16(in4, final_rounding);
3791       in5 = _mm_adds_epi16(in5, final_rounding);
3792       in6 = _mm_adds_epi16(in6, final_rounding);
3793       in7 = _mm_adds_epi16(in7, final_rounding);
3794       in8 = _mm_adds_epi16(in8, final_rounding);
3795       in9 = _mm_adds_epi16(in9, final_rounding);
3796       in10 = _mm_adds_epi16(in10, final_rounding);
3797       in11 = _mm_adds_epi16(in11, final_rounding);
3798       in12 = _mm_adds_epi16(in12, final_rounding);
3799       in13 = _mm_adds_epi16(in13, final_rounding);
3800       in14 = _mm_adds_epi16(in14, final_rounding);
3801       in15 = _mm_adds_epi16(in15, final_rounding);
3802       in16 = _mm_adds_epi16(in16, final_rounding);
3803       in17 = _mm_adds_epi16(in17, final_rounding);
3804       in18 = _mm_adds_epi16(in18, final_rounding);
3805       in19 = _mm_adds_epi16(in19, final_rounding);
3806       in20 = _mm_adds_epi16(in20, final_rounding);
3807       in21 = _mm_adds_epi16(in21, final_rounding);
3808       in22 = _mm_adds_epi16(in22, final_rounding);
3809       in23 = _mm_adds_epi16(in23, final_rounding);
3810       in24 = _mm_adds_epi16(in24, final_rounding);
3811       in25 = _mm_adds_epi16(in25, final_rounding);
3812       in26 = _mm_adds_epi16(in26, final_rounding);
3813       in27 = _mm_adds_epi16(in27, final_rounding);
3814       in28 = _mm_adds_epi16(in28, final_rounding);
3815       in29 = _mm_adds_epi16(in29, final_rounding);
3816       in30 = _mm_adds_epi16(in30, final_rounding);
3817       in31 = _mm_adds_epi16(in31, final_rounding);
3818 
3819       in0 = _mm_srai_epi16(in0, 6);
3820       in1 = _mm_srai_epi16(in1, 6);
3821       in2 = _mm_srai_epi16(in2, 6);
3822       in3 = _mm_srai_epi16(in3, 6);
3823       in4 = _mm_srai_epi16(in4, 6);
3824       in5 = _mm_srai_epi16(in5, 6);
3825       in6 = _mm_srai_epi16(in6, 6);
3826       in7 = _mm_srai_epi16(in7, 6);
3827       in8 = _mm_srai_epi16(in8, 6);
3828       in9 = _mm_srai_epi16(in9, 6);
3829       in10 = _mm_srai_epi16(in10, 6);
3830       in11 = _mm_srai_epi16(in11, 6);
3831       in12 = _mm_srai_epi16(in12, 6);
3832       in13 = _mm_srai_epi16(in13, 6);
3833       in14 = _mm_srai_epi16(in14, 6);
3834       in15 = _mm_srai_epi16(in15, 6);
3835       in16 = _mm_srai_epi16(in16, 6);
3836       in17 = _mm_srai_epi16(in17, 6);
3837       in18 = _mm_srai_epi16(in18, 6);
3838       in19 = _mm_srai_epi16(in19, 6);
3839       in20 = _mm_srai_epi16(in20, 6);
3840       in21 = _mm_srai_epi16(in21, 6);
3841       in22 = _mm_srai_epi16(in22, 6);
3842       in23 = _mm_srai_epi16(in23, 6);
3843       in24 = _mm_srai_epi16(in24, 6);
3844       in25 = _mm_srai_epi16(in25, 6);
3845       in26 = _mm_srai_epi16(in26, 6);
3846       in27 = _mm_srai_epi16(in27, 6);
3847       in28 = _mm_srai_epi16(in28, 6);
3848       in29 = _mm_srai_epi16(in29, 6);
3849       in30 = _mm_srai_epi16(in30, 6);
3850       in31 = _mm_srai_epi16(in31, 6);
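      // Together, the saturating add of final_rounding and the arithmetic
      // shift right by 6 implement ROUND_POWER_OF_TWO(x, 6) per 16-bit lane
      // (final_rounding is presumably _mm_set1_epi16(1 << 5), set up earlier
      // in the function), i.e. roughly out = (x + 32) >> 6 in scalar form.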
3851 
3852       RECON_AND_STORE(dest, in0);
3853       RECON_AND_STORE(dest, in1);
3854       RECON_AND_STORE(dest, in2);
3855       RECON_AND_STORE(dest, in3);
3856       RECON_AND_STORE(dest, in4);
3857       RECON_AND_STORE(dest, in5);
3858       RECON_AND_STORE(dest, in6);
3859       RECON_AND_STORE(dest, in7);
3860       RECON_AND_STORE(dest, in8);
3861       RECON_AND_STORE(dest, in9);
3862       RECON_AND_STORE(dest, in10);
3863       RECON_AND_STORE(dest, in11);
3864       RECON_AND_STORE(dest, in12);
3865       RECON_AND_STORE(dest, in13);
3866       RECON_AND_STORE(dest, in14);
3867       RECON_AND_STORE(dest, in15);
3868       RECON_AND_STORE(dest, in16);
3869       RECON_AND_STORE(dest, in17);
3870       RECON_AND_STORE(dest, in18);
3871       RECON_AND_STORE(dest, in19);
3872       RECON_AND_STORE(dest, in20);
3873       RECON_AND_STORE(dest, in21);
3874       RECON_AND_STORE(dest, in22);
3875       RECON_AND_STORE(dest, in23);
3876       RECON_AND_STORE(dest, in24);
3877       RECON_AND_STORE(dest, in25);
3878       RECON_AND_STORE(dest, in26);
3879       RECON_AND_STORE(dest, in27);
3880       RECON_AND_STORE(dest, in28);
3881       RECON_AND_STORE(dest, in29);
3882       RECON_AND_STORE(dest, in30);
3883       RECON_AND_STORE(dest, in31);
3884 
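      // Each RECON_AND_STORE above advances dest by one row (stride); after
      // 32 stores, stepping back by stride * 32 and forward by 8 moves to
      // the top of the next 8-pixel-wide column strip.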
3885       dest += 8 - (stride * 32);
3886     }
3887   }
3888 }  //NOLINT
3889 
3890 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
3891   __m128i dc_value;
3892   const __m128i zero = _mm_setzero_si128();
3893   int a, i;
3894 
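  // Only the DC coefficient contributes in this path: each 1-D pass
  // effectively reduces to a scale by cospi_16_64 with DCT_CONST rounding,
  // and ROUND_POWER_OF_TWO(a, 6) is the same output shift used by the full
  // transform, so a single value can be broadcast to every pixel.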
3895   a = dct_const_round_shift(input[0] * cospi_16_64);
3896   a = dct_const_round_shift(a * cospi_16_64);
3897   a = ROUND_POWER_OF_TWO(a, 6);
3898 
3899   dc_value = _mm_set1_epi16(a);
3900 
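  // Write the 32x32 block as four 8-pixel-wide column strips; each loop
  // iteration stores 32 rows of the broadcast DC value and then steps to
  // the next strip.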
3901   for (i = 0; i < 4; ++i) {
3902     RECON_AND_STORE(dest, dc_value);
3903     RECON_AND_STORE(dest, dc_value);
3904     RECON_AND_STORE(dest, dc_value);
3905     RECON_AND_STORE(dest, dc_value);
3906     RECON_AND_STORE(dest, dc_value);
3907     RECON_AND_STORE(dest, dc_value);
3908     RECON_AND_STORE(dest, dc_value);
3909     RECON_AND_STORE(dest, dc_value);
3910     RECON_AND_STORE(dest, dc_value);
3911     RECON_AND_STORE(dest, dc_value);
3912     RECON_AND_STORE(dest, dc_value);
3913     RECON_AND_STORE(dest, dc_value);
3914     RECON_AND_STORE(dest, dc_value);
3915     RECON_AND_STORE(dest, dc_value);
3916     RECON_AND_STORE(dest, dc_value);
3917     RECON_AND_STORE(dest, dc_value);
3918     RECON_AND_STORE(dest, dc_value);
3919     RECON_AND_STORE(dest, dc_value);
3920     RECON_AND_STORE(dest, dc_value);
3921     RECON_AND_STORE(dest, dc_value);
3922     RECON_AND_STORE(dest, dc_value);
3923     RECON_AND_STORE(dest, dc_value);
3924     RECON_AND_STORE(dest, dc_value);
3925     RECON_AND_STORE(dest, dc_value);
3926     RECON_AND_STORE(dest, dc_value);
3927     RECON_AND_STORE(dest, dc_value);
3928     RECON_AND_STORE(dest, dc_value);
3929     RECON_AND_STORE(dest, dc_value);
3930     RECON_AND_STORE(dest, dc_value);
3931     RECON_AND_STORE(dest, dc_value);
3932     RECON_AND_STORE(dest, dc_value);
3933     RECON_AND_STORE(dest, dc_value);
3934     dest += 8 - (stride * 32);
3935   }
3936 }
3937