/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2

#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/fwd_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"

static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                   int stride) {
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i mask;

  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  in[0] = _mm_slli_epi16(in[0], 4);
  in[1] = _mm_slli_epi16(in[1], 4);
  in[2] = _mm_slli_epi16(in[2], 4);
  in[3] = _mm_slli_epi16(in[3], 4);

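  // The bias below adds 1 to the DC-position input only when it is nonzero,
  // mirroring the scalar reference, which biases a nonzero (shifted) DC input
  // by one. mask is all ones in lane 0 iff that lane is zero (lanes 1-7 of
  // k__nonzero_bias_a can never match a value shifted left by 4), so adding
  // mask and then k__nonzero_bias_b leaves zero unchanged and adds one
  // otherwise.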
  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
  in[0] = _mm_add_epi16(in[0], mask);
  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}

static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
  __m128i out01 = _mm_add_epi16(in01, kOne);
  __m128i out23 = _mm_add_epi16(in23, kOne);
  out01 = _mm_srai_epi16(out01, 2);
  out23 = _mm_srai_epi16(out23, 2);
  store_output(&out01, (output + 0 * 8));
  store_output(&out23, (output + 1 * 8));
}

static INLINE void transpose_4x4(__m128i *res) {
  // Combine and transpose
  // 00 01 02 03 20 21 22 23
  // 10 11 12 13 30 31 32 33
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  // 00 10 20 30 01 11 21 31
  // 02 12 22 32 03 13 23 33
  // only use the first 4 16-bit integers
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}

static void fdct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u[4], v[4];
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[3], in[2]);

  v[0] = _mm_add_epi16(u[0], u[1]);
  v[1] = _mm_sub_epi16(u[0], u[1]);

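  // Each _mm_madd_epi16 below computes a * c0 + b * c1 per 32-bit lane from
  // the interleaved 16-bit pairs, i.e. one butterfly rotation; adding
  // DCT_CONST_ROUNDING and shifting right by DCT_CONST_BITS is the vector
  // form of dct_const_round_shift().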
  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
  transpose_4x4(in);
}

static void fadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];
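  // in7 pre-sums x0 + x1: the scalar 4-point ADST needs
  // sinpi_3_9 * (x0 + x1 - x3) for its second output, which is formed below
  // as v[2] - v[6].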
  __m128i in7 = _mm_add_epi16(in[0], in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[2], kZero);
  u[4] = _mm_unpacklo_epi16(in[3], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_sub_epi32(v[2], v[6]);
  u[2] = _mm_add_epi32(v[3], v[4]);
  u[3] = _mm_sub_epi32(u[2], u[0]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_sub_epi32(u[4], v[5]);
  u[6] = _mm_add_epi32(u[3], u[5]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  transpose_4x4(in);
}

void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
  __m128i in[4];

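  // Each 1-D helper transforms the columns and transposes its result, so
  // applying two of them yields the full 2-D transform; tx_type selects the
  // row/column combination of DCT and ADST.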
  switch (tx_type) {
    case DCT_DCT: vpx_fdct4x4_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_4x4(input, in, stride);
      fadst4_sse2(in);
      fdct4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, in, stride);
      fdct4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    default:
      assert(tx_type == ADST_ADST);
      load_buffer_4x4(input, in, stride);
      fadst4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
  }
}

void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
                            tran_low_t *coeff_ptr, intptr_t n_coeffs,
                            int skip_block, const int16_t *round_ptr,
                            const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                            tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                            uint16_t *eob_ptr, const int16_t *scan,
                            const int16_t *iscan) {
  __m128i zero;
  int pass;

  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Load input
  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
  __m128i *in[8];
  int index = 0;

  (void)scan;
  (void)coeff_ptr;

  // Pre-condition input (shift by two)
  in0 = _mm_slli_epi16(in0, 2);
  in1 = _mm_slli_epi16(in1, 2);
  in2 = _mm_slli_epi16(in2, 2);
  in3 = _mm_slli_epi16(in3, 2);
  in4 = _mm_slli_epi16(in4, 2);
  in5 = _mm_slli_epi16(in5, 2);
  in6 = _mm_slli_epi16(in6, 2);
  in7 = _mm_slli_epi16(in7, 2);

  in[0] = &in0;
  in[1] = &in1;
  in[2] = &in2;
  in[3] = &in3;
  in[4] = &in4;
  in[5] = &in5;
  in[6] = &in6;
  in[7] = &in7;

  // We do two passes, first the columns, then the rows. The results of the
  // first pass are transposed so that the same column code can be reused. The
  // results of the second pass are also transposed so that the rows (processed
  // as columns) are put back in row positions.
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
    const __m128i q3 = _mm_add_epi16(in3, in4);
    const __m128i q4 = _mm_sub_epi16(in3, in4);
    const __m128i q5 = _mm_sub_epi16(in2, in5);
    const __m128i q6 = _mm_sub_epi16(in1, in6);
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results
    {
      // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
      const __m128i r3 = _mm_sub_epi16(q0, q3);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res0 = _mm_packs_epi32(w0, w1);
      res4 = _mm_packs_epi32(w2, w3);
      res2 = _mm_packs_epi32(w4, w5);
      res6 = _mm_packs_epi32(w6, w7);
    }
    // Work on next four results
    {
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
      // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
      const __m128i x3 = _mm_add_epi16(q7, r1);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res1 = _mm_packs_epi32(w0, w1);
      res7 = _mm_packs_epi32(w2, w3);
      res5 = _mm_packs_epi32(w4, w5);
      res3 = _mm_packs_epi32(w6, w7);
    }
    // Transpose the 8x8.
    {
      // 00 01 02 03 04 05 06 07
      // 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27
      // 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47
      // 50 51 52 53 54 55 56 57
      // 60 61 62 63 64 65 66 67
      // 70 71 72 73 74 75 76 77
      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
      // 00 10 01 11 02 12 03 13
      // 20 30 21 31 22 32 23 33
      // 04 14 05 15 06 16 07 17
      // 24 34 25 35 26 36 27 37
      // 40 50 41 51 42 52 43 53
      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
      // 64 74 65 75 66 76 67 77
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
      // 00 10 20 30 01 11 21 31
      // 40 50 60 70 41 51 61 71
      // 02 12 22 32 03 13 23 33
      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
      // 06 16 26 36 07 17 27 37
      // 46 56 66 76 47 57 67 77
      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      // 06 16 26 36 46 56 66 76
      // 07 17 27 37 47 57 67 77
    }
  }
  // Post-condition output and store it
  {
    // Post-condition (division by two)
    //    division of two 16 bits signed numbers using shifts
    //    n / 2 = (n - (n >> 15)) >> 1
    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
    in0 = _mm_sub_epi16(in0, sign_in0);
    in1 = _mm_sub_epi16(in1, sign_in1);
    in2 = _mm_sub_epi16(in2, sign_in2);
    in3 = _mm_sub_epi16(in3, sign_in3);
    in4 = _mm_sub_epi16(in4, sign_in4);
    in5 = _mm_sub_epi16(in5, sign_in5);
    in6 = _mm_sub_epi16(in6, sign_in6);
    in7 = _mm_sub_epi16(in7, sign_in7);
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
  }

  iscan += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();

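  // Per 16-bit lane, the quantization below is (a sketch of the scalar
  // equivalent; the names here are illustrative only):
  //   abs     = |coeff|                       (xor with sign mask, subtract)
  //   tmp     = saturating_add(abs, round)
  //   qcoeff  = sign_restore((tmp * quant) >> 16)
  //   dqcoeff = qcoeff * dequant
  // Lane 0 of round/quant/dequant holds the DC value; the
  // _mm_unpackhi_epi64() shuffles after the first use broadcast the AC half
  // for all remaining coefficients.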
  if (!skip_block) {
    __m128i eob;
    __m128i round, quant, dequant;
    {
      __m128i coeff0, coeff1;

      // Setup global values
      {
        round = _mm_load_si128((const __m128i *)round_ptr);
        quant = _mm_load_si128((const __m128i *)quant_ptr);
        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
      }

      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        // Do DC and first 15 AC
        coeff0 = *in[0];
        coeff1 = *in[1];

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }

    // AC only loop
    index = 2;
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;

        assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
        coeff0 = *in[index];
        coeff1 = *in[index + 1];

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
      index += 2;
    }

    // Accumulate EOB
    {
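      // Horizontal max across the eight 16-bit lanes: fold 128 -> 64 -> 32 ->
      // 16 bits so lane 1 ends up holding the overall maximum, i.e. the
      // largest (iscan index + 1) of any nonzero coefficient.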
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    do {
      store_tran_low(zero, qcoeff_ptr + n_coeffs);
      store_tran_low(zero, qcoeff_ptr + n_coeffs + 8);
      store_tran_low(zero, dqcoeff_ptr + n_coeffs);
      store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}

// load 8x8 array
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                   int stride) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
  in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));

  in[0] = _mm_slli_epi16(in[0], 2);
  in[1] = _mm_slli_epi16(in[1], 2);
  in[2] = _mm_slli_epi16(in[2], 2);
  in[3] = _mm_slli_epi16(in[3], 2);
  in[4] = _mm_slli_epi16(in[4], 2);
  in[5] = _mm_slli_epi16(in[5], 2);
  in[6] = _mm_slli_epi16(in[6], 2);
  in[7] = _mm_slli_epi16(in[7], 2);
}

// right shift and rounding
static INLINE void right_shift_8x8(__m128i *res, const int bit) {
  __m128i sign0 = _mm_srai_epi16(res[0], 15);
  __m128i sign1 = _mm_srai_epi16(res[1], 15);
  __m128i sign2 = _mm_srai_epi16(res[2], 15);
  __m128i sign3 = _mm_srai_epi16(res[3], 15);
  __m128i sign4 = _mm_srai_epi16(res[4], 15);
  __m128i sign5 = _mm_srai_epi16(res[5], 15);
  __m128i sign6 = _mm_srai_epi16(res[6], 15);
  __m128i sign7 = _mm_srai_epi16(res[7], 15);

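  // bit == 1 computes n / 2 = (n - (n >> 15)) >> 1, rounding toward zero;
  // bit == 2 computes (n + 1 - (n >> 15)) >> 2, a rounded divide by 4.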
  if (bit == 2) {
    const __m128i const_rounding = _mm_set1_epi16(1);
    res[0] = _mm_add_epi16(res[0], const_rounding);
    res[1] = _mm_add_epi16(res[1], const_rounding);
    res[2] = _mm_add_epi16(res[2], const_rounding);
    res[3] = _mm_add_epi16(res[3], const_rounding);
    res[4] = _mm_add_epi16(res[4], const_rounding);
    res[5] = _mm_add_epi16(res[5], const_rounding);
    res[6] = _mm_add_epi16(res[6], const_rounding);
    res[7] = _mm_add_epi16(res[7], const_rounding);
  }

  res[0] = _mm_sub_epi16(res[0], sign0);
  res[1] = _mm_sub_epi16(res[1], sign1);
  res[2] = _mm_sub_epi16(res[2], sign2);
  res[3] = _mm_sub_epi16(res[3], sign3);
  res[4] = _mm_sub_epi16(res[4], sign4);
  res[5] = _mm_sub_epi16(res[5], sign5);
  res[6] = _mm_sub_epi16(res[6], sign6);
  res[7] = _mm_sub_epi16(res[7], sign7);

  if (bit == 1) {
    res[0] = _mm_srai_epi16(res[0], 1);
    res[1] = _mm_srai_epi16(res[1], 1);
    res[2] = _mm_srai_epi16(res[2], 1);
    res[3] = _mm_srai_epi16(res[3], 1);
    res[4] = _mm_srai_epi16(res[4], 1);
    res[5] = _mm_srai_epi16(res[5], 1);
    res[6] = _mm_srai_epi16(res[6], 1);
    res[7] = _mm_srai_epi16(res[7], 1);
  } else {
    res[0] = _mm_srai_epi16(res[0], 2);
    res[1] = _mm_srai_epi16(res[1], 2);
    res[2] = _mm_srai_epi16(res[2], 2);
    res[3] = _mm_srai_epi16(res[3], 2);
    res[4] = _mm_srai_epi16(res[4], 2);
    res[5] = _mm_srai_epi16(res[5], 2);
    res[6] = _mm_srai_epi16(res[6], 2);
    res[7] = _mm_srai_epi16(res[7], 2);
  }
}

// write 8x8 array
static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
                                    int stride) {
  store_output(&res[0], (output + 0 * stride));
  store_output(&res[1], (output + 1 * stride));
  store_output(&res[2], (output + 2 * stride));
  store_output(&res[3], (output + 3 * stride));
  store_output(&res[4], (output + 4 * stride));
  store_output(&res[5], (output + 5 * stride));
  store_output(&res[6], (output + 6 * stride));
  store_output(&res[7], (output + 7 * stride));
}

static void fdct8_sse2(__m128i *in) {
  // constants
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 1
  s0 = _mm_add_epi16(in[0], in[7]);
  s1 = _mm_add_epi16(in[1], in[6]);
  s2 = _mm_add_epi16(in[2], in[5]);
  s3 = _mm_add_epi16(in[3], in[4]);
  s4 = _mm_sub_epi16(in[3], in[4]);
  s5 = _mm_sub_epi16(in[2], in[5]);
  s6 = _mm_sub_epi16(in[1], in[6]);
  s7 = _mm_sub_epi16(in[0], in[7]);

  u0 = _mm_add_epi16(s0, s3);
  u1 = _mm_add_epi16(s1, s2);
  u2 = _mm_sub_epi16(s1, s2);
  u3 = _mm_sub_epi16(s0, s3);
  // interleave and perform butterfly multiplication/addition
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpackhi_epi16(u0, u1);
  v2 = _mm_unpacklo_epi16(u2, u3);
  v3 = _mm_unpackhi_epi16(u2, u3);

  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);

  // shift and rounding
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u0, u1);
  in[2] = _mm_packs_epi32(u4, u5);
  in[4] = _mm_packs_epi32(u2, u3);
  in[6] = _mm_packs_epi32(u6, u7);

  // stage 2
  // interleave and perform butterfly multiplication/addition
  u0 = _mm_unpacklo_epi16(s6, s5);
  u1 = _mm_unpackhi_epi16(s6, s5);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);

  u0 = _mm_packs_epi32(v0, v1);
  u1 = _mm_packs_epi32(v2, v3);

  // stage 3
  s0 = _mm_add_epi16(s4, u0);
  s1 = _mm_sub_epi16(s4, u0);
  s2 = _mm_sub_epi16(s7, u1);
  s3 = _mm_add_epi16(s7, u1);

  // stage 4
  u0 = _mm_unpacklo_epi16(s0, s3);
  u1 = _mm_unpackhi_epi16(s0, s3);
  u2 = _mm_unpacklo_epi16(s1, s2);
  u3 = _mm_unpackhi_epi16(s1, s2);

  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v0, v1);
  in[3] = _mm_packs_epi32(v4, v5);
  in[5] = _mm_packs_epi32(v2, v3);
  in[7] = _mm_packs_epi32(v6, v7);

  // transpose
  transpose_16bit_8x8(in, in);
}

static void fadst8_sse2(__m128i *in) {
  // Constants
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // FIXME(jingning): do subtract using bit inversion?
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);

  // transpose
  transpose_16bit_8x8(in, in);
}

void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
  __m128i in[8];

  switch (tx_type) {
    case DCT_DCT: vpx_fdct8x8_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_8x8(input, in, stride);
      fadst8_sse2(in);
      fdct8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
    case DCT_ADST:
      load_buffer_8x8(input, in, stride);
      fdct8_sse2(in);
      fadst8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
    default:
      assert(tx_type == ADST_ADST);
      load_buffer_8x8(input, in, stride);
      fadst8_sse2(in);
      fadst8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
  }
}

static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
                                     __m128i *in1, int stride) {
  // load first 8 columns
  load_buffer_8x8(input, in0, stride);
  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);

  input += 8;
  // load second 8 columns
  load_buffer_8x8(input, in1, stride);
  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
}

static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
                                      __m128i *in1, int stride) {
  // write first 8 columns
  write_buffer_8x8(output, in0, stride);
  write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
  // write second 8 columns
  output += 8;
  write_buffer_8x8(output, in1, stride);
  write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
}

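// Scale the intermediate 16x16 results down with a rounded right shift by 2,
// applied to each of the four 8x8 quadrants.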
static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
  // perform rounding operations
  right_shift_8x8(res0, 2);
  right_shift_8x8(res0 + 8, 2);
  right_shift_8x8(res1, 2);
  right_shift_8x8(res1 + 8, 2);
}

static void fdct16_8col(__m128i *in) {
  // perform the 1-D 16-point DCT on 8 columns
1145   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
1146   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
1147   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1148   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1149   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1150   const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
1151   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1152   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
1153   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
1154   const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
1155   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
1156   const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
1157   const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
1158   const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
1159   const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
1160   const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
1161   const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
1162   const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
1163   const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
1164   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1165 
1166   // stage 1
1167   i[0] = _mm_add_epi16(in[0], in[15]);
1168   i[1] = _mm_add_epi16(in[1], in[14]);
1169   i[2] = _mm_add_epi16(in[2], in[13]);
1170   i[3] = _mm_add_epi16(in[3], in[12]);
1171   i[4] = _mm_add_epi16(in[4], in[11]);
1172   i[5] = _mm_add_epi16(in[5], in[10]);
1173   i[6] = _mm_add_epi16(in[6], in[9]);
1174   i[7] = _mm_add_epi16(in[7], in[8]);
1175 
1176   s[0] = _mm_sub_epi16(in[7], in[8]);
1177   s[1] = _mm_sub_epi16(in[6], in[9]);
1178   s[2] = _mm_sub_epi16(in[5], in[10]);
1179   s[3] = _mm_sub_epi16(in[4], in[11]);
1180   s[4] = _mm_sub_epi16(in[3], in[12]);
1181   s[5] = _mm_sub_epi16(in[2], in[13]);
1182   s[6] = _mm_sub_epi16(in[1], in[14]);
1183   s[7] = _mm_sub_epi16(in[0], in[15]);
1184 
1185   p[0] = _mm_add_epi16(i[0], i[7]);
1186   p[1] = _mm_add_epi16(i[1], i[6]);
1187   p[2] = _mm_add_epi16(i[2], i[5]);
1188   p[3] = _mm_add_epi16(i[3], i[4]);
1189   p[4] = _mm_sub_epi16(i[3], i[4]);
1190   p[5] = _mm_sub_epi16(i[2], i[5]);
1191   p[6] = _mm_sub_epi16(i[1], i[6]);
1192   p[7] = _mm_sub_epi16(i[0], i[7]);
1193 
1194   u[0] = _mm_add_epi16(p[0], p[3]);
1195   u[1] = _mm_add_epi16(p[1], p[2]);
1196   u[2] = _mm_sub_epi16(p[1], p[2]);
1197   u[3] = _mm_sub_epi16(p[0], p[3]);
1198 
1199   v[0] = _mm_unpacklo_epi16(u[0], u[1]);
1200   v[1] = _mm_unpackhi_epi16(u[0], u[1]);
1201   v[2] = _mm_unpacklo_epi16(u[2], u[3]);
1202   v[3] = _mm_unpackhi_epi16(u[2], u[3]);
1203 
1204   u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
1205   u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
1206   u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
1207   u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
1208   u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
1209   u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
1210   u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
1211   u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
1212 
1213   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1214   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1215   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1216   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1217   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1218   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1219   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1220   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1221 
1222   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1223   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1224   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1225   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1226   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1227   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1228   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1229   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1230 
1231   in[0] = _mm_packs_epi32(u[0], u[1]);
1232   in[4] = _mm_packs_epi32(u[4], u[5]);
1233   in[8] = _mm_packs_epi32(u[2], u[3]);
1234   in[12] = _mm_packs_epi32(u[6], u[7]);
1235 
  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[2], v[3]);

  t[0] = _mm_add_epi16(p[4], u[0]);
  t[1] = _mm_sub_epi16(p[4], u[0]);
  t[2] = _mm_sub_epi16(p[7], u[1]);
  t[3] = _mm_add_epi16(p[7], u[1]);

  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
  u[3] = _mm_unpackhi_epi16(t[1], t[2]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  in[2] = _mm_packs_epi32(v[0], v[1]);
  in[6] = _mm_packs_epi32(v[4], v[5]);
  in[10] = _mm_packs_epi32(v[2], v[3]);
  in[14] = _mm_packs_epi32(v[6], v[7]);

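  // Odd half: stages 2-6 below turn the input differences s[0..7] into the
  // odd output rows 1, 3, 5, ..., 15.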
  // stage 2
  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
  u[3] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[2] = _mm_packs_epi32(v[0], v[1]);
  t[3] = _mm_packs_epi32(v[2], v[3]);
  t[4] = _mm_packs_epi32(v[4], v[5]);
  t[5] = _mm_packs_epi32(v[6], v[7]);

  // stage 3
  p[0] = _mm_add_epi16(s[0], t[3]);
  p[1] = _mm_add_epi16(s[1], t[2]);
  p[2] = _mm_sub_epi16(s[1], t[2]);
  p[3] = _mm_sub_epi16(s[0], t[3]);
  p[4] = _mm_sub_epi16(s[7], t[4]);
  p[5] = _mm_sub_epi16(s[6], t[5]);
  p[6] = _mm_add_epi16(s[6], t[5]);
  p[7] = _mm_add_epi16(s[7], t[4]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
  u[3] = _mm_unpackhi_epi16(p[2], p[5]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[1] = _mm_packs_epi32(v[0], v[1]);
  t[2] = _mm_packs_epi32(v[2], v[3]);
  t[5] = _mm_packs_epi32(v[4], v[5]);
  t[6] = _mm_packs_epi32(v[6], v[7]);

  // stage 5
  s[0] = _mm_add_epi16(p[0], t[1]);
  s[1] = _mm_sub_epi16(p[0], t[1]);
  s[2] = _mm_add_epi16(p[3], t[2]);
  s[3] = _mm_sub_epi16(p[3], t[2]);
  s[4] = _mm_sub_epi16(p[4], t[5]);
  s[5] = _mm_add_epi16(p[4], t[5]);
  s[6] = _mm_sub_epi16(p[7], t[6]);
  s[7] = _mm_add_epi16(p[7], t[6]);

  // stage 6
  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
  u[7] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v[0], v[1]);
  in[9] = _mm_packs_epi32(v[2], v[3]);
  in[5] = _mm_packs_epi32(v[4], v[5]);
  in[13] = _mm_packs_epi32(v[6], v[7]);
  in[3] = _mm_packs_epi32(v[8], v[9]);
  in[11] = _mm_packs_epi32(v[10], v[11]);
  in[7] = _mm_packs_epi32(v[12], v[13]);
  in[15] = _mm_packs_epi32(v[14], v[15]);
}

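// 1-D 16-point ADST over in[0..15], eight 16-bit columns at a time. As in
// the DCT path above, every rotation is an _mm_madd_epi16 against paired
// cosine constants followed by a DCT_CONST_BITS rounding shift.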
static void fadst16_8col(__m128i *in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

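  // stage 1: interleave input n with input 15 - n, then rotate each pair by
  // the odd cosine constants (cospi_1/31 through cospi_29/3).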
  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

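  // Cross-half butterfly kept at 32-bit precision: u[0..15] are sums and
  // u[16..31] differences of corresponding rotated terms, so a single
  // rounding shift suffices for the combined result.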
  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

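  // x[0..7]: plain 16-bit butterflies of the pass-through half;
  // x[8..15]: the rotated half packed back down to 16 bits.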
  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

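  // Final output: a mix of pass-through terms (s[0], s[5], s[9], s[12]),
  // stage-4 rotation results, and kZero - s[k] sign flips that give the
  // ADST output sign pattern.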
  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

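// Each wrapper below runs its 1-D transform on both 8-column halves, then
// transposes, so calling a wrapper twice yields the full 2-D 16x16 result.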
static void fdct16_sse2(__m128i *in0, __m128i *in1) {
  fdct16_8col(in0);
  fdct16_8col(in1);
  transpose_16bit_16x16(in0, in1);
}

static void fadst16_sse2(__m128i *in0, __m128i *in1) {
  fadst16_8col(in0);
  fadst16_8col(in1);
  transpose_16bit_16x16(in0, in1);
}

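// Hybrid 16x16 forward transform: tx_type selects DCT or ADST per dimension.
// The pure DCT case is delegated to vpx_fdct16x16_sse2; the mixed cases run
// one 1-D pass, rescale with right_shift_16x16, then run the second pass.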
void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                       int tx_type) {
  __m128i in0[16], in1[16];

  switch (tx_type) {
    case DCT_DCT: vpx_fdct16x16_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_16x16(input, in0, in1, stride);
      fadst16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    case DCT_ADST:
      load_buffer_16x16(input, in0, in1, stride);
      fdct16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    default:
      assert(tx_type == ADST_ADST);
      load_buffer_16x16(input, in0, in1, stride);
      fadst16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
  }
}
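
// Usage sketch (illustrative only, not part of the library): assuming the
// vp9 TX_TYPE enum (DCT_DCT, ADST_DCT, DCT_ADST, ADST_ADST) and a caller
// that has filled a 16x16 residual block, one hybrid transform looks like:
//
//   DECLARE_ALIGNED(16, int16_t, residual[16 * 16]);
//   DECLARE_ALIGNED(16, tran_low_t, coeff[16 * 16]);
//   /* ... compute residual = src - pred ... */
//   vp9_fht16x16_sse2(residual, coeff, /*stride=*/16, ADST_ADST);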