• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <emmintrin.h>  // SSE2
12 
13 #include "vpx_dsp/fwd_txfm.h"
14 #include "vpx_dsp/txfm_common.h"
15 #include "vpx_dsp/x86/txfm_common_sse2.h"
16 
17 // TODO(jingning) The high bit-depth version needs re-work for performance.
18 // The current SSE2 implementation also causes cross reference to the static
19 // functions in the C implementation file.
20 #if DCT_HIGH_BIT_DEPTH
21 #define ADD_EPI16 _mm_adds_epi16
22 #define SUB_EPI16 _mm_subs_epi16
23 #if FDCT32x32_HIGH_PRECISION
// C row pass used when the SSE2 path hands off to C (high-precision build).
//
// `intermediate` is read as a 32x32 int16_t array with a row stride of 32.
// For each index c in [0, 32), the 32 values intermediate[k * 32 + c] are
// widened to tran_high_t, run through vpx_fdct32(), and the 32 results are
// rounded and written to out[c * 32 + k].
//
// The final step computes (v + 1 + (v < 0)) >> 2 for every coefficient, i.e.
// negative values receive one extra unit of bias before the shift by two.
// NOTE(review): the third argument of vpx_fdct32() is presumably its
// rounding flag (0 here) -- confirm against its declaration.
static void vpx_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
  int col;
  for (col = 0; col < 32; ++col) {
    tran_high_t column_in[32];
    tran_high_t column_out[32];
    const int16_t *src = intermediate + col;
    tran_low_t *dst = out + col * 32;
    int k;

    // Gather one column of the intermediate buffer into a contiguous array.
    for (k = 0; k < 32; ++k) {
      column_in[k] = src[k * 32];
    }

    vpx_fdct32(column_in, column_out, 0);

    // Bias by 1 (2 for negative values), then shift right by two.
    for (k = 0; k < 32; ++k) {
      const tran_high_t v = column_out[k];
      const tran_high_t bias = (v < 0) ? 2 : 1;
      dst[k] = (tran_low_t)((v + bias) >> 2);
    }
  }
}
35 #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_c
36 #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rows_c
37 #else
// C row pass used when the SSE2 path hands off to C (non-high-precision
// build).
//
// `intermediate` is read as a 32x32 int16_t array with a row stride of 32.
// For each index c in [0, 32), the 32 values intermediate[k * 32 + c] are
// widened to tran_high_t, run through vpx_fdct32(), and the 32 results are
// cast to tran_low_t and written to out[c * 32 + k] with no further scaling
// in this function.
// NOTE(review): the third argument of vpx_fdct32() is presumably its
// rounding flag (1 here) -- confirm against its declaration.
static void vpx_fdct32x32_rd_rows_c(const int16_t *intermediate,
                                    tran_low_t *out) {
  int col;
  for (col = 0; col < 32; ++col) {
    tran_high_t column_in[32];
    tran_high_t column_out[32];
    const int16_t *src = intermediate + col;
    tran_low_t *dst = out + col * 32;
    int k;

    // Gather one column of the intermediate buffer into a contiguous array.
    for (k = 0; k < 32; ++k) {
      column_in[k] = src[k * 32];
    }

    vpx_fdct32(column_in, column_out, 1);

    // Store the transformed values as-is.
    for (k = 0; k < 32; ++k) {
      dst[k] = (tran_low_t)column_out[k];
    }
  }
}
48 #define HIGH_FDCT32x32_2D_C vpx_highbd_fdct32x32_rd_c
49 #define HIGH_FDCT32x32_2D_ROWS_C vpx_fdct32x32_rd_rows_c
50 #endif  // FDCT32x32_HIGH_PRECISION
51 #else
52 #define ADD_EPI16 _mm_add_epi16
53 #define SUB_EPI16 _mm_sub_epi16
54 #endif  // DCT_HIGH_BIT_DEPTH
55 
FDCT32x32_2D(const int16_t * input,tran_low_t * output_org,int stride)56 void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
57   // Calculate pre-multiplied strides
58   const int str1 = stride;
59   const int str2 = 2 * stride;
60   const int str3 = 2 * stride + str1;
61   // We need an intermediate buffer between passes.
62   DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
63   // Constants
64   //    When we use them, in one case, they are all the same. In all others
65   //    it's a pair of them that we need to repeat four times. This is done
66   //    by constructing the 32 bit constant corresponding to that pair.
67   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
68   const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
69   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
70   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
71   const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
72   const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
73   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
74   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
75   const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
76   const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
77   const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
78   const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
79   const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
80   const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
81   const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
82   const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
83   const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
84   const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
85   const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
86   const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
87   const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
88   const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
89   const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
90   const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
91   const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
92   const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
93   const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
94   const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
95   const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
96   const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
97   const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
98   const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
99   const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
100   const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
101   const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
102   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
103   const __m128i kZero = _mm_set1_epi16(0);
104   const __m128i kOne = _mm_set1_epi16(1);
105 
106   // Do the two transform/transpose passes
107   int pass;
108 #if DCT_HIGH_BIT_DEPTH
109   int overflow;
110 #endif
111   for (pass = 0; pass < 2; ++pass) {
112     // We process eight columns (transposed rows in second pass) at a time.
113     int column_start;
114     for (column_start = 0; column_start < 32; column_start += 8) {
115       __m128i step1[32];
116       __m128i step2[32];
117       __m128i step3[32];
118       __m128i out[32];
119       // Stage 1
120       // Note: even though all the loads below are aligned, using the aligned
121       //       intrinsic make the code slightly slower.
122       if (0 == pass) {
123         const int16_t *in = &input[column_start];
124         // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
125         // Note: the next four blocks could be in a loop. That would help the
126         //       instruction cache but is actually slower.
127         {
128           const int16_t *ina = in + 0 * str1;
129           const int16_t *inb = in + 31 * str1;
130           __m128i *step1a = &step1[0];
131           __m128i *step1b = &step1[31];
132           const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
133           const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
134           const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
135           const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
136           const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
137           const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
138           const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
139           const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
140           step1a[0] = _mm_add_epi16(ina0, inb0);
141           step1a[1] = _mm_add_epi16(ina1, inb1);
142           step1a[2] = _mm_add_epi16(ina2, inb2);
143           step1a[3] = _mm_add_epi16(ina3, inb3);
144           step1b[-3] = _mm_sub_epi16(ina3, inb3);
145           step1b[-2] = _mm_sub_epi16(ina2, inb2);
146           step1b[-1] = _mm_sub_epi16(ina1, inb1);
147           step1b[-0] = _mm_sub_epi16(ina0, inb0);
148           step1a[0] = _mm_slli_epi16(step1a[0], 2);
149           step1a[1] = _mm_slli_epi16(step1a[1], 2);
150           step1a[2] = _mm_slli_epi16(step1a[2], 2);
151           step1a[3] = _mm_slli_epi16(step1a[3], 2);
152           step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
153           step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
154           step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
155           step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
156         }
157         {
158           const int16_t *ina = in + 4 * str1;
159           const int16_t *inb = in + 27 * str1;
160           __m128i *step1a = &step1[4];
161           __m128i *step1b = &step1[27];
162           const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
163           const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
164           const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
165           const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
166           const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
167           const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
168           const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
169           const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
170           step1a[0] = _mm_add_epi16(ina0, inb0);
171           step1a[1] = _mm_add_epi16(ina1, inb1);
172           step1a[2] = _mm_add_epi16(ina2, inb2);
173           step1a[3] = _mm_add_epi16(ina3, inb3);
174           step1b[-3] = _mm_sub_epi16(ina3, inb3);
175           step1b[-2] = _mm_sub_epi16(ina2, inb2);
176           step1b[-1] = _mm_sub_epi16(ina1, inb1);
177           step1b[-0] = _mm_sub_epi16(ina0, inb0);
178           step1a[0] = _mm_slli_epi16(step1a[0], 2);
179           step1a[1] = _mm_slli_epi16(step1a[1], 2);
180           step1a[2] = _mm_slli_epi16(step1a[2], 2);
181           step1a[3] = _mm_slli_epi16(step1a[3], 2);
182           step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
183           step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
184           step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
185           step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
186         }
187         {
188           const int16_t *ina = in + 8 * str1;
189           const int16_t *inb = in + 23 * str1;
190           __m128i *step1a = &step1[8];
191           __m128i *step1b = &step1[23];
192           const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
193           const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
194           const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
195           const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
196           const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
197           const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
198           const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
199           const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
200           step1a[0] = _mm_add_epi16(ina0, inb0);
201           step1a[1] = _mm_add_epi16(ina1, inb1);
202           step1a[2] = _mm_add_epi16(ina2, inb2);
203           step1a[3] = _mm_add_epi16(ina3, inb3);
204           step1b[-3] = _mm_sub_epi16(ina3, inb3);
205           step1b[-2] = _mm_sub_epi16(ina2, inb2);
206           step1b[-1] = _mm_sub_epi16(ina1, inb1);
207           step1b[-0] = _mm_sub_epi16(ina0, inb0);
208           step1a[0] = _mm_slli_epi16(step1a[0], 2);
209           step1a[1] = _mm_slli_epi16(step1a[1], 2);
210           step1a[2] = _mm_slli_epi16(step1a[2], 2);
211           step1a[3] = _mm_slli_epi16(step1a[3], 2);
212           step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
213           step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
214           step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
215           step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
216         }
217         {
218           const int16_t *ina = in + 12 * str1;
219           const int16_t *inb = in + 19 * str1;
220           __m128i *step1a = &step1[12];
221           __m128i *step1b = &step1[19];
222           const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
223           const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
224           const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
225           const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
226           const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
227           const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
228           const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
229           const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
230           step1a[0] = _mm_add_epi16(ina0, inb0);
231           step1a[1] = _mm_add_epi16(ina1, inb1);
232           step1a[2] = _mm_add_epi16(ina2, inb2);
233           step1a[3] = _mm_add_epi16(ina3, inb3);
234           step1b[-3] = _mm_sub_epi16(ina3, inb3);
235           step1b[-2] = _mm_sub_epi16(ina2, inb2);
236           step1b[-1] = _mm_sub_epi16(ina1, inb1);
237           step1b[-0] = _mm_sub_epi16(ina0, inb0);
238           step1a[0] = _mm_slli_epi16(step1a[0], 2);
239           step1a[1] = _mm_slli_epi16(step1a[1], 2);
240           step1a[2] = _mm_slli_epi16(step1a[2], 2);
241           step1a[3] = _mm_slli_epi16(step1a[3], 2);
242           step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
243           step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
244           step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
245           step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
246         }
247       } else {
248         int16_t *in = &intermediate[column_start];
249         // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
250         // Note: using the same approach as above to have common offset is
251         //       counter-productive as all offsets can be calculated at compile
252         //       time.
253         // Note: the next four blocks could be in a loop. That would help the
254         //       instruction cache but is actually slower.
255         {
256           __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
257           __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
258           __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
259           __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
260           __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
261           __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
262           __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
263           __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
264           step1[0] = ADD_EPI16(in00, in31);
265           step1[1] = ADD_EPI16(in01, in30);
266           step1[2] = ADD_EPI16(in02, in29);
267           step1[3] = ADD_EPI16(in03, in28);
268           step1[28] = SUB_EPI16(in03, in28);
269           step1[29] = SUB_EPI16(in02, in29);
270           step1[30] = SUB_EPI16(in01, in30);
271           step1[31] = SUB_EPI16(in00, in31);
272 #if DCT_HIGH_BIT_DEPTH
273           overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
274                                              &step1[3], &step1[28], &step1[29],
275                                              &step1[30], &step1[31]);
276           if (overflow) {
277             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
278             return;
279           }
280 #endif  // DCT_HIGH_BIT_DEPTH
281         }
282         {
283           __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
284           __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
285           __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
286           __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
287           __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
288           __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
289           __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
290           __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
291           step1[4] = ADD_EPI16(in04, in27);
292           step1[5] = ADD_EPI16(in05, in26);
293           step1[6] = ADD_EPI16(in06, in25);
294           step1[7] = ADD_EPI16(in07, in24);
295           step1[24] = SUB_EPI16(in07, in24);
296           step1[25] = SUB_EPI16(in06, in25);
297           step1[26] = SUB_EPI16(in05, in26);
298           step1[27] = SUB_EPI16(in04, in27);
299 #if DCT_HIGH_BIT_DEPTH
300           overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
301                                              &step1[7], &step1[24], &step1[25],
302                                              &step1[26], &step1[27]);
303           if (overflow) {
304             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
305             return;
306           }
307 #endif  // DCT_HIGH_BIT_DEPTH
308         }
309         {
310           __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
311           __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
312           __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
313           __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
314           __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
315           __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
316           __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
317           __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
318           step1[8] = ADD_EPI16(in08, in23);
319           step1[9] = ADD_EPI16(in09, in22);
320           step1[10] = ADD_EPI16(in10, in21);
321           step1[11] = ADD_EPI16(in11, in20);
322           step1[20] = SUB_EPI16(in11, in20);
323           step1[21] = SUB_EPI16(in10, in21);
324           step1[22] = SUB_EPI16(in09, in22);
325           step1[23] = SUB_EPI16(in08, in23);
326 #if DCT_HIGH_BIT_DEPTH
327           overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
328                                              &step1[11], &step1[20], &step1[21],
329                                              &step1[22], &step1[23]);
330           if (overflow) {
331             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
332             return;
333           }
334 #endif  // DCT_HIGH_BIT_DEPTH
335         }
336         {
337           __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
338           __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
339           __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
340           __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
341           __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
342           __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
343           __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
344           __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
345           step1[12] = ADD_EPI16(in12, in19);
346           step1[13] = ADD_EPI16(in13, in18);
347           step1[14] = ADD_EPI16(in14, in17);
348           step1[15] = ADD_EPI16(in15, in16);
349           step1[16] = SUB_EPI16(in15, in16);
350           step1[17] = SUB_EPI16(in14, in17);
351           step1[18] = SUB_EPI16(in13, in18);
352           step1[19] = SUB_EPI16(in12, in19);
353 #if DCT_HIGH_BIT_DEPTH
354           overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
355                                              &step1[15], &step1[16], &step1[17],
356                                              &step1[18], &step1[19]);
357           if (overflow) {
358             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
359             return;
360           }
361 #endif  // DCT_HIGH_BIT_DEPTH
362         }
363       }
364       // Stage 2
365       {
366         step2[0] = ADD_EPI16(step1[0], step1[15]);
367         step2[1] = ADD_EPI16(step1[1], step1[14]);
368         step2[2] = ADD_EPI16(step1[2], step1[13]);
369         step2[3] = ADD_EPI16(step1[3], step1[12]);
370         step2[4] = ADD_EPI16(step1[4], step1[11]);
371         step2[5] = ADD_EPI16(step1[5], step1[10]);
372         step2[6] = ADD_EPI16(step1[6], step1[9]);
373         step2[7] = ADD_EPI16(step1[7], step1[8]);
374         step2[8] = SUB_EPI16(step1[7], step1[8]);
375         step2[9] = SUB_EPI16(step1[6], step1[9]);
376         step2[10] = SUB_EPI16(step1[5], step1[10]);
377         step2[11] = SUB_EPI16(step1[4], step1[11]);
378         step2[12] = SUB_EPI16(step1[3], step1[12]);
379         step2[13] = SUB_EPI16(step1[2], step1[13]);
380         step2[14] = SUB_EPI16(step1[1], step1[14]);
381         step2[15] = SUB_EPI16(step1[0], step1[15]);
382 #if DCT_HIGH_BIT_DEPTH
383         overflow = check_epi16_overflow_x16(
384             &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
385             &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
386             &step2[12], &step2[13], &step2[14], &step2[15]);
387         if (overflow) {
388           if (pass == 0)
389             HIGH_FDCT32x32_2D_C(input, output_org, stride);
390           else
391             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
392           return;
393         }
394 #endif  // DCT_HIGH_BIT_DEPTH
395       }
396       {
397         const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
398         const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
399         const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
400         const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
401         const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
402         const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
403         const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
404         const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
405         const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
406         const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
407         const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
408         const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
409         const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
410         const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
411         const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
412         const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
413         const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
414         const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
415         const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
416         const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
417         const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
418         const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
419         const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
420         const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
421         // dct_const_round_shift
422         const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
423         const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
424         const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
425         const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
426         const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
427         const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
428         const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
429         const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
430         const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
431         const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
432         const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
433         const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
434         const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
435         const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
436         const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
437         const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
438         const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
439         const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
440         const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
441         const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
442         const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
443         const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
444         const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
445         const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
446         const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
447         const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
448         const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
449         const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
450         const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
451         const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
452         const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
453         const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
454         // Combine
455         step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
456         step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
457         step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
458         step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
459         step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
460         step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
461         step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
462         step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
463 #if DCT_HIGH_BIT_DEPTH
464         overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
465                                            &step2[23], &step2[24], &step2[25],
466                                            &step2[26], &step2[27]);
467         if (overflow) {
468           if (pass == 0)
469             HIGH_FDCT32x32_2D_C(input, output_org, stride);
470           else
471             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
472           return;
473         }
474 #endif  // DCT_HIGH_BIT_DEPTH
475       }
476 
477 #if !FDCT32x32_HIGH_PRECISION
478       // dump the magnitude by half, hence the intermediate values are within
479       // the range of 16 bits.
480       if (1 == pass) {
481         __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
482         __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
483         __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
484         __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
485         __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
486         __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
487         __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
488         __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
489         __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero);
490         __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero);
491         __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
492         __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
493         __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
494         __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
495         __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
496         __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
497         __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
498         __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
499         __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
500         __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
501         __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
502         __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
503         __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
504         __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
505         __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
506         __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
507         __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
508         __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
509         __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
510         __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
511         __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
512         __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
513 
514         step2[0] = SUB_EPI16(step2[0], s3_00_0);
515         step2[1] = SUB_EPI16(step2[1], s3_01_0);
516         step2[2] = SUB_EPI16(step2[2], s3_02_0);
517         step2[3] = SUB_EPI16(step2[3], s3_03_0);
518         step2[4] = SUB_EPI16(step2[4], s3_04_0);
519         step2[5] = SUB_EPI16(step2[5], s3_05_0);
520         step2[6] = SUB_EPI16(step2[6], s3_06_0);
521         step2[7] = SUB_EPI16(step2[7], s3_07_0);
522         step2[8] = SUB_EPI16(step2[8], s2_08_0);
523         step2[9] = SUB_EPI16(step2[9], s2_09_0);
524         step2[10] = SUB_EPI16(step2[10], s3_10_0);
525         step2[11] = SUB_EPI16(step2[11], s3_11_0);
526         step2[12] = SUB_EPI16(step2[12], s3_12_0);
527         step2[13] = SUB_EPI16(step2[13], s3_13_0);
528         step2[14] = SUB_EPI16(step2[14], s2_14_0);
529         step2[15] = SUB_EPI16(step2[15], s2_15_0);
530         step1[16] = SUB_EPI16(step1[16], s3_16_0);
531         step1[17] = SUB_EPI16(step1[17], s3_17_0);
532         step1[18] = SUB_EPI16(step1[18], s3_18_0);
533         step1[19] = SUB_EPI16(step1[19], s3_19_0);
534         step2[20] = SUB_EPI16(step2[20], s3_20_0);
535         step2[21] = SUB_EPI16(step2[21], s3_21_0);
536         step2[22] = SUB_EPI16(step2[22], s3_22_0);
537         step2[23] = SUB_EPI16(step2[23], s3_23_0);
538         step2[24] = SUB_EPI16(step2[24], s3_24_0);
539         step2[25] = SUB_EPI16(step2[25], s3_25_0);
540         step2[26] = SUB_EPI16(step2[26], s3_26_0);
541         step2[27] = SUB_EPI16(step2[27], s3_27_0);
542         step1[28] = SUB_EPI16(step1[28], s3_28_0);
543         step1[29] = SUB_EPI16(step1[29], s3_29_0);
544         step1[30] = SUB_EPI16(step1[30], s3_30_0);
545         step1[31] = SUB_EPI16(step1[31], s3_31_0);
546 #if DCT_HIGH_BIT_DEPTH
547         overflow = check_epi16_overflow_x32(
548             &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
549             &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
550             &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
551             &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
552             &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
553             &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
554         if (overflow) {
555           HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
556           return;
557         }
558 #endif  // DCT_HIGH_BIT_DEPTH
559         step2[0] = _mm_add_epi16(step2[0], kOne);
560         step2[1] = _mm_add_epi16(step2[1], kOne);
561         step2[2] = _mm_add_epi16(step2[2], kOne);
562         step2[3] = _mm_add_epi16(step2[3], kOne);
563         step2[4] = _mm_add_epi16(step2[4], kOne);
564         step2[5] = _mm_add_epi16(step2[5], kOne);
565         step2[6] = _mm_add_epi16(step2[6], kOne);
566         step2[7] = _mm_add_epi16(step2[7], kOne);
567         step2[8] = _mm_add_epi16(step2[8], kOne);
568         step2[9] = _mm_add_epi16(step2[9], kOne);
569         step2[10] = _mm_add_epi16(step2[10], kOne);
570         step2[11] = _mm_add_epi16(step2[11], kOne);
571         step2[12] = _mm_add_epi16(step2[12], kOne);
572         step2[13] = _mm_add_epi16(step2[13], kOne);
573         step2[14] = _mm_add_epi16(step2[14], kOne);
574         step2[15] = _mm_add_epi16(step2[15], kOne);
575         step1[16] = _mm_add_epi16(step1[16], kOne);
576         step1[17] = _mm_add_epi16(step1[17], kOne);
577         step1[18] = _mm_add_epi16(step1[18], kOne);
578         step1[19] = _mm_add_epi16(step1[19], kOne);
579         step2[20] = _mm_add_epi16(step2[20], kOne);
580         step2[21] = _mm_add_epi16(step2[21], kOne);
581         step2[22] = _mm_add_epi16(step2[22], kOne);
582         step2[23] = _mm_add_epi16(step2[23], kOne);
583         step2[24] = _mm_add_epi16(step2[24], kOne);
584         step2[25] = _mm_add_epi16(step2[25], kOne);
585         step2[26] = _mm_add_epi16(step2[26], kOne);
586         step2[27] = _mm_add_epi16(step2[27], kOne);
587         step1[28] = _mm_add_epi16(step1[28], kOne);
588         step1[29] = _mm_add_epi16(step1[29], kOne);
589         step1[30] = _mm_add_epi16(step1[30], kOne);
590         step1[31] = _mm_add_epi16(step1[31], kOne);
591 
592         step2[0] = _mm_srai_epi16(step2[0], 2);
593         step2[1] = _mm_srai_epi16(step2[1], 2);
594         step2[2] = _mm_srai_epi16(step2[2], 2);
595         step2[3] = _mm_srai_epi16(step2[3], 2);
596         step2[4] = _mm_srai_epi16(step2[4], 2);
597         step2[5] = _mm_srai_epi16(step2[5], 2);
598         step2[6] = _mm_srai_epi16(step2[6], 2);
599         step2[7] = _mm_srai_epi16(step2[7], 2);
600         step2[8] = _mm_srai_epi16(step2[8], 2);
601         step2[9] = _mm_srai_epi16(step2[9], 2);
602         step2[10] = _mm_srai_epi16(step2[10], 2);
603         step2[11] = _mm_srai_epi16(step2[11], 2);
604         step2[12] = _mm_srai_epi16(step2[12], 2);
605         step2[13] = _mm_srai_epi16(step2[13], 2);
606         step2[14] = _mm_srai_epi16(step2[14], 2);
607         step2[15] = _mm_srai_epi16(step2[15], 2);
608         step1[16] = _mm_srai_epi16(step1[16], 2);
609         step1[17] = _mm_srai_epi16(step1[17], 2);
610         step1[18] = _mm_srai_epi16(step1[18], 2);
611         step1[19] = _mm_srai_epi16(step1[19], 2);
612         step2[20] = _mm_srai_epi16(step2[20], 2);
613         step2[21] = _mm_srai_epi16(step2[21], 2);
614         step2[22] = _mm_srai_epi16(step2[22], 2);
615         step2[23] = _mm_srai_epi16(step2[23], 2);
616         step2[24] = _mm_srai_epi16(step2[24], 2);
617         step2[25] = _mm_srai_epi16(step2[25], 2);
618         step2[26] = _mm_srai_epi16(step2[26], 2);
619         step2[27] = _mm_srai_epi16(step2[27], 2);
620         step1[28] = _mm_srai_epi16(step1[28], 2);
621         step1[29] = _mm_srai_epi16(step1[29], 2);
622         step1[30] = _mm_srai_epi16(step1[30], 2);
623         step1[31] = _mm_srai_epi16(step1[31], 2);
624       }
625 #endif  // !FDCT32x32_HIGH_PRECISION
626 
627 #if FDCT32x32_HIGH_PRECISION
628       if (pass == 0) {
629 #endif
630         // Stage 3
631         {
632           step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
633           step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
634           step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
635           step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
636           step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
637           step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
638           step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
639           step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
640 #if DCT_HIGH_BIT_DEPTH
641           overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
642                                              &step3[3], &step3[4], &step3[5],
643                                              &step3[6], &step3[7]);
644           if (overflow) {
645             if (pass == 0)
646               HIGH_FDCT32x32_2D_C(input, output_org, stride);
647             else
648               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
649             return;
650           }
651 #endif  // DCT_HIGH_BIT_DEPTH
652         }
653         {
654           const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
655           const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
656           const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
657           const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
658           const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
659           const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
660           const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
661           const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
662           const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
663           const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
664           const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
665           const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
666           // dct_const_round_shift
667           const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
668           const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
669           const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
670           const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
671           const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
672           const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
673           const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
674           const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
675           const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
676           const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
677           const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
678           const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
679           const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
680           const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
681           const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
682           const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
683           // Combine
684           step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
685           step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
686           step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
687           step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
688 #if DCT_HIGH_BIT_DEPTH
689           overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
690                                              &step3[13]);
691           if (overflow) {
692             if (pass == 0)
693               HIGH_FDCT32x32_2D_C(input, output_org, stride);
694             else
695               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
696             return;
697           }
698 #endif  // DCT_HIGH_BIT_DEPTH
699         }
700         {
701           step3[16] = ADD_EPI16(step2[23], step1[16]);
702           step3[17] = ADD_EPI16(step2[22], step1[17]);
703           step3[18] = ADD_EPI16(step2[21], step1[18]);
704           step3[19] = ADD_EPI16(step2[20], step1[19]);
705           step3[20] = SUB_EPI16(step1[19], step2[20]);
706           step3[21] = SUB_EPI16(step1[18], step2[21]);
707           step3[22] = SUB_EPI16(step1[17], step2[22]);
708           step3[23] = SUB_EPI16(step1[16], step2[23]);
709           step3[24] = SUB_EPI16(step1[31], step2[24]);
710           step3[25] = SUB_EPI16(step1[30], step2[25]);
711           step3[26] = SUB_EPI16(step1[29], step2[26]);
712           step3[27] = SUB_EPI16(step1[28], step2[27]);
713           step3[28] = ADD_EPI16(step2[27], step1[28]);
714           step3[29] = ADD_EPI16(step2[26], step1[29]);
715           step3[30] = ADD_EPI16(step2[25], step1[30]);
716           step3[31] = ADD_EPI16(step2[24], step1[31]);
717 #if DCT_HIGH_BIT_DEPTH
718           overflow = check_epi16_overflow_x16(
719               &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
720               &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
721               &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
722               &step3[31]);
723           if (overflow) {
724             if (pass == 0)
725               HIGH_FDCT32x32_2D_C(input, output_org, stride);
726             else
727               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
728             return;
729           }
730 #endif  // DCT_HIGH_BIT_DEPTH
731         }
732 
733         // Stage 4
734         {
735           step1[0] = ADD_EPI16(step3[3], step3[0]);
736           step1[1] = ADD_EPI16(step3[2], step3[1]);
737           step1[2] = SUB_EPI16(step3[1], step3[2]);
738           step1[3] = SUB_EPI16(step3[0], step3[3]);
739           step1[8] = ADD_EPI16(step3[11], step2[8]);
740           step1[9] = ADD_EPI16(step3[10], step2[9]);
741           step1[10] = SUB_EPI16(step2[9], step3[10]);
742           step1[11] = SUB_EPI16(step2[8], step3[11]);
743           step1[12] = SUB_EPI16(step2[15], step3[12]);
744           step1[13] = SUB_EPI16(step2[14], step3[13]);
745           step1[14] = ADD_EPI16(step3[13], step2[14]);
746           step1[15] = ADD_EPI16(step3[12], step2[15]);
747 #if DCT_HIGH_BIT_DEPTH
748           overflow = check_epi16_overflow_x16(
749               &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
750               &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
751               &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
752           if (overflow) {
753             if (pass == 0)
754               HIGH_FDCT32x32_2D_C(input, output_org, stride);
755             else
756               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
757             return;
758           }
759 #endif  // DCT_HIGH_BIT_DEPTH
760         }
761         {
762           const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
763           const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
764           const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
765           const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
766           const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
767           const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
768           // dct_const_round_shift
769           const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
770           const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
771           const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
772           const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
773           const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
774           const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
775           const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
776           const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
777           // Combine
778           step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
779           step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
780 #if DCT_HIGH_BIT_DEPTH
781           overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
782           if (overflow) {
783             if (pass == 0)
784               HIGH_FDCT32x32_2D_C(input, output_org, stride);
785             else
786               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
787             return;
788           }
789 #endif  // DCT_HIGH_BIT_DEPTH
790         }
791         {
792           const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
793           const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
794           const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
795           const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
796           const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
797           const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
798           const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
799           const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
800           const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
801           const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
802           const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
803           const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
804           const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
805           const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
806           const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
807           const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
808           const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
809           const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
810           const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
811           const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
812           const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
813           const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
814           const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
815           const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
816           // dct_const_round_shift
817           const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
818           const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
819           const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
820           const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
821           const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
822           const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
823           const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
824           const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
825           const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
826           const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
827           const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
828           const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
829           const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
830           const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
831           const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
832           const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
833           const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
834           const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
835           const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
836           const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
837           const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
838           const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
839           const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
840           const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
841           const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
842           const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
843           const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
844           const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
845           const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
846           const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
847           const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
848           const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
849           // Combine
850           step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
851           step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
852           step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
853           step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
854           step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
855           step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
856           step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
857           step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
858 #if DCT_HIGH_BIT_DEPTH
859           overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
860                                              &step1[21], &step1[26], &step1[27],
861                                              &step1[28], &step1[29]);
862           if (overflow) {
863             if (pass == 0)
864               HIGH_FDCT32x32_2D_C(input, output_org, stride);
865             else
866               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
867             return;
868           }
869 #endif  // DCT_HIGH_BIT_DEPTH
870         }
871         // Stage 5
872         {
873           step2[4] = ADD_EPI16(step1[5], step3[4]);
874           step2[5] = SUB_EPI16(step3[4], step1[5]);
875           step2[6] = SUB_EPI16(step3[7], step1[6]);
876           step2[7] = ADD_EPI16(step1[6], step3[7]);
877 #if DCT_HIGH_BIT_DEPTH
878           overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
879                                              &step2[7]);
880           if (overflow) {
881             if (pass == 0)
882               HIGH_FDCT32x32_2D_C(input, output_org, stride);
883             else
884               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
885             return;
886           }
887 #endif  // DCT_HIGH_BIT_DEPTH
888         }
889         {
890           const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
891           const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
892           const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
893           const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
894           const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
895           const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
896           const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
897           const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
898           const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
899           const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
900           const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
901           const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
902           // dct_const_round_shift
903           const __m128i out_00_4 =
904               _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
905           const __m128i out_00_5 =
906               _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
907           const __m128i out_16_4 =
908               _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
909           const __m128i out_16_5 =
910               _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
911           const __m128i out_08_4 =
912               _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
913           const __m128i out_08_5 =
914               _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
915           const __m128i out_24_4 =
916               _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
917           const __m128i out_24_5 =
918               _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
919           const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
920           const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
921           const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
922           const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
923           const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
924           const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
925           const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
926           const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
927           // Combine
928           out[0] = _mm_packs_epi32(out_00_6, out_00_7);
929           out[16] = _mm_packs_epi32(out_16_6, out_16_7);
930           out[8] = _mm_packs_epi32(out_08_6, out_08_7);
931           out[24] = _mm_packs_epi32(out_24_6, out_24_7);
932 #if DCT_HIGH_BIT_DEPTH
933           overflow =
934               check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
935           if (overflow) {
936             if (pass == 0)
937               HIGH_FDCT32x32_2D_C(input, output_org, stride);
938             else
939               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
940             return;
941           }
942 #endif  // DCT_HIGH_BIT_DEPTH
943         }
944         {
945           const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
946           const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
947           const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
948           const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
949           const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
950           const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
951           const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
952           const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
953           const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
954           const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
955           const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
956           const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
957           // dct_const_round_shift
958           const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
959           const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
960           const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
961           const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
962           const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
963           const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
964           const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
965           const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
966           const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
967           const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
968           const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
969           const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
970           const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
971           const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
972           const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
973           const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
974           // Combine
975           step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
976           step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
977           step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
978           step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
979 #if DCT_HIGH_BIT_DEPTH
980           overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
981                                              &step2[14]);
982           if (overflow) {
983             if (pass == 0)
984               HIGH_FDCT32x32_2D_C(input, output_org, stride);
985             else
986               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
987             return;
988           }
989 #endif  // DCT_HIGH_BIT_DEPTH
990         }
991         {
992           step2[16] = ADD_EPI16(step1[19], step3[16]);
993           step2[17] = ADD_EPI16(step1[18], step3[17]);
994           step2[18] = SUB_EPI16(step3[17], step1[18]);
995           step2[19] = SUB_EPI16(step3[16], step1[19]);
996           step2[20] = SUB_EPI16(step3[23], step1[20]);
997           step2[21] = SUB_EPI16(step3[22], step1[21]);
998           step2[22] = ADD_EPI16(step1[21], step3[22]);
999           step2[23] = ADD_EPI16(step1[20], step3[23]);
1000           step2[24] = ADD_EPI16(step1[27], step3[24]);
1001           step2[25] = ADD_EPI16(step1[26], step3[25]);
1002           step2[26] = SUB_EPI16(step3[25], step1[26]);
1003           step2[27] = SUB_EPI16(step3[24], step1[27]);
1004           step2[28] = SUB_EPI16(step3[31], step1[28]);
1005           step2[29] = SUB_EPI16(step3[30], step1[29]);
1006           step2[30] = ADD_EPI16(step1[29], step3[30]);
1007           step2[31] = ADD_EPI16(step1[28], step3[31]);
1008 #if DCT_HIGH_BIT_DEPTH
1009           overflow = check_epi16_overflow_x16(
1010               &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
1011               &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
1012               &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
1013               &step2[31]);
1014           if (overflow) {
1015             if (pass == 0)
1016               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1017             else
1018               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1019             return;
1020           }
1021 #endif  // DCT_HIGH_BIT_DEPTH
1022         }
1023         // Stage 6
1024         {
1025           const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
1026           const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
1027           const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
1028           const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
1029           const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
1030           const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
1031           const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
1032           const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
1033           const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
1034           const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
1035           const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
1036           const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
1037           const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
1038           const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
1039           const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
1040           const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
1041           // dct_const_round_shift
1042           const __m128i out_04_4 =
1043               _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
1044           const __m128i out_04_5 =
1045               _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
1046           const __m128i out_20_4 =
1047               _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
1048           const __m128i out_20_5 =
1049               _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
1050           const __m128i out_12_4 =
1051               _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
1052           const __m128i out_12_5 =
1053               _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
1054           const __m128i out_28_4 =
1055               _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
1056           const __m128i out_28_5 =
1057               _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
1058           const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
1059           const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
1060           const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
1061           const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
1062           const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
1063           const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
1064           const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
1065           const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
1066           // Combine
1067           out[4] = _mm_packs_epi32(out_04_6, out_04_7);
1068           out[20] = _mm_packs_epi32(out_20_6, out_20_7);
1069           out[12] = _mm_packs_epi32(out_12_6, out_12_7);
1070           out[28] = _mm_packs_epi32(out_28_6, out_28_7);
1071 #if DCT_HIGH_BIT_DEPTH
1072           overflow =
1073               check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
1074           if (overflow) {
1075             if (pass == 0)
1076               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1077             else
1078               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1079             return;
1080           }
1081 #endif  // DCT_HIGH_BIT_DEPTH
1082         }
1083         {
1084           step3[8] = ADD_EPI16(step2[9], step1[8]);
1085           step3[9] = SUB_EPI16(step1[8], step2[9]);
1086           step3[10] = SUB_EPI16(step1[11], step2[10]);
1087           step3[11] = ADD_EPI16(step2[10], step1[11]);
1088           step3[12] = ADD_EPI16(step2[13], step1[12]);
1089           step3[13] = SUB_EPI16(step1[12], step2[13]);
1090           step3[14] = SUB_EPI16(step1[15], step2[14]);
1091           step3[15] = ADD_EPI16(step2[14], step1[15]);
1092 #if DCT_HIGH_BIT_DEPTH
1093           overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
1094                                              &step3[11], &step3[12], &step3[13],
1095                                              &step3[14], &step3[15]);
1096           if (overflow) {
1097             if (pass == 0)
1098               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1099             else
1100               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1101             return;
1102           }
1103 #endif  // DCT_HIGH_BIT_DEPTH
1104         }
1105         {
1106           const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
1107           const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
1108           const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
1109           const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
1110           const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
1111           const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
1112           const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
1113           const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
1114           const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
1115           const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
1116           const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
1117           const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
1118           const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
1119           const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
1120           const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
1121           const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
1122           const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
1123           const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
1124           const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
1125           const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
1126           const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
1127           const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
1128           const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
1129           const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
1130           // dct_const_round_shift
1131           const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
1132           const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
1133           const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
1134           const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
1135           const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
1136           const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
1137           const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
1138           const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
1139           const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
1140           const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
1141           const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
1142           const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
1143           const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
1144           const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
1145           const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
1146           const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
1147           const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
1148           const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
1149           const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
1150           const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
1151           const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
1152           const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
1153           const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
1154           const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
1155           const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
1156           const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
1157           const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
1158           const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
1159           const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
1160           const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
1161           const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
1162           const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
1163           // Combine
1164           step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
1165           step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
1166           step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
1167           step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
1168           // Combine
1169           step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
1170           step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
1171           step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
1172           step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
1173 #if DCT_HIGH_BIT_DEPTH
1174           overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
1175                                              &step3[22], &step3[25], &step3[26],
1176                                              &step3[29], &step3[30]);
1177           if (overflow) {
1178             if (pass == 0)
1179               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1180             else
1181               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1182             return;
1183           }
1184 #endif  // DCT_HIGH_BIT_DEPTH
1185         }
1186         // Stage 7
1187         {
1188           const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
1189           const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
1190           const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
1191           const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
1192           const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
1193           const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
1194           const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
1195           const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
1196           const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
1197           const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
1198           const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
1199           const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
1200           const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
1201           const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
1202           const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
1203           const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
1204           const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
1205           const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
1206           const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
1207           const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
1208           const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
1209           const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
1210           const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
1211           const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
1212           // dct_const_round_shift
1213           const __m128i out_02_4 =
1214               _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
1215           const __m128i out_02_5 =
1216               _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
1217           const __m128i out_18_4 =
1218               _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
1219           const __m128i out_18_5 =
1220               _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
1221           const __m128i out_10_4 =
1222               _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
1223           const __m128i out_10_5 =
1224               _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
1225           const __m128i out_26_4 =
1226               _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
1227           const __m128i out_26_5 =
1228               _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
1229           const __m128i out_06_4 =
1230               _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
1231           const __m128i out_06_5 =
1232               _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
1233           const __m128i out_22_4 =
1234               _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
1235           const __m128i out_22_5 =
1236               _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
1237           const __m128i out_14_4 =
1238               _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
1239           const __m128i out_14_5 =
1240               _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
1241           const __m128i out_30_4 =
1242               _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
1243           const __m128i out_30_5 =
1244               _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
1245           const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
1246           const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
1247           const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
1248           const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
1249           const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
1250           const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
1251           const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
1252           const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
1253           const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
1254           const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
1255           const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
1256           const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
1257           const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
1258           const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
1259           const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
1260           const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
1261           // Combine
1262           out[2] = _mm_packs_epi32(out_02_6, out_02_7);
1263           out[18] = _mm_packs_epi32(out_18_6, out_18_7);
1264           out[10] = _mm_packs_epi32(out_10_6, out_10_7);
1265           out[26] = _mm_packs_epi32(out_26_6, out_26_7);
1266           out[6] = _mm_packs_epi32(out_06_6, out_06_7);
1267           out[22] = _mm_packs_epi32(out_22_6, out_22_7);
1268           out[14] = _mm_packs_epi32(out_14_6, out_14_7);
1269           out[30] = _mm_packs_epi32(out_30_6, out_30_7);
1270 #if DCT_HIGH_BIT_DEPTH
1271           overflow =
1272               check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
1273                                       &out[6], &out[22], &out[14], &out[30]);
1274           if (overflow) {
1275             if (pass == 0)
1276               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1277             else
1278               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1279             return;
1280           }
1281 #endif  // DCT_HIGH_BIT_DEPTH
1282         }
1283         {
1284           step1[16] = ADD_EPI16(step3[17], step2[16]);
1285           step1[17] = SUB_EPI16(step2[16], step3[17]);
1286           step1[18] = SUB_EPI16(step2[19], step3[18]);
1287           step1[19] = ADD_EPI16(step3[18], step2[19]);
1288           step1[20] = ADD_EPI16(step3[21], step2[20]);
1289           step1[21] = SUB_EPI16(step2[20], step3[21]);
1290           step1[22] = SUB_EPI16(step2[23], step3[22]);
1291           step1[23] = ADD_EPI16(step3[22], step2[23]);
1292           step1[24] = ADD_EPI16(step3[25], step2[24]);
1293           step1[25] = SUB_EPI16(step2[24], step3[25]);
1294           step1[26] = SUB_EPI16(step2[27], step3[26]);
1295           step1[27] = ADD_EPI16(step3[26], step2[27]);
1296           step1[28] = ADD_EPI16(step3[29], step2[28]);
1297           step1[29] = SUB_EPI16(step2[28], step3[29]);
1298           step1[30] = SUB_EPI16(step2[31], step3[30]);
1299           step1[31] = ADD_EPI16(step3[30], step2[31]);
1300 #if DCT_HIGH_BIT_DEPTH
1301           overflow = check_epi16_overflow_x16(
1302               &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
1303               &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
1304               &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
1305               &step1[31]);
1306           if (overflow) {
1307             if (pass == 0)
1308               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1309             else
1310               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1311             return;
1312           }
1313 #endif  // DCT_HIGH_BIT_DEPTH
1314         }
1315         // Final stage --- output indices are bit-reversed.
1316         {
1317           const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
1318           const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
1319           const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
1320           const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
1321           const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
1322           const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
1323           const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
1324           const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
1325           const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
1326           const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
1327           const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
1328           const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
1329           const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
1330           const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
1331           const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
1332           const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
1333           const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
1334           const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
1335           const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
1336           const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
1337           const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
1338           const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
1339           const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
1340           const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
1341           // dct_const_round_shift
1342           const __m128i out_01_4 =
1343               _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
1344           const __m128i out_01_5 =
1345               _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
1346           const __m128i out_17_4 =
1347               _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
1348           const __m128i out_17_5 =
1349               _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
1350           const __m128i out_09_4 =
1351               _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
1352           const __m128i out_09_5 =
1353               _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
1354           const __m128i out_25_4 =
1355               _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
1356           const __m128i out_25_5 =
1357               _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
1358           const __m128i out_07_4 =
1359               _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
1360           const __m128i out_07_5 =
1361               _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
1362           const __m128i out_23_4 =
1363               _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
1364           const __m128i out_23_5 =
1365               _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
1366           const __m128i out_15_4 =
1367               _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
1368           const __m128i out_15_5 =
1369               _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
1370           const __m128i out_31_4 =
1371               _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
1372           const __m128i out_31_5 =
1373               _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
1374           const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
1375           const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
1376           const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
1377           const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
1378           const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
1379           const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
1380           const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
1381           const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
1382           const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
1383           const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
1384           const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
1385           const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
1386           const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
1387           const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
1388           const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
1389           const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
1390           // Combine
1391           out[1] = _mm_packs_epi32(out_01_6, out_01_7);
1392           out[17] = _mm_packs_epi32(out_17_6, out_17_7);
1393           out[9] = _mm_packs_epi32(out_09_6, out_09_7);
1394           out[25] = _mm_packs_epi32(out_25_6, out_25_7);
1395           out[7] = _mm_packs_epi32(out_07_6, out_07_7);
1396           out[23] = _mm_packs_epi32(out_23_6, out_23_7);
1397           out[15] = _mm_packs_epi32(out_15_6, out_15_7);
1398           out[31] = _mm_packs_epi32(out_31_6, out_31_7);
1399 #if DCT_HIGH_BIT_DEPTH
1400           overflow =
1401               check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
1402                                       &out[7], &out[23], &out[15], &out[31]);
1403           if (overflow) {
1404             if (pass == 0)
1405               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1406             else
1407               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1408             return;
1409           }
1410 #endif  // DCT_HIGH_BIT_DEPTH
1411         }
1412         {
1413           const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
1414           const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
1415           const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
1416           const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
1417           const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
1418           const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
1419           const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
1420           const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
1421           const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
1422           const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
1423           const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
1424           const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
1425           const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
1426           const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
1427           const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
1428           const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
1429           const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
1430           const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
1431           const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
1432           const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
1433           const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
1434           const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
1435           const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
1436           const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
1437           // dct_const_round_shift
1438           const __m128i out_05_4 =
1439               _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
1440           const __m128i out_05_5 =
1441               _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
1442           const __m128i out_21_4 =
1443               _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
1444           const __m128i out_21_5 =
1445               _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
1446           const __m128i out_13_4 =
1447               _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
1448           const __m128i out_13_5 =
1449               _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
1450           const __m128i out_29_4 =
1451               _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
1452           const __m128i out_29_5 =
1453               _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
1454           const __m128i out_03_4 =
1455               _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
1456           const __m128i out_03_5 =
1457               _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
1458           const __m128i out_19_4 =
1459               _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
1460           const __m128i out_19_5 =
1461               _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
1462           const __m128i out_11_4 =
1463               _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
1464           const __m128i out_11_5 =
1465               _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
1466           const __m128i out_27_4 =
1467               _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
1468           const __m128i out_27_5 =
1469               _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
1470           const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
1471           const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
1472           const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
1473           const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
1474           const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
1475           const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
1476           const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
1477           const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
1478           const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
1479           const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
1480           const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
1481           const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
1482           const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
1483           const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
1484           const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
1485           const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
1486           // Combine
1487           out[5] = _mm_packs_epi32(out_05_6, out_05_7);
1488           out[21] = _mm_packs_epi32(out_21_6, out_21_7);
1489           out[13] = _mm_packs_epi32(out_13_6, out_13_7);
1490           out[29] = _mm_packs_epi32(out_29_6, out_29_7);
1491           out[3] = _mm_packs_epi32(out_03_6, out_03_7);
1492           out[19] = _mm_packs_epi32(out_19_6, out_19_7);
1493           out[11] = _mm_packs_epi32(out_11_6, out_11_7);
1494           out[27] = _mm_packs_epi32(out_27_6, out_27_7);
1495 #if DCT_HIGH_BIT_DEPTH
1496           overflow =
1497               check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
1498                                       &out[3], &out[19], &out[11], &out[27]);
1499           if (overflow) {
1500             if (pass == 0)
1501               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1502             else
1503               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1504             return;
1505           }
1506 #endif  // DCT_HIGH_BIT_DEPTH
1507         }
1508 #if FDCT32x32_HIGH_PRECISION
1509       } else {
1510         __m128i lstep1[64], lstep2[64], lstep3[64];
1511         __m128i u[32], v[32], sign[16];
1512         const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
1513         const __m128i k__pOne_mOne = pair_set_epi16(1, -1);
1514         // start using 32-bit operations
1515         // stage 3
1516         {
1517           // expanding to 32-bit length while adding and subtracting
1518           lstep2[0] = _mm_unpacklo_epi16(step2[0], step2[7]);
1519           lstep2[1] = _mm_unpackhi_epi16(step2[0], step2[7]);
1520           lstep2[2] = _mm_unpacklo_epi16(step2[1], step2[6]);
1521           lstep2[3] = _mm_unpackhi_epi16(step2[1], step2[6]);
1522           lstep2[4] = _mm_unpacklo_epi16(step2[2], step2[5]);
1523           lstep2[5] = _mm_unpackhi_epi16(step2[2], step2[5]);
1524           lstep2[6] = _mm_unpacklo_epi16(step2[3], step2[4]);
1525           lstep2[7] = _mm_unpackhi_epi16(step2[3], step2[4]);
1526 
1527           lstep3[0] = _mm_madd_epi16(lstep2[0], kOne);
1528           lstep3[1] = _mm_madd_epi16(lstep2[1], kOne);
1529           lstep3[2] = _mm_madd_epi16(lstep2[2], kOne);
1530           lstep3[3] = _mm_madd_epi16(lstep2[3], kOne);
1531           lstep3[4] = _mm_madd_epi16(lstep2[4], kOne);
1532           lstep3[5] = _mm_madd_epi16(lstep2[5], kOne);
1533           lstep3[6] = _mm_madd_epi16(lstep2[6], kOne);
1534           lstep3[7] = _mm_madd_epi16(lstep2[7], kOne);
1535 
1536           lstep3[8] = _mm_madd_epi16(lstep2[6], k__pOne_mOne);
1537           lstep3[9] = _mm_madd_epi16(lstep2[7], k__pOne_mOne);
1538           lstep3[10] = _mm_madd_epi16(lstep2[4], k__pOne_mOne);
1539           lstep3[11] = _mm_madd_epi16(lstep2[5], k__pOne_mOne);
1540           lstep3[12] = _mm_madd_epi16(lstep2[2], k__pOne_mOne);
1541           lstep3[13] = _mm_madd_epi16(lstep2[3], k__pOne_mOne);
1542           lstep3[14] = _mm_madd_epi16(lstep2[0], k__pOne_mOne);
1543           lstep3[15] = _mm_madd_epi16(lstep2[1], k__pOne_mOne);
1544         }
1545         {
1546           const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
1547           const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
1548           const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
1549           const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
1550           const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
1551           const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
1552           const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
1553           const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
1554           const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
1555           const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
1556           const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
1557           const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
1558           // dct_const_round_shift
1559           const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
1560           const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
1561           const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
1562           const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
1563           const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
1564           const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
1565           const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
1566           const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
1567           lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
1568           lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
1569           lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
1570           lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
1571           lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
1572           lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
1573           lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
1574           lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
1575         }
1576         {
1577           lstep1[32] = _mm_unpacklo_epi16(step1[16], step2[23]);
1578           lstep1[33] = _mm_unpackhi_epi16(step1[16], step2[23]);
1579           lstep1[34] = _mm_unpacklo_epi16(step1[17], step2[22]);
1580           lstep1[35] = _mm_unpackhi_epi16(step1[17], step2[22]);
1581           lstep1[36] = _mm_unpacklo_epi16(step1[18], step2[21]);
1582           lstep1[37] = _mm_unpackhi_epi16(step1[18], step2[21]);
1583           lstep1[38] = _mm_unpacklo_epi16(step1[19], step2[20]);
1584           lstep1[39] = _mm_unpackhi_epi16(step1[19], step2[20]);
1585 
1586           lstep1[56] = _mm_unpacklo_epi16(step1[28], step2[27]);
1587           lstep1[57] = _mm_unpackhi_epi16(step1[28], step2[27]);
1588           lstep1[58] = _mm_unpacklo_epi16(step1[29], step2[26]);
1589           lstep1[59] = _mm_unpackhi_epi16(step1[29], step2[26]);
1590           lstep1[60] = _mm_unpacklo_epi16(step1[30], step2[25]);
1591           lstep1[61] = _mm_unpackhi_epi16(step1[30], step2[25]);
1592           lstep1[62] = _mm_unpacklo_epi16(step1[31], step2[24]);
1593           lstep1[63] = _mm_unpackhi_epi16(step1[31], step2[24]);
1594 
1595           lstep3[32] = _mm_madd_epi16(lstep1[32], kOne);
1596           lstep3[33] = _mm_madd_epi16(lstep1[33], kOne);
1597           lstep3[34] = _mm_madd_epi16(lstep1[34], kOne);
1598           lstep3[35] = _mm_madd_epi16(lstep1[35], kOne);
1599           lstep3[36] = _mm_madd_epi16(lstep1[36], kOne);
1600           lstep3[37] = _mm_madd_epi16(lstep1[37], kOne);
1601           lstep3[38] = _mm_madd_epi16(lstep1[38], kOne);
1602           lstep3[39] = _mm_madd_epi16(lstep1[39], kOne);
1603 
1604           lstep3[40] = _mm_madd_epi16(lstep1[38], k__pOne_mOne);
1605           lstep3[41] = _mm_madd_epi16(lstep1[39], k__pOne_mOne);
1606           lstep3[42] = _mm_madd_epi16(lstep1[36], k__pOne_mOne);
1607           lstep3[43] = _mm_madd_epi16(lstep1[37], k__pOne_mOne);
1608           lstep3[44] = _mm_madd_epi16(lstep1[34], k__pOne_mOne);
1609           lstep3[45] = _mm_madd_epi16(lstep1[35], k__pOne_mOne);
1610           lstep3[46] = _mm_madd_epi16(lstep1[32], k__pOne_mOne);
1611           lstep3[47] = _mm_madd_epi16(lstep1[33], k__pOne_mOne);
1612 
1613           lstep3[48] = _mm_madd_epi16(lstep1[62], k__pOne_mOne);
1614           lstep3[49] = _mm_madd_epi16(lstep1[63], k__pOne_mOne);
1615           lstep3[50] = _mm_madd_epi16(lstep1[60], k__pOne_mOne);
1616           lstep3[51] = _mm_madd_epi16(lstep1[61], k__pOne_mOne);
1617           lstep3[52] = _mm_madd_epi16(lstep1[58], k__pOne_mOne);
1618           lstep3[53] = _mm_madd_epi16(lstep1[59], k__pOne_mOne);
1619           lstep3[54] = _mm_madd_epi16(lstep1[56], k__pOne_mOne);
1620           lstep3[55] = _mm_madd_epi16(lstep1[57], k__pOne_mOne);
1621 
1622           lstep3[56] = _mm_madd_epi16(lstep1[56], kOne);
1623           lstep3[57] = _mm_madd_epi16(lstep1[57], kOne);
1624           lstep3[58] = _mm_madd_epi16(lstep1[58], kOne);
1625           lstep3[59] = _mm_madd_epi16(lstep1[59], kOne);
1626           lstep3[60] = _mm_madd_epi16(lstep1[60], kOne);
1627           lstep3[61] = _mm_madd_epi16(lstep1[61], kOne);
1628           lstep3[62] = _mm_madd_epi16(lstep1[62], kOne);
1629           lstep3[63] = _mm_madd_epi16(lstep1[63], kOne);
1630         }
1631 
1632         // stage 4
1633         {
1634           // expanding to 32-bit length prior to addition operations
1635           sign[0] = _mm_cmpgt_epi16(kZero, step2[8]);
1636           sign[1] = _mm_cmpgt_epi16(kZero, step2[9]);
1637           sign[2] = _mm_cmpgt_epi16(kZero, step2[14]);
1638           sign[3] = _mm_cmpgt_epi16(kZero, step2[15]);
1639           lstep2[16] = _mm_unpacklo_epi16(step2[8], sign[0]);
1640           lstep2[17] = _mm_unpackhi_epi16(step2[8], sign[0]);
1641           lstep2[18] = _mm_unpacklo_epi16(step2[9], sign[1]);
1642           lstep2[19] = _mm_unpackhi_epi16(step2[9], sign[1]);
1643           lstep2[28] = _mm_unpacklo_epi16(step2[14], sign[2]);
1644           lstep2[29] = _mm_unpackhi_epi16(step2[14], sign[2]);
1645           lstep2[30] = _mm_unpacklo_epi16(step2[15], sign[3]);
1646           lstep2[31] = _mm_unpackhi_epi16(step2[15], sign[3]);
1647 
1648           lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
1649           lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
1650           lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
1651           lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
1652           lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
1653           lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
1654           lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
1655           lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
1656           lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
1657           lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
1658           lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
1659           lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
1660           lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
1661           lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
1662           lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
1663           lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
1664           lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
1665           lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
1666           lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
1667           lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
1668           lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
1669           lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
1670           lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
1671           lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
1672         }
1673         {
1674           // stage 4 (continued): derive lstep1[10..13] from lstep3[10..13]
1675           // using the +/-cospi_16_64 constant pairs defined below.
1676           const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
1677           const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
1678 
1679           u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
1680           u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
1681           u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
1682           u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
1683 
1684           // TODO(jingning): manually inline k_madd_epi32 to further hide
1685           // instruction latency.
1686           v[0] = k_madd_epi32(u[0], k32_p16_m16);
1687           v[1] = k_madd_epi32(u[1], k32_p16_m16);
1688           v[2] = k_madd_epi32(u[2], k32_p16_m16);
1689           v[3] = k_madd_epi32(u[3], k32_p16_m16);
1690           v[4] = k_madd_epi32(u[0], k32_p16_p16);
1691           v[5] = k_madd_epi32(u[1], k32_p16_p16);
1692           v[6] = k_madd_epi32(u[2], k32_p16_p16);
1693           v[7] = k_madd_epi32(u[3], k32_p16_p16);
1694 #if DCT_HIGH_BIT_DEPTH
1695           overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
1696                                               &v[5], &v[6], &v[7], &kZero);
1697           if (overflow) {
1698             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1699             return;
1700           }
1701 #endif  // DCT_HIGH_BIT_DEPTH
1702           u[0] = k_packs_epi64(v[0], v[1]);
1703           u[1] = k_packs_epi64(v[2], v[3]);
1704           u[2] = k_packs_epi64(v[4], v[5]);
1705           u[3] = k_packs_epi64(v[6], v[7]);
1706 
1707           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1708           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1709           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1710           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1711 
1712           lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1713           lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1714           lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1715           lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1716         }
1717         {
1718           const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
1719           const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
1720           const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
1721 
1722           u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
1723           u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
1724           u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
1725           u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
1726           u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
1727           u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
1728           u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
1729           u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
1730           u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
1731           u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
1732           u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
1733           u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
1734           u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
1735           u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
1736           u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
1737           u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
1738 
1739           v[0] = k_madd_epi32(u[0], k32_m08_p24);
1740           v[1] = k_madd_epi32(u[1], k32_m08_p24);
1741           v[2] = k_madd_epi32(u[2], k32_m08_p24);
1742           v[3] = k_madd_epi32(u[3], k32_m08_p24);
1743           v[4] = k_madd_epi32(u[4], k32_m08_p24);
1744           v[5] = k_madd_epi32(u[5], k32_m08_p24);
1745           v[6] = k_madd_epi32(u[6], k32_m08_p24);
1746           v[7] = k_madd_epi32(u[7], k32_m08_p24);
1747           v[8] = k_madd_epi32(u[8], k32_m24_m08);
1748           v[9] = k_madd_epi32(u[9], k32_m24_m08);
1749           v[10] = k_madd_epi32(u[10], k32_m24_m08);
1750           v[11] = k_madd_epi32(u[11], k32_m24_m08);
1751           v[12] = k_madd_epi32(u[12], k32_m24_m08);
1752           v[13] = k_madd_epi32(u[13], k32_m24_m08);
1753           v[14] = k_madd_epi32(u[14], k32_m24_m08);
1754           v[15] = k_madd_epi32(u[15], k32_m24_m08);
1755           v[16] = k_madd_epi32(u[12], k32_m08_p24);
1756           v[17] = k_madd_epi32(u[13], k32_m08_p24);
1757           v[18] = k_madd_epi32(u[14], k32_m08_p24);
1758           v[19] = k_madd_epi32(u[15], k32_m08_p24);
1759           v[20] = k_madd_epi32(u[8], k32_m08_p24);
1760           v[21] = k_madd_epi32(u[9], k32_m08_p24);
1761           v[22] = k_madd_epi32(u[10], k32_m08_p24);
1762           v[23] = k_madd_epi32(u[11], k32_m08_p24);
1763           v[24] = k_madd_epi32(u[4], k32_p24_p08);
1764           v[25] = k_madd_epi32(u[5], k32_p24_p08);
1765           v[26] = k_madd_epi32(u[6], k32_p24_p08);
1766           v[27] = k_madd_epi32(u[7], k32_p24_p08);
1767           v[28] = k_madd_epi32(u[0], k32_p24_p08);
1768           v[29] = k_madd_epi32(u[1], k32_p24_p08);
1769           v[30] = k_madd_epi32(u[2], k32_p24_p08);
1770           v[31] = k_madd_epi32(u[3], k32_p24_p08);
1771 
1772 #if DCT_HIGH_BIT_DEPTH
1773           overflow = k_check_epi32_overflow_32(
1774               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
1775               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
1776               &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
1777               &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
1778           if (overflow) {
1779             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1780             return;
1781           }
1782 #endif  // DCT_HIGH_BIT_DEPTH
1783           u[0] = k_packs_epi64(v[0], v[1]);
1784           u[1] = k_packs_epi64(v[2], v[3]);
1785           u[2] = k_packs_epi64(v[4], v[5]);
1786           u[3] = k_packs_epi64(v[6], v[7]);
1787           u[4] = k_packs_epi64(v[8], v[9]);
1788           u[5] = k_packs_epi64(v[10], v[11]);
1789           u[6] = k_packs_epi64(v[12], v[13]);
1790           u[7] = k_packs_epi64(v[14], v[15]);
1791           u[8] = k_packs_epi64(v[16], v[17]);
1792           u[9] = k_packs_epi64(v[18], v[19]);
1793           u[10] = k_packs_epi64(v[20], v[21]);
1794           u[11] = k_packs_epi64(v[22], v[23]);
1795           u[12] = k_packs_epi64(v[24], v[25]);
1796           u[13] = k_packs_epi64(v[26], v[27]);
1797           u[14] = k_packs_epi64(v[28], v[29]);
1798           u[15] = k_packs_epi64(v[30], v[31]);
1799 
1800           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1801           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1802           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1803           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1804           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1805           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1806           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1807           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1808           v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1809           v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1810           v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1811           v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1812           v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1813           v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1814           v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1815           v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1816 
1817           lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1818           lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1819           lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1820           lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1821           lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1822           lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1823           lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1824           lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1825           lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1826           lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1827           lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1828           lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1829           lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1830           lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1831           lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1832           lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1833         }
1834         // stage 5
1835         {
1836           lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
1837           lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
1838           lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
1839           lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
1840           lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
1841           lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
1842           lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
1843           lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
1844         }
1845         {
1846           const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
1847           const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
1848           const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
1849           const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
1850 
1851           u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
1852           u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
1853           u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
1854           u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
1855           u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
1856           u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
1857           u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
1858           u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
1859 
1860           // TODO(jingning): manually inline k_madd_epi32 to further hide
1861           // instruction latency.
1862           v[0] = k_madd_epi32(u[0], k32_p16_p16);
1863           v[1] = k_madd_epi32(u[1], k32_p16_p16);
1864           v[2] = k_madd_epi32(u[2], k32_p16_p16);
1865           v[3] = k_madd_epi32(u[3], k32_p16_p16);
1866           v[4] = k_madd_epi32(u[0], k32_p16_m16);
1867           v[5] = k_madd_epi32(u[1], k32_p16_m16);
1868           v[6] = k_madd_epi32(u[2], k32_p16_m16);
1869           v[7] = k_madd_epi32(u[3], k32_p16_m16);
1870           v[8] = k_madd_epi32(u[4], k32_p24_p08);
1871           v[9] = k_madd_epi32(u[5], k32_p24_p08);
1872           v[10] = k_madd_epi32(u[6], k32_p24_p08);
1873           v[11] = k_madd_epi32(u[7], k32_p24_p08);
1874           v[12] = k_madd_epi32(u[4], k32_m08_p24);
1875           v[13] = k_madd_epi32(u[5], k32_m08_p24);
1876           v[14] = k_madd_epi32(u[6], k32_m08_p24);
1877           v[15] = k_madd_epi32(u[7], k32_m08_p24);
1878 
1879 #if DCT_HIGH_BIT_DEPTH
1880           overflow = k_check_epi32_overflow_16(
1881               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
1882               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
1883           if (overflow) {
1884             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1885             return;
1886           }
1887 #endif  // DCT_HIGH_BIT_DEPTH
1888           u[0] = k_packs_epi64(v[0], v[1]);
1889           u[1] = k_packs_epi64(v[2], v[3]);
1890           u[2] = k_packs_epi64(v[4], v[5]);
1891           u[3] = k_packs_epi64(v[6], v[7]);
1892           u[4] = k_packs_epi64(v[8], v[9]);
1893           u[5] = k_packs_epi64(v[10], v[11]);
1894           u[6] = k_packs_epi64(v[12], v[13]);
1895           u[7] = k_packs_epi64(v[14], v[15]);
1896 
1897           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1898           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1899           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1900           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1901           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1902           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1903           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1904           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1905 
1906           u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1907           u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1908           u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1909           u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1910           u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1911           u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1912           u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1913           u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1914 
1915           sign[0] = _mm_cmplt_epi32(u[0], kZero);
1916           sign[1] = _mm_cmplt_epi32(u[1], kZero);
1917           sign[2] = _mm_cmplt_epi32(u[2], kZero);
1918           sign[3] = _mm_cmplt_epi32(u[3], kZero);
1919           sign[4] = _mm_cmplt_epi32(u[4], kZero);
1920           sign[5] = _mm_cmplt_epi32(u[5], kZero);
1921           sign[6] = _mm_cmplt_epi32(u[6], kZero);
1922           sign[7] = _mm_cmplt_epi32(u[7], kZero);
1923 
1924           u[0] = _mm_sub_epi32(u[0], sign[0]);
1925           u[1] = _mm_sub_epi32(u[1], sign[1]);
1926           u[2] = _mm_sub_epi32(u[2], sign[2]);
1927           u[3] = _mm_sub_epi32(u[3], sign[3]);
1928           u[4] = _mm_sub_epi32(u[4], sign[4]);
1929           u[5] = _mm_sub_epi32(u[5], sign[5]);
1930           u[6] = _mm_sub_epi32(u[6], sign[6]);
1931           u[7] = _mm_sub_epi32(u[7], sign[7]);
1932 
1933           u[0] = _mm_add_epi32(u[0], K32One);
1934           u[1] = _mm_add_epi32(u[1], K32One);
1935           u[2] = _mm_add_epi32(u[2], K32One);
1936           u[3] = _mm_add_epi32(u[3], K32One);
1937           u[4] = _mm_add_epi32(u[4], K32One);
1938           u[5] = _mm_add_epi32(u[5], K32One);
1939           u[6] = _mm_add_epi32(u[6], K32One);
1940           u[7] = _mm_add_epi32(u[7], K32One);
1941 
1942           u[0] = _mm_srai_epi32(u[0], 2);
1943           u[1] = _mm_srai_epi32(u[1], 2);
1944           u[2] = _mm_srai_epi32(u[2], 2);
1945           u[3] = _mm_srai_epi32(u[3], 2);
1946           u[4] = _mm_srai_epi32(u[4], 2);
1947           u[5] = _mm_srai_epi32(u[5], 2);
1948           u[6] = _mm_srai_epi32(u[6], 2);
1949           u[7] = _mm_srai_epi32(u[7], 2);
1950 
1951           // Combine
1952           out[0] = _mm_packs_epi32(u[0], u[1]);
1953           out[16] = _mm_packs_epi32(u[2], u[3]);
1954           out[8] = _mm_packs_epi32(u[4], u[5]);
1955           out[24] = _mm_packs_epi32(u[6], u[7]);
1956 #if DCT_HIGH_BIT_DEPTH
1957           overflow =
1958               check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
1959           if (overflow) {
1960             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1961             return;
1962           }
1963 #endif  // DCT_HIGH_BIT_DEPTH
1964         }
1965         {
1966           const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
1967           const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
1968           const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
1969 
1970           u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
1971           u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
1972           u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
1973           u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
1974           u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
1975           u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
1976           u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
1977           u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
1978 
1979           v[0] = k_madd_epi32(u[0], k32_m08_p24);
1980           v[1] = k_madd_epi32(u[1], k32_m08_p24);
1981           v[2] = k_madd_epi32(u[2], k32_m08_p24);
1982           v[3] = k_madd_epi32(u[3], k32_m08_p24);
1983           v[4] = k_madd_epi32(u[4], k32_m24_m08);
1984           v[5] = k_madd_epi32(u[5], k32_m24_m08);
1985           v[6] = k_madd_epi32(u[6], k32_m24_m08);
1986           v[7] = k_madd_epi32(u[7], k32_m24_m08);
1987           v[8] = k_madd_epi32(u[4], k32_m08_p24);
1988           v[9] = k_madd_epi32(u[5], k32_m08_p24);
1989           v[10] = k_madd_epi32(u[6], k32_m08_p24);
1990           v[11] = k_madd_epi32(u[7], k32_m08_p24);
1991           v[12] = k_madd_epi32(u[0], k32_p24_p08);
1992           v[13] = k_madd_epi32(u[1], k32_p24_p08);
1993           v[14] = k_madd_epi32(u[2], k32_p24_p08);
1994           v[15] = k_madd_epi32(u[3], k32_p24_p08);
1995 
1996 #if DCT_HIGH_BIT_DEPTH
1997           overflow = k_check_epi32_overflow_16(
1998               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
1999               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
2000           if (overflow) {
2001             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2002             return;
2003           }
2004 #endif  // DCT_HIGH_BIT_DEPTH
2005           u[0] = k_packs_epi64(v[0], v[1]);
2006           u[1] = k_packs_epi64(v[2], v[3]);
2007           u[2] = k_packs_epi64(v[4], v[5]);
2008           u[3] = k_packs_epi64(v[6], v[7]);
2009           u[4] = k_packs_epi64(v[8], v[9]);
2010           u[5] = k_packs_epi64(v[10], v[11]);
2011           u[6] = k_packs_epi64(v[12], v[13]);
2012           u[7] = k_packs_epi64(v[14], v[15]);
2013 
2014           u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2015           u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2016           u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2017           u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2018           u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2019           u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2020           u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2021           u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2022 
2023           lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2024           lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2025           lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2026           lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2027           lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2028           lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2029           lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2030           lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2031         }
2032         {
2033           lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
2034           lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
2035           lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
2036           lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
2037           lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
2038           lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
2039           lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
2040           lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
2041           lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
2042           lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
2043           lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
2044           lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
2045           lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
2046           lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
2047           lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
2048           lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
2049           lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
2050           lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
2051           lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
2052           lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
2053           lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
2054           lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
2055           lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
2056           lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
2057           lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
2058           lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
2059           lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
2060           lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
2061           lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
2062           lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
2063           lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
2064           lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
2065         }
2066         // stage 6
2067         {
2068           const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
2069           const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
2070           const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
2071           const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
2072 
2073           u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
2074           u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
2075           u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
2076           u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
2077           u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
2078           u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
2079           u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
2080           u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
2081           u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
2082           u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
2083           u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
2084           u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
2085           u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
2086           u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
2087           u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
2088           u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
2089 
2090           v[0] = k_madd_epi32(u[0], k32_p28_p04);
2091           v[1] = k_madd_epi32(u[1], k32_p28_p04);
2092           v[2] = k_madd_epi32(u[2], k32_p28_p04);
2093           v[3] = k_madd_epi32(u[3], k32_p28_p04);
2094           v[4] = k_madd_epi32(u[4], k32_p12_p20);
2095           v[5] = k_madd_epi32(u[5], k32_p12_p20);
2096           v[6] = k_madd_epi32(u[6], k32_p12_p20);
2097           v[7] = k_madd_epi32(u[7], k32_p12_p20);
2098           v[8] = k_madd_epi32(u[8], k32_m20_p12);
2099           v[9] = k_madd_epi32(u[9], k32_m20_p12);
2100           v[10] = k_madd_epi32(u[10], k32_m20_p12);
2101           v[11] = k_madd_epi32(u[11], k32_m20_p12);
2102           v[12] = k_madd_epi32(u[12], k32_m04_p28);
2103           v[13] = k_madd_epi32(u[13], k32_m04_p28);
2104           v[14] = k_madd_epi32(u[14], k32_m04_p28);
2105           v[15] = k_madd_epi32(u[15], k32_m04_p28);
2106 
2107 #if DCT_HIGH_BIT_DEPTH
2108           overflow = k_check_epi32_overflow_16(
2109               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
2110               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
2111           if (overflow) {
2112             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2113             return;
2114           }
2115 #endif  // DCT_HIGH_BIT_DEPTH
2116           u[0] = k_packs_epi64(v[0], v[1]);
2117           u[1] = k_packs_epi64(v[2], v[3]);
2118           u[2] = k_packs_epi64(v[4], v[5]);
2119           u[3] = k_packs_epi64(v[6], v[7]);
2120           u[4] = k_packs_epi64(v[8], v[9]);
2121           u[5] = k_packs_epi64(v[10], v[11]);
2122           u[6] = k_packs_epi64(v[12], v[13]);
2123           u[7] = k_packs_epi64(v[14], v[15]);
2124 
2125           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2126           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2127           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2128           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2129           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2130           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2131           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2132           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2133 
2134           u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2135           u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2136           u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2137           u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2138           u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2139           u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2140           u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2141           u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2142 
2143           sign[0] = _mm_cmplt_epi32(u[0], kZero);
2144           sign[1] = _mm_cmplt_epi32(u[1], kZero);
2145           sign[2] = _mm_cmplt_epi32(u[2], kZero);
2146           sign[3] = _mm_cmplt_epi32(u[3], kZero);
2147           sign[4] = _mm_cmplt_epi32(u[4], kZero);
2148           sign[5] = _mm_cmplt_epi32(u[5], kZero);
2149           sign[6] = _mm_cmplt_epi32(u[6], kZero);
2150           sign[7] = _mm_cmplt_epi32(u[7], kZero);
2151 
2152           u[0] = _mm_sub_epi32(u[0], sign[0]);
2153           u[1] = _mm_sub_epi32(u[1], sign[1]);
2154           u[2] = _mm_sub_epi32(u[2], sign[2]);
2155           u[3] = _mm_sub_epi32(u[3], sign[3]);
2156           u[4] = _mm_sub_epi32(u[4], sign[4]);
2157           u[5] = _mm_sub_epi32(u[5], sign[5]);
2158           u[6] = _mm_sub_epi32(u[6], sign[6]);
2159           u[7] = _mm_sub_epi32(u[7], sign[7]);
2160 
2161           u[0] = _mm_add_epi32(u[0], K32One);
2162           u[1] = _mm_add_epi32(u[1], K32One);
2163           u[2] = _mm_add_epi32(u[2], K32One);
2164           u[3] = _mm_add_epi32(u[3], K32One);
2165           u[4] = _mm_add_epi32(u[4], K32One);
2166           u[5] = _mm_add_epi32(u[5], K32One);
2167           u[6] = _mm_add_epi32(u[6], K32One);
2168           u[7] = _mm_add_epi32(u[7], K32One);
2169 
2170           u[0] = _mm_srai_epi32(u[0], 2);
2171           u[1] = _mm_srai_epi32(u[1], 2);
2172           u[2] = _mm_srai_epi32(u[2], 2);
2173           u[3] = _mm_srai_epi32(u[3], 2);
2174           u[4] = _mm_srai_epi32(u[4], 2);
2175           u[5] = _mm_srai_epi32(u[5], 2);
2176           u[6] = _mm_srai_epi32(u[6], 2);
2177           u[7] = _mm_srai_epi32(u[7], 2);
2178 
2179           out[4] = _mm_packs_epi32(u[0], u[1]);
2180           out[20] = _mm_packs_epi32(u[2], u[3]);
2181           out[12] = _mm_packs_epi32(u[4], u[5]);
2182           out[28] = _mm_packs_epi32(u[6], u[7]);
2183 #if DCT_HIGH_BIT_DEPTH
2184           overflow =
2185               check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
2186           if (overflow) {
2187             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2188             return;
2189           }
2190 #endif  // DCT_HIGH_BIT_DEPTH
2191         }
2192         {
2193           lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
2194           lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
2195           lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
2196           lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
2197           lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
2198           lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
2199           lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
2200           lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
2201           lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
2202           lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
2203           lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
2204           lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
2205           lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
2206           lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
2207           lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
2208           lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
2209         }
2210         {
2211           const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
2212           const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
2213           const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
2214           const __m128i k32_m12_m20 =
2215               pair_set_epi32(-cospi_12_64, -cospi_20_64);
2216           const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
2217           const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
2218 
2219           u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
2220           u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
2221           u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
2222           u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
2223           u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
2224           u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
2225           u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
2226           u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
2227           u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
2228           u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
2229           u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
2230           u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
2231           u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
2232           u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
2233           u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
2234           u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
2235 
2236           v[0] = k_madd_epi32(u[0], k32_m04_p28);
2237           v[1] = k_madd_epi32(u[1], k32_m04_p28);
2238           v[2] = k_madd_epi32(u[2], k32_m04_p28);
2239           v[3] = k_madd_epi32(u[3], k32_m04_p28);
2240           v[4] = k_madd_epi32(u[4], k32_m28_m04);
2241           v[5] = k_madd_epi32(u[5], k32_m28_m04);
2242           v[6] = k_madd_epi32(u[6], k32_m28_m04);
2243           v[7] = k_madd_epi32(u[7], k32_m28_m04);
2244           v[8] = k_madd_epi32(u[8], k32_m20_p12);
2245           v[9] = k_madd_epi32(u[9], k32_m20_p12);
2246           v[10] = k_madd_epi32(u[10], k32_m20_p12);
2247           v[11] = k_madd_epi32(u[11], k32_m20_p12);
2248           v[12] = k_madd_epi32(u[12], k32_m12_m20);
2249           v[13] = k_madd_epi32(u[13], k32_m12_m20);
2250           v[14] = k_madd_epi32(u[14], k32_m12_m20);
2251           v[15] = k_madd_epi32(u[15], k32_m12_m20);
2252           v[16] = k_madd_epi32(u[12], k32_m20_p12);
2253           v[17] = k_madd_epi32(u[13], k32_m20_p12);
2254           v[18] = k_madd_epi32(u[14], k32_m20_p12);
2255           v[19] = k_madd_epi32(u[15], k32_m20_p12);
2256           v[20] = k_madd_epi32(u[8], k32_p12_p20);
2257           v[21] = k_madd_epi32(u[9], k32_p12_p20);
2258           v[22] = k_madd_epi32(u[10], k32_p12_p20);
2259           v[23] = k_madd_epi32(u[11], k32_p12_p20);
2260           v[24] = k_madd_epi32(u[4], k32_m04_p28);
2261           v[25] = k_madd_epi32(u[5], k32_m04_p28);
2262           v[26] = k_madd_epi32(u[6], k32_m04_p28);
2263           v[27] = k_madd_epi32(u[7], k32_m04_p28);
2264           v[28] = k_madd_epi32(u[0], k32_p28_p04);
2265           v[29] = k_madd_epi32(u[1], k32_p28_p04);
2266           v[30] = k_madd_epi32(u[2], k32_p28_p04);
2267           v[31] = k_madd_epi32(u[3], k32_p28_p04);
2268 
2269 #if DCT_HIGH_BIT_DEPTH
2270           overflow = k_check_epi32_overflow_32(
2271               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
2272               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
2273               &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
2274               &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
2275           if (overflow) {
2276             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2277             return;
2278           }
2279 #endif  // DCT_HIGH_BIT_DEPTH
2280           u[0] = k_packs_epi64(v[0], v[1]);
2281           u[1] = k_packs_epi64(v[2], v[3]);
2282           u[2] = k_packs_epi64(v[4], v[5]);
2283           u[3] = k_packs_epi64(v[6], v[7]);
2284           u[4] = k_packs_epi64(v[8], v[9]);
2285           u[5] = k_packs_epi64(v[10], v[11]);
2286           u[6] = k_packs_epi64(v[12], v[13]);
2287           u[7] = k_packs_epi64(v[14], v[15]);
2288           u[8] = k_packs_epi64(v[16], v[17]);
2289           u[9] = k_packs_epi64(v[18], v[19]);
2290           u[10] = k_packs_epi64(v[20], v[21]);
2291           u[11] = k_packs_epi64(v[22], v[23]);
2292           u[12] = k_packs_epi64(v[24], v[25]);
2293           u[13] = k_packs_epi64(v[26], v[27]);
2294           u[14] = k_packs_epi64(v[28], v[29]);
2295           u[15] = k_packs_epi64(v[30], v[31]);
2296 
2297           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2298           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2299           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2300           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2301           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2302           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2303           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2304           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2305           v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2306           v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2307           v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2308           v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2309           v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2310           v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2311           v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2312           v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2313 
2314           lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2315           lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2316           lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2317           lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2318           lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2319           lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2320           lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2321           lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2322           lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2323           lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2324           lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2325           lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2326           lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2327           lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2328           lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2329           lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2330         }
2331         // stage 7
2332         {
2333           const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
2334           const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
2335           const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
2336           const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
2337           const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
2338           const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
2339           const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
2340           const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
2341 
2342           u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
2343           u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
2344           u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
2345           u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
2346           u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
2347           u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
2348           u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
2349           u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
2350           u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
2351           u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
2352           u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
2353           u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
2354           u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
2355           u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
2356           u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
2357           u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
2358 
2359           v[0] = k_madd_epi32(u[0], k32_p30_p02);
2360           v[1] = k_madd_epi32(u[1], k32_p30_p02);
2361           v[2] = k_madd_epi32(u[2], k32_p30_p02);
2362           v[3] = k_madd_epi32(u[3], k32_p30_p02);
2363           v[4] = k_madd_epi32(u[4], k32_p14_p18);
2364           v[5] = k_madd_epi32(u[5], k32_p14_p18);
2365           v[6] = k_madd_epi32(u[6], k32_p14_p18);
2366           v[7] = k_madd_epi32(u[7], k32_p14_p18);
2367           v[8] = k_madd_epi32(u[8], k32_p22_p10);
2368           v[9] = k_madd_epi32(u[9], k32_p22_p10);
2369           v[10] = k_madd_epi32(u[10], k32_p22_p10);
2370           v[11] = k_madd_epi32(u[11], k32_p22_p10);
2371           v[12] = k_madd_epi32(u[12], k32_p06_p26);
2372           v[13] = k_madd_epi32(u[13], k32_p06_p26);
2373           v[14] = k_madd_epi32(u[14], k32_p06_p26);
2374           v[15] = k_madd_epi32(u[15], k32_p06_p26);
2375           v[16] = k_madd_epi32(u[12], k32_m26_p06);
2376           v[17] = k_madd_epi32(u[13], k32_m26_p06);
2377           v[18] = k_madd_epi32(u[14], k32_m26_p06);
2378           v[19] = k_madd_epi32(u[15], k32_m26_p06);
2379           v[20] = k_madd_epi32(u[8], k32_m10_p22);
2380           v[21] = k_madd_epi32(u[9], k32_m10_p22);
2381           v[22] = k_madd_epi32(u[10], k32_m10_p22);
2382           v[23] = k_madd_epi32(u[11], k32_m10_p22);
2383           v[24] = k_madd_epi32(u[4], k32_m18_p14);
2384           v[25] = k_madd_epi32(u[5], k32_m18_p14);
2385           v[26] = k_madd_epi32(u[6], k32_m18_p14);
2386           v[27] = k_madd_epi32(u[7], k32_m18_p14);
2387           v[28] = k_madd_epi32(u[0], k32_m02_p30);
2388           v[29] = k_madd_epi32(u[1], k32_m02_p30);
2389           v[30] = k_madd_epi32(u[2], k32_m02_p30);
2390           v[31] = k_madd_epi32(u[3], k32_m02_p30);
2391 
2392 #if DCT_HIGH_BIT_DEPTH
2393           overflow = k_check_epi32_overflow_32(
2394               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
2395               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
2396               &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
2397               &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
2398           if (overflow) {
2399             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2400             return;
2401           }
2402 #endif  // DCT_HIGH_BIT_DEPTH
2403           u[0] = k_packs_epi64(v[0], v[1]);
2404           u[1] = k_packs_epi64(v[2], v[3]);
2405           u[2] = k_packs_epi64(v[4], v[5]);
2406           u[3] = k_packs_epi64(v[6], v[7]);
2407           u[4] = k_packs_epi64(v[8], v[9]);
2408           u[5] = k_packs_epi64(v[10], v[11]);
2409           u[6] = k_packs_epi64(v[12], v[13]);
2410           u[7] = k_packs_epi64(v[14], v[15]);
2411           u[8] = k_packs_epi64(v[16], v[17]);
2412           u[9] = k_packs_epi64(v[18], v[19]);
2413           u[10] = k_packs_epi64(v[20], v[21]);
2414           u[11] = k_packs_epi64(v[22], v[23]);
2415           u[12] = k_packs_epi64(v[24], v[25]);
2416           u[13] = k_packs_epi64(v[26], v[27]);
2417           u[14] = k_packs_epi64(v[28], v[29]);
2418           u[15] = k_packs_epi64(v[30], v[31]);
2419 
2420           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2421           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2422           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2423           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2424           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2425           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2426           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2427           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2428           v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2429           v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2430           v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2431           v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2432           v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2433           v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2434           v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2435           v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2436 
2437           u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2438           u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2439           u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2440           u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2441           u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2442           u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2443           u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2444           u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2445           u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2446           u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2447           u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2448           u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2449           u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2450           u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2451           u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2452           u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2453 
2454           v[0] = _mm_cmplt_epi32(u[0], kZero);
2455           v[1] = _mm_cmplt_epi32(u[1], kZero);
2456           v[2] = _mm_cmplt_epi32(u[2], kZero);
2457           v[3] = _mm_cmplt_epi32(u[3], kZero);
2458           v[4] = _mm_cmplt_epi32(u[4], kZero);
2459           v[5] = _mm_cmplt_epi32(u[5], kZero);
2460           v[6] = _mm_cmplt_epi32(u[6], kZero);
2461           v[7] = _mm_cmplt_epi32(u[7], kZero);
2462           v[8] = _mm_cmplt_epi32(u[8], kZero);
2463           v[9] = _mm_cmplt_epi32(u[9], kZero);
2464           v[10] = _mm_cmplt_epi32(u[10], kZero);
2465           v[11] = _mm_cmplt_epi32(u[11], kZero);
2466           v[12] = _mm_cmplt_epi32(u[12], kZero);
2467           v[13] = _mm_cmplt_epi32(u[13], kZero);
2468           v[14] = _mm_cmplt_epi32(u[14], kZero);
2469           v[15] = _mm_cmplt_epi32(u[15], kZero);
2470 
2471           u[0] = _mm_sub_epi32(u[0], v[0]);
2472           u[1] = _mm_sub_epi32(u[1], v[1]);
2473           u[2] = _mm_sub_epi32(u[2], v[2]);
2474           u[3] = _mm_sub_epi32(u[3], v[3]);
2475           u[4] = _mm_sub_epi32(u[4], v[4]);
2476           u[5] = _mm_sub_epi32(u[5], v[5]);
2477           u[6] = _mm_sub_epi32(u[6], v[6]);
2478           u[7] = _mm_sub_epi32(u[7], v[7]);
2479           u[8] = _mm_sub_epi32(u[8], v[8]);
2480           u[9] = _mm_sub_epi32(u[9], v[9]);
2481           u[10] = _mm_sub_epi32(u[10], v[10]);
2482           u[11] = _mm_sub_epi32(u[11], v[11]);
2483           u[12] = _mm_sub_epi32(u[12], v[12]);
2484           u[13] = _mm_sub_epi32(u[13], v[13]);
2485           u[14] = _mm_sub_epi32(u[14], v[14]);
2486           u[15] = _mm_sub_epi32(u[15], v[15]);
2487 
2488           v[0] = _mm_add_epi32(u[0], K32One);
2489           v[1] = _mm_add_epi32(u[1], K32One);
2490           v[2] = _mm_add_epi32(u[2], K32One);
2491           v[3] = _mm_add_epi32(u[3], K32One);
2492           v[4] = _mm_add_epi32(u[4], K32One);
2493           v[5] = _mm_add_epi32(u[5], K32One);
2494           v[6] = _mm_add_epi32(u[6], K32One);
2495           v[7] = _mm_add_epi32(u[7], K32One);
2496           v[8] = _mm_add_epi32(u[8], K32One);
2497           v[9] = _mm_add_epi32(u[9], K32One);
2498           v[10] = _mm_add_epi32(u[10], K32One);
2499           v[11] = _mm_add_epi32(u[11], K32One);
2500           v[12] = _mm_add_epi32(u[12], K32One);
2501           v[13] = _mm_add_epi32(u[13], K32One);
2502           v[14] = _mm_add_epi32(u[14], K32One);
2503           v[15] = _mm_add_epi32(u[15], K32One);
2504 
2505           u[0] = _mm_srai_epi32(v[0], 2);
2506           u[1] = _mm_srai_epi32(v[1], 2);
2507           u[2] = _mm_srai_epi32(v[2], 2);
2508           u[3] = _mm_srai_epi32(v[3], 2);
2509           u[4] = _mm_srai_epi32(v[4], 2);
2510           u[5] = _mm_srai_epi32(v[5], 2);
2511           u[6] = _mm_srai_epi32(v[6], 2);
2512           u[7] = _mm_srai_epi32(v[7], 2);
2513           u[8] = _mm_srai_epi32(v[8], 2);
2514           u[9] = _mm_srai_epi32(v[9], 2);
2515           u[10] = _mm_srai_epi32(v[10], 2);
2516           u[11] = _mm_srai_epi32(v[11], 2);
2517           u[12] = _mm_srai_epi32(v[12], 2);
2518           u[13] = _mm_srai_epi32(v[13], 2);
2519           u[14] = _mm_srai_epi32(v[14], 2);
2520           u[15] = _mm_srai_epi32(v[15], 2);
2521 
2522           out[2] = _mm_packs_epi32(u[0], u[1]);
2523           out[18] = _mm_packs_epi32(u[2], u[3]);
2524           out[10] = _mm_packs_epi32(u[4], u[5]);
2525           out[26] = _mm_packs_epi32(u[6], u[7]);
2526           out[6] = _mm_packs_epi32(u[8], u[9]);
2527           out[22] = _mm_packs_epi32(u[10], u[11]);
2528           out[14] = _mm_packs_epi32(u[12], u[13]);
2529           out[30] = _mm_packs_epi32(u[14], u[15]);
2530 #if DCT_HIGH_BIT_DEPTH
2531           overflow =
2532               check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
2533                                       &out[6], &out[22], &out[14], &out[30]);
2534           if (overflow) {
2535             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2536             return;
2537           }
2538 #endif  // DCT_HIGH_BIT_DEPTH
2539         }
2540         {
2541           lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
2542           lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
2543           lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
2544           lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
2545           lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
2546           lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
2547           lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
2548           lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
2549           lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
2550           lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
2551           lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
2552           lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
2553           lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
2554           lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
2555           lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
2556           lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
2557           lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
2558           lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
2559           lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
2560           lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
2561           lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
2562           lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
2563           lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
2564           lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
2565           lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
2566           lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
2567           lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
2568           lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
2569           lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
2570           lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
2571           lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
2572           lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
2573         }
2574         // stage 8
2575         {
2576           const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
2577           const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
2578           const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
2579           const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
2580           const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
2581           const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
2582           const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
2583           const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
2584 
2585           u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
2586           u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
2587           u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
2588           u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
2589           u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
2590           u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
2591           u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
2592           u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
2593           u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
2594           u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
2595           u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
2596           u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
2597           u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
2598           u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
2599           u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
2600           u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
2601 
2602           v[0] = k_madd_epi32(u[0], k32_p31_p01);
2603           v[1] = k_madd_epi32(u[1], k32_p31_p01);
2604           v[2] = k_madd_epi32(u[2], k32_p31_p01);
2605           v[3] = k_madd_epi32(u[3], k32_p31_p01);
2606           v[4] = k_madd_epi32(u[4], k32_p15_p17);
2607           v[5] = k_madd_epi32(u[5], k32_p15_p17);
2608           v[6] = k_madd_epi32(u[6], k32_p15_p17);
2609           v[7] = k_madd_epi32(u[7], k32_p15_p17);
2610           v[8] = k_madd_epi32(u[8], k32_p23_p09);
2611           v[9] = k_madd_epi32(u[9], k32_p23_p09);
2612           v[10] = k_madd_epi32(u[10], k32_p23_p09);
2613           v[11] = k_madd_epi32(u[11], k32_p23_p09);
2614           v[12] = k_madd_epi32(u[12], k32_p07_p25);
2615           v[13] = k_madd_epi32(u[13], k32_p07_p25);
2616           v[14] = k_madd_epi32(u[14], k32_p07_p25);
2617           v[15] = k_madd_epi32(u[15], k32_p07_p25);
2618           v[16] = k_madd_epi32(u[12], k32_m25_p07);
2619           v[17] = k_madd_epi32(u[13], k32_m25_p07);
2620           v[18] = k_madd_epi32(u[14], k32_m25_p07);
2621           v[19] = k_madd_epi32(u[15], k32_m25_p07);
2622           v[20] = k_madd_epi32(u[8], k32_m09_p23);
2623           v[21] = k_madd_epi32(u[9], k32_m09_p23);
2624           v[22] = k_madd_epi32(u[10], k32_m09_p23);
2625           v[23] = k_madd_epi32(u[11], k32_m09_p23);
2626           v[24] = k_madd_epi32(u[4], k32_m17_p15);
2627           v[25] = k_madd_epi32(u[5], k32_m17_p15);
2628           v[26] = k_madd_epi32(u[6], k32_m17_p15);
2629           v[27] = k_madd_epi32(u[7], k32_m17_p15);
2630           v[28] = k_madd_epi32(u[0], k32_m01_p31);
2631           v[29] = k_madd_epi32(u[1], k32_m01_p31);
2632           v[30] = k_madd_epi32(u[2], k32_m01_p31);
2633           v[31] = k_madd_epi32(u[3], k32_m01_p31);
2634 
2635 #if DCT_HIGH_BIT_DEPTH
2636           overflow = k_check_epi32_overflow_32(
2637               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
2638               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
2639               &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
2640               &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
2641           if (overflow) {
2642             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2643             return;
2644           }
2645 #endif  // DCT_HIGH_BIT_DEPTH
2646           u[0] = k_packs_epi64(v[0], v[1]);
2647           u[1] = k_packs_epi64(v[2], v[3]);
2648           u[2] = k_packs_epi64(v[4], v[5]);
2649           u[3] = k_packs_epi64(v[6], v[7]);
2650           u[4] = k_packs_epi64(v[8], v[9]);
2651           u[5] = k_packs_epi64(v[10], v[11]);
2652           u[6] = k_packs_epi64(v[12], v[13]);
2653           u[7] = k_packs_epi64(v[14], v[15]);
2654           u[8] = k_packs_epi64(v[16], v[17]);
2655           u[9] = k_packs_epi64(v[18], v[19]);
2656           u[10] = k_packs_epi64(v[20], v[21]);
2657           u[11] = k_packs_epi64(v[22], v[23]);
2658           u[12] = k_packs_epi64(v[24], v[25]);
2659           u[13] = k_packs_epi64(v[26], v[27]);
2660           u[14] = k_packs_epi64(v[28], v[29]);
2661           u[15] = k_packs_epi64(v[30], v[31]);
2662 
2663           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2664           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2665           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2666           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2667           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2668           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2669           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2670           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2671           v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2672           v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2673           v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2674           v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2675           v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2676           v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2677           v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2678           v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2679 
2680           u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2681           u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2682           u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2683           u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2684           u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2685           u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2686           u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2687           u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2688           u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2689           u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2690           u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2691           u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2692           u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2693           u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2694           u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2695           u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2696 
2697           v[0] = _mm_cmplt_epi32(u[0], kZero);
2698           v[1] = _mm_cmplt_epi32(u[1], kZero);
2699           v[2] = _mm_cmplt_epi32(u[2], kZero);
2700           v[3] = _mm_cmplt_epi32(u[3], kZero);
2701           v[4] = _mm_cmplt_epi32(u[4], kZero);
2702           v[5] = _mm_cmplt_epi32(u[5], kZero);
2703           v[6] = _mm_cmplt_epi32(u[6], kZero);
2704           v[7] = _mm_cmplt_epi32(u[7], kZero);
2705           v[8] = _mm_cmplt_epi32(u[8], kZero);
2706           v[9] = _mm_cmplt_epi32(u[9], kZero);
2707           v[10] = _mm_cmplt_epi32(u[10], kZero);
2708           v[11] = _mm_cmplt_epi32(u[11], kZero);
2709           v[12] = _mm_cmplt_epi32(u[12], kZero);
2710           v[13] = _mm_cmplt_epi32(u[13], kZero);
2711           v[14] = _mm_cmplt_epi32(u[14], kZero);
2712           v[15] = _mm_cmplt_epi32(u[15], kZero);
2713 
2714           u[0] = _mm_sub_epi32(u[0], v[0]);
2715           u[1] = _mm_sub_epi32(u[1], v[1]);
2716           u[2] = _mm_sub_epi32(u[2], v[2]);
2717           u[3] = _mm_sub_epi32(u[3], v[3]);
2718           u[4] = _mm_sub_epi32(u[4], v[4]);
2719           u[5] = _mm_sub_epi32(u[5], v[5]);
2720           u[6] = _mm_sub_epi32(u[6], v[6]);
2721           u[7] = _mm_sub_epi32(u[7], v[7]);
2722           u[8] = _mm_sub_epi32(u[8], v[8]);
2723           u[9] = _mm_sub_epi32(u[9], v[9]);
2724           u[10] = _mm_sub_epi32(u[10], v[10]);
2725           u[11] = _mm_sub_epi32(u[11], v[11]);
2726           u[12] = _mm_sub_epi32(u[12], v[12]);
2727           u[13] = _mm_sub_epi32(u[13], v[13]);
2728           u[14] = _mm_sub_epi32(u[14], v[14]);
2729           u[15] = _mm_sub_epi32(u[15], v[15]);
2730 
2731           v[0] = _mm_add_epi32(u[0], K32One);
2732           v[1] = _mm_add_epi32(u[1], K32One);
2733           v[2] = _mm_add_epi32(u[2], K32One);
2734           v[3] = _mm_add_epi32(u[3], K32One);
2735           v[4] = _mm_add_epi32(u[4], K32One);
2736           v[5] = _mm_add_epi32(u[5], K32One);
2737           v[6] = _mm_add_epi32(u[6], K32One);
2738           v[7] = _mm_add_epi32(u[7], K32One);
2739           v[8] = _mm_add_epi32(u[8], K32One);
2740           v[9] = _mm_add_epi32(u[9], K32One);
2741           v[10] = _mm_add_epi32(u[10], K32One);
2742           v[11] = _mm_add_epi32(u[11], K32One);
2743           v[12] = _mm_add_epi32(u[12], K32One);
2744           v[13] = _mm_add_epi32(u[13], K32One);
2745           v[14] = _mm_add_epi32(u[14], K32One);
2746           v[15] = _mm_add_epi32(u[15], K32One);
2747 
2748           u[0] = _mm_srai_epi32(v[0], 2);
2749           u[1] = _mm_srai_epi32(v[1], 2);
2750           u[2] = _mm_srai_epi32(v[2], 2);
2751           u[3] = _mm_srai_epi32(v[3], 2);
2752           u[4] = _mm_srai_epi32(v[4], 2);
2753           u[5] = _mm_srai_epi32(v[5], 2);
2754           u[6] = _mm_srai_epi32(v[6], 2);
2755           u[7] = _mm_srai_epi32(v[7], 2);
2756           u[8] = _mm_srai_epi32(v[8], 2);
2757           u[9] = _mm_srai_epi32(v[9], 2);
2758           u[10] = _mm_srai_epi32(v[10], 2);
2759           u[11] = _mm_srai_epi32(v[11], 2);
2760           u[12] = _mm_srai_epi32(v[12], 2);
2761           u[13] = _mm_srai_epi32(v[13], 2);
2762           u[14] = _mm_srai_epi32(v[14], 2);
2763           u[15] = _mm_srai_epi32(v[15], 2);
2764 
2765           out[1] = _mm_packs_epi32(u[0], u[1]);
2766           out[17] = _mm_packs_epi32(u[2], u[3]);
2767           out[9] = _mm_packs_epi32(u[4], u[5]);
2768           out[25] = _mm_packs_epi32(u[6], u[7]);
2769           out[7] = _mm_packs_epi32(u[8], u[9]);
2770           out[23] = _mm_packs_epi32(u[10], u[11]);
2771           out[15] = _mm_packs_epi32(u[12], u[13]);
2772           out[31] = _mm_packs_epi32(u[14], u[15]);
2773 #if DCT_HIGH_BIT_DEPTH
2774           overflow =
2775               check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
2776                                       &out[7], &out[23], &out[15], &out[31]);
2777           if (overflow) {
2778             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2779             return;
2780           }
2781 #endif  // DCT_HIGH_BIT_DEPTH
2782         }
2783         {
2784           const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
2785           const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
2786           const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
2787           const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
2788           const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
2789           const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
2790           const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
2791           const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
2792 
2793           u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
2794           u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
2795           u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
2796           u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
2797           u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
2798           u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
2799           u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
2800           u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
2801           u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
2802           u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
2803           u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
2804           u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
2805           u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
2806           u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
2807           u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
2808           u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
2809 
2810           v[0] = k_madd_epi32(u[0], k32_p27_p05);
2811           v[1] = k_madd_epi32(u[1], k32_p27_p05);
2812           v[2] = k_madd_epi32(u[2], k32_p27_p05);
2813           v[3] = k_madd_epi32(u[3], k32_p27_p05);
2814           v[4] = k_madd_epi32(u[4], k32_p11_p21);
2815           v[5] = k_madd_epi32(u[5], k32_p11_p21);
2816           v[6] = k_madd_epi32(u[6], k32_p11_p21);
2817           v[7] = k_madd_epi32(u[7], k32_p11_p21);
2818           v[8] = k_madd_epi32(u[8], k32_p19_p13);
2819           v[9] = k_madd_epi32(u[9], k32_p19_p13);
2820           v[10] = k_madd_epi32(u[10], k32_p19_p13);
2821           v[11] = k_madd_epi32(u[11], k32_p19_p13);
2822           v[12] = k_madd_epi32(u[12], k32_p03_p29);
2823           v[13] = k_madd_epi32(u[13], k32_p03_p29);
2824           v[14] = k_madd_epi32(u[14], k32_p03_p29);
2825           v[15] = k_madd_epi32(u[15], k32_p03_p29);
2826           v[16] = k_madd_epi32(u[12], k32_m29_p03);
2827           v[17] = k_madd_epi32(u[13], k32_m29_p03);
2828           v[18] = k_madd_epi32(u[14], k32_m29_p03);
2829           v[19] = k_madd_epi32(u[15], k32_m29_p03);
2830           v[20] = k_madd_epi32(u[8], k32_m13_p19);
2831           v[21] = k_madd_epi32(u[9], k32_m13_p19);
2832           v[22] = k_madd_epi32(u[10], k32_m13_p19);
2833           v[23] = k_madd_epi32(u[11], k32_m13_p19);
2834           v[24] = k_madd_epi32(u[4], k32_m21_p11);
2835           v[25] = k_madd_epi32(u[5], k32_m21_p11);
2836           v[26] = k_madd_epi32(u[6], k32_m21_p11);
2837           v[27] = k_madd_epi32(u[7], k32_m21_p11);
2838           v[28] = k_madd_epi32(u[0], k32_m05_p27);
2839           v[29] = k_madd_epi32(u[1], k32_m05_p27);
2840           v[30] = k_madd_epi32(u[2], k32_m05_p27);
2841           v[31] = k_madd_epi32(u[3], k32_m05_p27);
2842 
2843 #if DCT_HIGH_BIT_DEPTH
2844           overflow = k_check_epi32_overflow_32(
2845               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
2846               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
2847               &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
2848               &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
2849           if (overflow) {
2850             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2851             return;
2852           }
2853 #endif  // DCT_HIGH_BIT_DEPTH
2854           u[0] = k_packs_epi64(v[0], v[1]);
2855           u[1] = k_packs_epi64(v[2], v[3]);
2856           u[2] = k_packs_epi64(v[4], v[5]);
2857           u[3] = k_packs_epi64(v[6], v[7]);
2858           u[4] = k_packs_epi64(v[8], v[9]);
2859           u[5] = k_packs_epi64(v[10], v[11]);
2860           u[6] = k_packs_epi64(v[12], v[13]);
2861           u[7] = k_packs_epi64(v[14], v[15]);
2862           u[8] = k_packs_epi64(v[16], v[17]);
2863           u[9] = k_packs_epi64(v[18], v[19]);
2864           u[10] = k_packs_epi64(v[20], v[21]);
2865           u[11] = k_packs_epi64(v[22], v[23]);
2866           u[12] = k_packs_epi64(v[24], v[25]);
2867           u[13] = k_packs_epi64(v[26], v[27]);
2868           u[14] = k_packs_epi64(v[28], v[29]);
2869           u[15] = k_packs_epi64(v[30], v[31]);
2870 
2871           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2872           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2873           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2874           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2875           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2876           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2877           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2878           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2879           v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2880           v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2881           v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2882           v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2883           v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2884           v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2885           v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2886           v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2887 
2888           u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2889           u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2890           u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2891           u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2892           u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2893           u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2894           u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2895           u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2896           u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2897           u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2898           u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2899           u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2900           u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2901           u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2902           u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2903           u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2904 
2905           v[0] = _mm_cmplt_epi32(u[0], kZero);
2906           v[1] = _mm_cmplt_epi32(u[1], kZero);
2907           v[2] = _mm_cmplt_epi32(u[2], kZero);
2908           v[3] = _mm_cmplt_epi32(u[3], kZero);
2909           v[4] = _mm_cmplt_epi32(u[4], kZero);
2910           v[5] = _mm_cmplt_epi32(u[5], kZero);
2911           v[6] = _mm_cmplt_epi32(u[6], kZero);
2912           v[7] = _mm_cmplt_epi32(u[7], kZero);
2913           v[8] = _mm_cmplt_epi32(u[8], kZero);
2914           v[9] = _mm_cmplt_epi32(u[9], kZero);
2915           v[10] = _mm_cmplt_epi32(u[10], kZero);
2916           v[11] = _mm_cmplt_epi32(u[11], kZero);
2917           v[12] = _mm_cmplt_epi32(u[12], kZero);
2918           v[13] = _mm_cmplt_epi32(u[13], kZero);
2919           v[14] = _mm_cmplt_epi32(u[14], kZero);
2920           v[15] = _mm_cmplt_epi32(u[15], kZero);
2921 
2922           u[0] = _mm_sub_epi32(u[0], v[0]);
2923           u[1] = _mm_sub_epi32(u[1], v[1]);
2924           u[2] = _mm_sub_epi32(u[2], v[2]);
2925           u[3] = _mm_sub_epi32(u[3], v[3]);
2926           u[4] = _mm_sub_epi32(u[4], v[4]);
2927           u[5] = _mm_sub_epi32(u[5], v[5]);
2928           u[6] = _mm_sub_epi32(u[6], v[6]);
2929           u[7] = _mm_sub_epi32(u[7], v[7]);
2930           u[8] = _mm_sub_epi32(u[8], v[8]);
2931           u[9] = _mm_sub_epi32(u[9], v[9]);
2932           u[10] = _mm_sub_epi32(u[10], v[10]);
2933           u[11] = _mm_sub_epi32(u[11], v[11]);
2934           u[12] = _mm_sub_epi32(u[12], v[12]);
2935           u[13] = _mm_sub_epi32(u[13], v[13]);
2936           u[14] = _mm_sub_epi32(u[14], v[14]);
2937           u[15] = _mm_sub_epi32(u[15], v[15]);
2938 
2939           v[0] = _mm_add_epi32(u[0], K32One);
2940           v[1] = _mm_add_epi32(u[1], K32One);
2941           v[2] = _mm_add_epi32(u[2], K32One);
2942           v[3] = _mm_add_epi32(u[3], K32One);
2943           v[4] = _mm_add_epi32(u[4], K32One);
2944           v[5] = _mm_add_epi32(u[5], K32One);
2945           v[6] = _mm_add_epi32(u[6], K32One);
2946           v[7] = _mm_add_epi32(u[7], K32One);
2947           v[8] = _mm_add_epi32(u[8], K32One);
2948           v[9] = _mm_add_epi32(u[9], K32One);
2949           v[10] = _mm_add_epi32(u[10], K32One);
2950           v[11] = _mm_add_epi32(u[11], K32One);
2951           v[12] = _mm_add_epi32(u[12], K32One);
2952           v[13] = _mm_add_epi32(u[13], K32One);
2953           v[14] = _mm_add_epi32(u[14], K32One);
2954           v[15] = _mm_add_epi32(u[15], K32One);
2955 
2956           u[0] = _mm_srai_epi32(v[0], 2);
2957           u[1] = _mm_srai_epi32(v[1], 2);
2958           u[2] = _mm_srai_epi32(v[2], 2);
2959           u[3] = _mm_srai_epi32(v[3], 2);
2960           u[4] = _mm_srai_epi32(v[4], 2);
2961           u[5] = _mm_srai_epi32(v[5], 2);
2962           u[6] = _mm_srai_epi32(v[6], 2);
2963           u[7] = _mm_srai_epi32(v[7], 2);
2964           u[8] = _mm_srai_epi32(v[8], 2);
2965           u[9] = _mm_srai_epi32(v[9], 2);
2966           u[10] = _mm_srai_epi32(v[10], 2);
2967           u[11] = _mm_srai_epi32(v[11], 2);
2968           u[12] = _mm_srai_epi32(v[12], 2);
2969           u[13] = _mm_srai_epi32(v[13], 2);
2970           u[14] = _mm_srai_epi32(v[14], 2);
2971           u[15] = _mm_srai_epi32(v[15], 2);
2972 
2973           out[5] = _mm_packs_epi32(u[0], u[1]);
2974           out[21] = _mm_packs_epi32(u[2], u[3]);
2975           out[13] = _mm_packs_epi32(u[4], u[5]);
2976           out[29] = _mm_packs_epi32(u[6], u[7]);
2977           out[3] = _mm_packs_epi32(u[8], u[9]);
2978           out[19] = _mm_packs_epi32(u[10], u[11]);
2979           out[11] = _mm_packs_epi32(u[12], u[13]);
2980           out[27] = _mm_packs_epi32(u[14], u[15]);
2981 #if DCT_HIGH_BIT_DEPTH
2982           overflow =
2983               check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
2984                                       &out[3], &out[19], &out[11], &out[27]);
2985           if (overflow) {
2986             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2987             return;
2988           }
2989 #endif  // DCT_HIGH_BIT_DEPTH
2990         }
2991       }
2992 #endif  // FDCT32x32_HIGH_PRECISION
2993       // Transpose the results, do it as four 8x8 transposes.
2994       {
2995         int transpose_block;
2996         int16_t *output0 = &intermediate[column_start * 32];
2997         tran_low_t *output1 = &output_org[column_start * 32];
2998         for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
2999           __m128i *this_out = &out[8 * transpose_block];
3000           // 00 01 02 03 04 05 06 07
3001           // 10 11 12 13 14 15 16 17
3002           // 20 21 22 23 24 25 26 27
3003           // 30 31 32 33 34 35 36 37
3004           // 40 41 42 43 44 45 46 47
3005           // 50 51 52 53 54 55 56 57
3006           // 60 61 62 63 64 65 66 67
3007           // 70 71 72 73 74 75 76 77
3008           const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
3009           const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
3010           const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
3011           const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
3012           const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
3013           const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
3014           const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
3015           const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
3016           // 00 10 01 11 02 12 03 13
3017           // 20 30 21 31 22 32 23 33
3018           // 04 14 05 15 06 16 07 17
3019           // 24 34 25 35 26 36 27 37
3020           // 40 50 41 51 42 52 43 53
3021           // 60 70 61 71 62 72 63 73
3022           // 44 54 45 55 46 56 47 57
3023           // 64 74 65 75 66 76 67 77
3024           const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
3025           const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
3026           const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
3027           const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
3028           const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
3029           const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
3030           const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
3031           const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
3032           // 00 10 20 30 01 11 21 31
3033           // 40 50 60 70 41 51 61 71
3034           // 02 12 22 32 03 13 23 33
3035           // 42 52 62 72 43 53 63 73
3036           // 04 14 24 34 05 15 25 35
3037           // 44 54 64 74 45 55 65 75
3038           // 06 16 26 36 07 17 27 37
3039           // 46 56 66 76 47 57 67 77
3040           __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
3041           __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
3042           __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
3043           __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
3044           __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
3045           __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
3046           __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
3047           __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
3048           // 00 10 20 30 40 50 60 70
3049           // 01 11 21 31 41 51 61 71
3050           // 02 12 22 32 42 52 62 72
3051           // 03 13 23 33 43 53 63 73
3052           // 04 14 24 34 44 54 64 74
3053           // 05 15 25 35 45 55 65 75
3054           // 06 16 26 36 46 56 66 76
3055           // 07 17 27 37 47 57 67 77
3056           if (0 == pass) {
3057             // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
3058             // TODO(cd): see quality impact of only doing
3059             //           output[j] = (output[j] + 1) >> 2;
3060             //           which would remove the code between here ...
3061             __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
3062             __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
3063             __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
3064             __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
3065             __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
3066             __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
3067             __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
3068             __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
3069             tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
3070             tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
3071             tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
3072             tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
3073             tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
3074             tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
3075             tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
3076             tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
3077             //           ... and here.
3078             //           PS: also change code in vp9/encoder/vp9_dct.c
3079             tr2_0 = _mm_add_epi16(tr2_0, kOne);
3080             tr2_1 = _mm_add_epi16(tr2_1, kOne);
3081             tr2_2 = _mm_add_epi16(tr2_2, kOne);
3082             tr2_3 = _mm_add_epi16(tr2_3, kOne);
3083             tr2_4 = _mm_add_epi16(tr2_4, kOne);
3084             tr2_5 = _mm_add_epi16(tr2_5, kOne);
3085             tr2_6 = _mm_add_epi16(tr2_6, kOne);
3086             tr2_7 = _mm_add_epi16(tr2_7, kOne);
3087             tr2_0 = _mm_srai_epi16(tr2_0, 2);
3088             tr2_1 = _mm_srai_epi16(tr2_1, 2);
3089             tr2_2 = _mm_srai_epi16(tr2_2, 2);
3090             tr2_3 = _mm_srai_epi16(tr2_3, 2);
3091             tr2_4 = _mm_srai_epi16(tr2_4, 2);
3092             tr2_5 = _mm_srai_epi16(tr2_5, 2);
3093             tr2_6 = _mm_srai_epi16(tr2_6, 2);
3094             tr2_7 = _mm_srai_epi16(tr2_7, 2);
3095           }
3096           // Note: even though all these stores are aligned, using the aligned
3097           //       intrinsic makes the code slightly slower.
3098           if (pass == 0) {
3099             _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
3100             _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
3101             _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
3102             _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
3103             _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
3104             _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
3105             _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
3106             _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
3107             // Process next 8x8
3108             output0 += 8;
3109           } else {
3110             storeu_output(&tr2_0, (output1 + 0 * 32));
3111             storeu_output(&tr2_1, (output1 + 1 * 32));
3112             storeu_output(&tr2_2, (output1 + 2 * 32));
3113             storeu_output(&tr2_3, (output1 + 3 * 32));
3114             storeu_output(&tr2_4, (output1 + 4 * 32));
3115             storeu_output(&tr2_5, (output1 + 5 * 32));
3116             storeu_output(&tr2_6, (output1 + 6 * 32));
3117             storeu_output(&tr2_7, (output1 + 7 * 32));
3118             // Process next 8x8
3119             output1 += 8;
3120           }
3121         }
3122       }
3123     }
3124   }
3125 }  // NOLINT
3126 
// Remove the helper macros now that the transform body above is complete.
// ADD_EPI16, SUB_EPI16, HIGH_FDCT32x32_2D_C and HIGH_FDCT32x32_2D_ROWS_C are
// #define'd near the top of this file.
// NOTE(review): presumably this keeps them from leaking into (or clashing
// with) whatever is compiled after this code - confirm against the includers.
3127 #undef ADD_EPI16
3128 #undef SUB_EPI16
3129 #undef HIGH_FDCT32x32_2D_C
3130 #undef HIGH_FDCT32x32_2D_ROWS_C
3131