/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/idct.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/common/x86/av1_txfm_sse4.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"

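// Clamp each 16-bit lane of u to the valid pixel range [0, (1 << bd) - 1].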
static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  __m128i clamped, mask;

  mask = _mm_cmpgt_epi16(u, max);
  clamped = _mm_andnot_si128(mask, u);
  mask = _mm_and_si128(mask, max);
  clamped = _mm_or_si128(mask, clamped);
  mask = _mm_cmpgt_epi16(clamped, zero);
  clamped = _mm_and_si128(clamped, mask);

  return clamped;
}

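// Reconstruct eight pixels: sign-extend the 16-bit predictor to 32 bits, add
// the two 4-lane residual vectors, pack with unsigned saturation, and clamp
// to bd bits.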
static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
                                                  __m128i res0, __m128i res1,
                                                  const int bd) {
  __m128i x0 = _mm_cvtepi16_epi32(pred);
  __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));

  x0 = _mm_add_epi32(res0, x0);
  x1 = _mm_add_epi32(res1, x1);
  x0 = _mm_packus_epi32(x0, x1);
  x0 = highbd_clamp_epi16(x0, bd);
  return x0;
}

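// 4-wide variant of the above: one residual vector per four predictor pixels.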
static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred,
                                                  __m128i res0, const int bd) {
  __m128i x0 = _mm_cvtepi16_epi32(pred);

  x0 = _mm_add_epi32(res0, x0);
  x0 = _mm_packus_epi32(x0, x0);
  x0 = highbd_clamp_epi16(x0, bd);
  return x0;
}

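// Add the residual rows in 'in' to the 4-wide predictor rows already in
// 'output' and store the clamped reconstruction; flipud walks the residual
// rows bottom-up. The 8-wide variant below does the same with two residual
// vectors per row.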
static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output,
                                                  int stride, int flipud,
                                                  int height, const int bd) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
    __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd);

    _mm_storel_epi64((__m128i *)(output + i * stride), u);
  }
}

static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
                                                  int stride, int flipud,
                                                  int height, const int bd) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
    __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);

    _mm_storeu_si128((__m128i *)(output + i * stride), u);
  }
}

static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
                                           __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
  }
}

static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
}

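// Butterfly: *out0 = clamp(in0 + in1), *out1 = clamp(in0 - in1), where the
// caller picks the clamp range from the intermediate bit depth.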
static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
                          __m128i *out1, const __m128i *clamp_lo,
                          const __m128i *clamp_hi) {
  __m128i a0 = _mm_add_epi32(in0, in1);
  __m128i a1 = _mm_sub_epi32(in0, in1);

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1,
                                   __m128i *out0, __m128i *out1) {
  __m128i a0 = _mm_add_epi32(in0, in1);
  __m128i a1 = _mm_sub_epi32(in0, in1);

  *out0 = a0;
  *out1 = a1;
}

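// Butterfly followed by a rounding right shift. The rounding offset
// (1 << shift) >> 1 is added to in0 once, so both the sum and the difference
// come out correctly rounded.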
static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
                                __m128i *out0, __m128i *out1,
                                const __m128i *clamp_lo,
                                const __m128i *clamp_hi, int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i in0_w_offset = _mm_add_epi32(in0, offset);
  __m128i a0 = _mm_add_epi32(in0_w_offset, in1);
  __m128i a1 = _mm_sub_epi32(in0_w_offset, in1);

  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

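// Per-stage helpers for the 32-point inverse DCT: each applies that stage's
// half-butterfly rotations (half_btf_sse4_1) and add/sub butterflies. The
// scalar reference versions live in av1/common/av1_inv_txfm1d.c.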
static INLINE void idct32_stage4_sse4_1(
    __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
    const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
    const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
  bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
  bf1[17] = temp1;

  temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
  bf1[18] = temp2;

  temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
  bf1[21] = temp1;

  temp2 =
      half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
  bf1[22] = temp2;
}

static INLINE void idct32_stage5_sse4_1(
    __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
    const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
    const __m128i *clamp_hi, const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
  bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
  bf1[9] = temp1;

  temp2 =
      half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
  bf1[10] = temp2;

  addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
}

static INLINE void idct32_stage6_sse4_1(
    __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[5] = temp1;

  addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
  bf1[18] = temp1;
  temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
  bf1[28] =
      half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
  bf1[19] = temp2;
  temp1 =
      half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 =
      half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
  bf1[21] = temp2;
}

static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[10] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[12] =
      half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[11] = temp2;

  addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
}

static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[21] = temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[22] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[24] =
      half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[23] = temp2;
}

static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
                                        const int do_cols, const int bd,
                                        const int out_shift,
                                        const int log_range) {
  if (do_cols) {
    addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31);
    addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30);
    addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29);
    addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28);
    addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27);
    addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26);
    addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25);
    addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24);
    addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23);
    addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22);
    addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21);
    addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20);
    addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19);
    addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18);
    addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17);
    addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
  }
}

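// *out0 = round_shift(in0), *out1 = round_shift(-in1): the iadst outputs
// alternate in sign, and this folds the negation into the final shift.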
static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
                             __m128i *out0, __m128i *out1,
                             const __m128i *clamp_lo, const __m128i *clamp_hi,
                             int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i a0 = _mm_add_epi32(offset, in0);
  __m128i a1 = _mm_sub_epi32(offset, in1);

  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

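// 4x4 inverse DCT. The block is transposed in registers first, so the 4-point
// butterfly then runs on all four columns at once; out_shift is unused because
// write_buffer_4x4 applies the final shift.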
static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                           int bd, int out_shift) {
  (void)out_shift;
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));

  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3, x, y;

  v0 = _mm_unpacklo_epi32(in[0], in[1]);
  v1 = _mm_unpackhi_epi32(in[0], in[1]);
  v2 = _mm_unpacklo_epi32(in[2], in[3]);
  v3 = _mm_unpackhi_epi32(in[2], in[3]);

  u0 = _mm_unpacklo_epi64(v0, v2);
  u1 = _mm_unpackhi_epi64(v0, v2);
  u2 = _mm_unpacklo_epi64(v1, v3);
  u3 = _mm_unpackhi_epi64(v1, v3);

  x = _mm_mullo_epi32(u0, cospi32);
  y = _mm_mullo_epi32(u2, cospi32);
  v0 = _mm_add_epi32(x, y);
  v0 = _mm_add_epi32(v0, rnding);
  v0 = _mm_srai_epi32(v0, bit);

  v1 = _mm_sub_epi32(x, y);
  v1 = _mm_add_epi32(v1, rnding);
  v1 = _mm_srai_epi32(v1, bit);

  x = _mm_mullo_epi32(u1, cospi48);
  y = _mm_mullo_epi32(u3, cospim16);
  v2 = _mm_add_epi32(x, y);
  v2 = _mm_add_epi32(v2, rnding);
  v2 = _mm_srai_epi32(v2, bit);

  x = _mm_mullo_epi32(u1, cospi16);
  y = _mm_mullo_epi32(u3, cospi48);
  v3 = _mm_add_epi32(x, y);
  v3 = _mm_add_epi32(v3, rnding);
  v3 = _mm_srai_epi32(v3, bit);

  if (do_cols) {
    addsub_no_clamp_sse4_1(v0, v3, out + 0, out + 3);
    addsub_no_clamp_sse4_1(v1, v2, out + 1, out + 2);
  } else {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);
  }
}

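// 4x4 inverse ADST, computed directly from the sinpi constants; the AV1
// 4-point ADST is defined by this sinpi formulation rather than a butterfly
// network.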
static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                            int bd, int out_shift) {
  (void)out_shift;
  const int32_t *sinpi = sinpi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
  __m128i t;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i x0, x1, x2, x3;
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  v0 = _mm_unpacklo_epi32(in[0], in[1]);
  v1 = _mm_unpackhi_epi32(in[0], in[1]);
  v2 = _mm_unpacklo_epi32(in[2], in[3]);
  v3 = _mm_unpackhi_epi32(in[2], in[3]);

  x0 = _mm_unpacklo_epi64(v0, v2);
  x1 = _mm_unpackhi_epi64(v0, v2);
  x2 = _mm_unpacklo_epi64(v1, v3);
  x3 = _mm_unpackhi_epi64(v1, v3);

  s0 = _mm_mullo_epi32(x0, sinpi1);
  s1 = _mm_mullo_epi32(x0, sinpi2);
  s2 = _mm_mullo_epi32(x1, sinpi3);
  s3 = _mm_mullo_epi32(x2, sinpi4);
  s4 = _mm_mullo_epi32(x2, sinpi1);
  s5 = _mm_mullo_epi32(x3, sinpi2);
  s6 = _mm_mullo_epi32(x3, sinpi4);
  t = _mm_sub_epi32(x0, x2);
  s7 = _mm_add_epi32(t, x3);

  t = _mm_add_epi32(s0, s3);
  s0 = _mm_add_epi32(t, s5);
  t = _mm_sub_epi32(s1, s4);
  s1 = _mm_sub_epi32(t, s6);
  s3 = s2;
  s2 = _mm_mullo_epi32(s7, sinpi3);

  u0 = _mm_add_epi32(s0, s3);
  u1 = _mm_add_epi32(s1, s3);
  u2 = s2;
  t = _mm_add_epi32(s0, s1);
  u3 = _mm_sub_epi32(t, s3);

  u0 = _mm_add_epi32(u0, rnding);
  u0 = _mm_srai_epi32(u0, bit);

  u1 = _mm_add_epi32(u1, rnding);
  u1 = _mm_srai_epi32(u1, bit);

  u2 = _mm_add_epi32(u2, rnding);
  u2 = _mm_srai_epi32(u2, bit);

  u3 = _mm_add_epi32(u3, rnding);
  u3 = _mm_srai_epi32(u3, bit);

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

    u0 = _mm_max_epi32(u0, clamp_lo);
    u0 = _mm_min_epi32(u0, clamp_hi);
    u1 = _mm_max_epi32(u1, clamp_lo);
    u1 = _mm_min_epi32(u1, clamp_hi);
    u2 = _mm_max_epi32(u2, clamp_lo);
    u2 = _mm_min_epi32(u2, clamp_hi);
    u3 = _mm_max_epi32(u3, clamp_lo);
    u3 = _mm_min_epi32(u3, clamp_hi);
  }

  out[0] = u0;
  out[1] = u1;
  out[2] = u2;
  out[3] = u3;
}

static INLINE void round_shift_4x4(__m128i *in, int shift) {
  __m128i rnding = _mm_set1_epi32(1 << (shift - 1));

  in[0] = _mm_add_epi32(in[0], rnding);
  in[1] = _mm_add_epi32(in[1], rnding);
  in[2] = _mm_add_epi32(in[2], rnding);
  in[3] = _mm_add_epi32(in[3], rnding);

  in[0] = _mm_srai_epi32(in[0], shift);
  in[1] = _mm_srai_epi32(in[1], shift);
  in[2] = _mm_srai_epi32(in[2], shift);
  in[3] = _mm_srai_epi32(in[3], shift);
}

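// Final 4x4 reconstruction: round-shift the residuals, add them to the
// predictor rows in 'output', clamp to bd bits, and store. fliplr reverses the
// lanes within each row and flipud reverses the row order.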
static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  const __m128i zero = _mm_setzero_si128();
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  round_shift_4x4(in, shift);

  v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
  v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
  v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
  v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));

  v0 = _mm_unpacklo_epi16(v0, zero);
  v1 = _mm_unpacklo_epi16(v1, zero);
  v2 = _mm_unpacklo_epi16(v2, zero);
  v3 = _mm_unpacklo_epi16(v3, zero);

  if (fliplr) {
    in[0] = _mm_shuffle_epi32(in[0], 0x1B);
    in[1] = _mm_shuffle_epi32(in[1], 0x1B);
    in[2] = _mm_shuffle_epi32(in[2], 0x1B);
    in[3] = _mm_shuffle_epi32(in[3], 0x1B);
  }

  if (flipud) {
    u0 = _mm_add_epi32(in[3], v0);
    u1 = _mm_add_epi32(in[2], v1);
    u2 = _mm_add_epi32(in[1], v2);
    u3 = _mm_add_epi32(in[0], v3);
  } else {
    u0 = _mm_add_epi32(in[0], v0);
    u1 = _mm_add_epi32(in[1], v1);
    u2 = _mm_add_epi32(in[2], v2);
    u3 = _mm_add_epi32(in[3], v3);
  }

  v0 = _mm_packus_epi32(u0, u1);
  v2 = _mm_packus_epi32(u2, u3);

  u0 = highbd_clamp_epi16(v0, bd);
  u2 = highbd_clamp_epi16(v2, bd);

  v0 = _mm_unpacklo_epi64(u0, u0);
  v1 = _mm_unpackhi_epi64(u0, u0);
  v2 = _mm_unpacklo_epi64(u2, u2);
  v3 = _mm_unpackhi_epi64(u2, u2);

  _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
  _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
  _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
  _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
}
static void highbd_clamp_epi32_sse4_1(const __m128i *in, __m128i *out,
                                      const __m128i *clamp_lo,
                                      const __m128i *clamp_hi, int size) {
  __m128i a0, a1;
  for (int i = 0; i < size; i += 4) {
    a0 = _mm_max_epi32(in[i], *clamp_lo);
    out[i] = _mm_min_epi32(a0, *clamp_hi);

    a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);

    a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);

    a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
  }
}
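// 4-point identity transform: scale by sqrt(2) in fixed point
// (NewSqrt2 / (1 << NewSqrt2Bits)), then transpose in registers so the output
// is laid out for the next pass.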
static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  (void)bit;
  (void)out_shift;
  __m128i v[4];
  __m128i fact = _mm_set1_epi32(NewSqrt2);
  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
  __m128i a0, a1;

  a0 = _mm_mullo_epi32(in[0], fact);
  a1 = _mm_mullo_epi32(in[1], fact);
  a0 = _mm_add_epi32(a0, offset);
  a1 = _mm_add_epi32(a1, offset);
  out[0] = _mm_srai_epi32(a0, NewSqrt2Bits);
  out[1] = _mm_srai_epi32(a1, NewSqrt2Bits);

  a0 = _mm_mullo_epi32(in[2], fact);
  a1 = _mm_mullo_epi32(in[3], fact);
  a0 = _mm_add_epi32(a0, offset);
  a1 = _mm_add_epi32(a1, offset);
  out[2] = _mm_srai_epi32(a0, NewSqrt2Bits);
  out[3] = _mm_srai_epi32(a1, NewSqrt2Bits);

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
  }

  // Transpose for 4x4
  v[0] = _mm_unpacklo_epi32(out[0], out[1]);
  v[1] = _mm_unpackhi_epi32(out[0], out[1]);
  v[2] = _mm_unpacklo_epi32(out[2], out[3]);
  v[3] = _mm_unpackhi_epi32(out[2], out[3]);

  out[0] = _mm_unpacklo_epi64(v[0], v[2]);
  out[1] = _mm_unpackhi_epi64(v[0], v[2]);
  out[2] = _mm_unpacklo_epi64(v[1], v[3]);
  out[3] = _mm_unpackhi_epi64(v[1], v[3]);
}
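// Entry point for the 4x4 inverse transform: run the row transform, then the
// column transform, then add the residue to the bd-bit predictor in 'output'.
// The FLIPADST variants reuse the iadst kernel and flip at write time.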
void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[4];
  const int8_t *shift = inv_txfm_shift_ls[TX_4X4];
  const int txw_idx = get_txw_idx(TX_4X4);
  const int txh_idx = get_txh_idx(TX_4X4);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case IDTX:
      load_buffer_4x4(coeff, in);
      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_DCT:
      load_buffer_4x4(coeff, in);
      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case H_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_ADST:
      load_buffer_4x4(coeff, in);
      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case H_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_FLIPADST:
      load_buffer_4x4(coeff, in);
      iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case H_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    default: assert(0);
  }
}

// 8x8
static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
  in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
  in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
  in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
  in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
  in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
  in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
  in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
  in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
  in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
  in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
  in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
  in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
}

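// 8x8 inverse DCT. Each 8-wide row occupies two __m128i registers (see the
// even/odd column note below), so the butterfly network runs twice, once per
// 4-lane half.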
static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                           int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x, y;
  int col;

  // Note:
  //  Even columns: 0, 2, ..., 14
  //  Odd columns: 1, 3, ..., 15
  //  One even column plus one odd column constructs one row (8 coeffs),
  //  so in total we have 8 rows (8x8).
  for (col = 0; col < 2; ++col) {
    // stage 0
    // stage 1
    // stage 2
    u0 = in[0 * 2 + col];
    u1 = in[4 * 2 + col];
    u2 = in[2 * 2 + col];
    u3 = in[6 * 2 + col];

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
    u4 = _mm_add_epi32(x, y);
    u4 = _mm_add_epi32(u4, rnding);
    u4 = _mm_srai_epi32(u4, bit);

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
    u7 = _mm_add_epi32(x, y);
    u7 = _mm_add_epi32(u7, rnding);
    u7 = _mm_srai_epi32(u7, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
    u5 = _mm_add_epi32(x, y);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
    u6 = _mm_add_epi32(x, y);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    // stage 3
    x = _mm_mullo_epi32(u0, cospi32);
    y = _mm_mullo_epi32(u1, cospi32);
    v0 = _mm_add_epi32(x, y);
    v0 = _mm_add_epi32(v0, rnding);
    v0 = _mm_srai_epi32(v0, bit);

    v1 = _mm_sub_epi32(x, y);
    v1 = _mm_add_epi32(v1, rnding);
    v1 = _mm_srai_epi32(v1, bit);

    x = _mm_mullo_epi32(u2, cospi48);
    y = _mm_mullo_epi32(u3, cospim16);
    v2 = _mm_add_epi32(x, y);
    v2 = _mm_add_epi32(v2, rnding);
    v2 = _mm_srai_epi32(v2, bit);

    x = _mm_mullo_epi32(u2, cospi16);
    y = _mm_mullo_epi32(u3, cospi48);
    v3 = _mm_add_epi32(x, y);
    v3 = _mm_add_epi32(v3, rnding);
    v3 = _mm_srai_epi32(v3, bit);

    addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
    addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

    // stage 4
    addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
    u4 = v4;
    u7 = v7;

    x = _mm_mullo_epi32(v5, cospi32);
    y = _mm_mullo_epi32(v6, cospi32);
    u6 = _mm_add_epi32(y, x);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    u5 = _mm_sub_epi32(y, x);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    // stage 5
    if (do_cols) {
      addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col);
      addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col);
      addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
      addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
      addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
    }
  }
}

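// 8x8 inverse ADST. The even-indexed and odd-indexed 4-lane halves are
// processed by two identical passes; stage 7 negates alternate outputs, which
// the non-column path folds into neg_shift_sse4_1.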
static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                            int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[8], v[8], x;

  // Even 8 points: 0, 2, ..., 14
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[14], cospi4);
  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[14], cospi60);
  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[10], cospi20);
  x = _mm_mullo_epi32(in[4], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[10], cospi44);
  x = _mm_mullo_epi32(in[4], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[6], cospi36);
  x = _mm_mullo_epi32(in[8], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[6], cospi28);
  x = _mm_mullo_epi32(in[8], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[2], cospi52);
  x = _mm_mullo_epi32(in[12], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[2], cospi12);
  x = _mm_mullo_epi32(in[12], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[2] = _mm_sub_epi32(kZero, u[4]);
    out[4] = u[6];
    out[6] = _mm_sub_epi32(kZero, u[2]);
    out[8] = u[3];
    out[10] = _mm_sub_epi32(kZero, u[7]);
    out[12] = u[5];
    out[14] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }

  // Odd 8 points: 1, 3, ..., 15
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[15], cospi4);
  x = _mm_mullo_epi32(in[1], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[15], cospi60);
  x = _mm_mullo_epi32(in[1], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[11], cospi20);
  x = _mm_mullo_epi32(in[5], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[11], cospi44);
  x = _mm_mullo_epi32(in[5], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[7], cospi36);
  x = _mm_mullo_epi32(in[9], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[7], cospi28);
  x = _mm_mullo_epi32(in[9], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[3], cospi52);
  x = _mm_mullo_epi32(in[13], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[3], cospi12);
  x = _mm_mullo_epi32(in[13], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[1] = u[0];
    out[3] = _mm_sub_epi32(kZero, u[4]);
    out[5] = u[6];
    out[7] = _mm_sub_epi32(kZero, u[2]);
    out[9] = u[3];
    out[11] = _mm_sub_epi32(kZero, u[7]);
    out[13] = u[5];
    out[15] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}
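// Round-shift and clamp 'size' vectors; the loop body is unrolled four wide.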
static void shift_sse4_1(const __m128i *in, __m128i *out,
                         const __m128i *clamp_lo, const __m128i *clamp_hi,
                         int shift, int size) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i shift_vec = _mm_cvtsi32_si128(shift);
  __m128i a0, a1;
  for (int i = 0; i < size; i += 4) {
    a0 = _mm_add_epi32(in[i], offset);
    a1 = _mm_add_epi32(in[i + 1], offset);
    a0 = _mm_sra_epi32(a0, shift_vec);
    a1 = _mm_sra_epi32(a1, shift_vec);
    a0 = _mm_max_epi32(a0, *clamp_lo);
    a1 = _mm_max_epi32(a1, *clamp_lo);
    out[i] = _mm_min_epi32(a0, *clamp_hi);
    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);

    a0 = _mm_add_epi32(in[i + 2], offset);
    a1 = _mm_add_epi32(in[i + 3], offset);
    a0 = _mm_sra_epi32(a0, shift_vec);
    a1 = _mm_sra_epi32(a1, shift_vec);
    a0 = _mm_max_epi32(a0, *clamp_lo);
    a1 = _mm_max_epi32(a1, *clamp_lo);
    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
  }
}

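// 8-point identity transform: the N = 8 identity scales by exactly 2, hence
// the add-to-self.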
static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  (void)bit;
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i v[8];
  v[0] = _mm_add_epi32(in[0], in[0]);
  v[1] = _mm_add_epi32(in[1], in[1]);
  v[2] = _mm_add_epi32(in[2], in[2]);
  v[3] = _mm_add_epi32(in[3], in[3]);
  v[4] = _mm_add_epi32(in[4], in[4]);
  v[5] = _mm_add_epi32(in[5], in[5]);
  v[6] = _mm_add_epi32(in[6], in[6]);
  v[7] = _mm_add_epi32(in[7], in[7]);

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 8);
  } else {
    highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 8);
  }
}

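// Rounds and shifts all 16 residual vectors of an 8x8 block (two __m128i of
// four int32 values per row) in groups of four via round_shift_4x4.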
static void round_shift_8x8(__m128i *in, int shift) {
  round_shift_4x4(&in[0], shift);
  round_shift_4x4(&in[4], shift);
  round_shift_4x4(&in[8], shift);
  round_shift_4x4(&in[12], shift);
}

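// Adds one row of 32-bit residuals (res_lo holds columns 0-3, res_hi columns
// 4-7) to the 16-bit prediction in 'pred', optionally mirroring the residual
// left-right, and clamps the result to [0, (1 << bd) - 1].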
static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
                             int fliplr, int bd) {
  __m128i x0, x1;
  const __m128i zero = _mm_setzero_si128();

  x0 = _mm_unpacklo_epi16(pred, zero);
  x1 = _mm_unpackhi_epi16(pred, zero);

  if (fliplr) {
    res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
    res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
    x0 = _mm_add_epi32(res_hi, x0);
    x1 = _mm_add_epi32(res_lo, x1);
  } else {
    x0 = _mm_add_epi32(res_lo, x0);
    x1 = _mm_add_epi32(res_hi, x1);
  }

  x0 = _mm_packus_epi32(x0, x1);
  return highbd_clamp_epi16(x0, bd);
}

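// Reconstructs an 8x8 block in place in 'output' (stride in uint16_t units):
// round-shifts the residual by 'shift', adds it to the prediction with
// optional left-right (fliplr) and up-down (flipud) flipping, and stores the
// clamped result.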
static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;

  round_shift_8x8(in, shift);

  v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
  v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
  v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
  v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
  v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
  v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
  v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
  v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));

  if (flipud) {
    u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
    u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
    u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
    u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
    u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
    u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
    u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
    u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
  } else {
    u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
    u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
    u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
    u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
    u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
    u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
    u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
    u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
  }

  _mm_store_si128((__m128i *)(output + 0 * stride), u0);
  _mm_store_si128((__m128i *)(output + 1 * stride), u1);
  _mm_store_si128((__m128i *)(output + 2 * stride), u2);
  _mm_store_si128((__m128i *)(output + 3 * stride), u3);
  _mm_store_si128((__m128i *)(output + 4 * stride), u4);
  _mm_store_si128((__m128i *)(output + 5 * stride), u5);
  _mm_store_si128((__m128i *)(output + 6 * stride), u6);
  _mm_store_si128((__m128i *)(output + 7 * stride), u7);
}

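// 2D 8x8 inverse transform plus reconstruction. For every transform type the
// row pass runs first (do_cols = 0, intermediate down-shift -shift[0]), then
// the column pass (do_cols = 1), and write_buffer_8x8 applies the final
// -shift[1] while adding the residual to the prediction. The FLIPADST
// variants reuse the plain ADST kernels and flip during the write instead.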
void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[16], out[16];
  const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
  const int txw_idx = get_txw_idx(TX_8X8);
  const int txh_idx = get_txh_idx(TX_8X8);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
      break;
    default: assert(0);
  }
}

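// 8-point inverse DCT fast path for the DC-only case: with only in[0]
// nonzero, every output reduces to in[0] * cospi[32] (rounded), so the value
// is computed once and broadcast to all eight outputs.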
static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                                int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i x;

  // stage 0
  // stage 1
  // stage 2
  // stage 3
  x = _mm_mullo_epi32(in[0], cospi32);
  x = _mm_add_epi32(x, rnding);
  x = _mm_srai_epi32(x, bit);

  // stage 4
  // stage 5
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
    x = _mm_add_epi32(x, offset);
    x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
    x = _mm_max_epi32(x, clamp_lo_out);
    x = _mm_min_epi32(x, clamp_hi_out);
  }

  out[0] = x;
  out[1] = x;
  out[2] = x;
  out[3] = x;
  out[4] = x;
  out[5] = x;
  out[6] = x;
  out[7] = x;
}

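// Full 8-point inverse DCT on eight __m128i columns. Stage 2 rotates the odd
// inputs, stage 3 handles the even half and the first odd-side add/sub pass,
// stage 4 applies the cospi32 rotation to the middle pair, and stage 5 forms
// the final add/sub outputs (clamped and shifted for the row pass).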
static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x, y;

  // stage 0
  // stage 1
  // stage 2
  u0 = in[0];
  u1 = in[4];
  u2 = in[2];
  u3 = in[6];

  x = _mm_mullo_epi32(in[1], cospi56);
  y = _mm_mullo_epi32(in[7], cospim8);
  u4 = _mm_add_epi32(x, y);
  u4 = _mm_add_epi32(u4, rnding);
  u4 = _mm_srai_epi32(u4, bit);

  x = _mm_mullo_epi32(in[1], cospi8);
  y = _mm_mullo_epi32(in[7], cospi56);
  u7 = _mm_add_epi32(x, y);
  u7 = _mm_add_epi32(u7, rnding);
  u7 = _mm_srai_epi32(u7, bit);

  x = _mm_mullo_epi32(in[5], cospi24);
  y = _mm_mullo_epi32(in[3], cospim40);
  u5 = _mm_add_epi32(x, y);
  u5 = _mm_add_epi32(u5, rnding);
  u5 = _mm_srai_epi32(u5, bit);

  x = _mm_mullo_epi32(in[5], cospi40);
  y = _mm_mullo_epi32(in[3], cospi24);
  u6 = _mm_add_epi32(x, y);
  u6 = _mm_add_epi32(u6, rnding);
  u6 = _mm_srai_epi32(u6, bit);

  // stage 3
  x = _mm_mullo_epi32(u0, cospi32);
  y = _mm_mullo_epi32(u1, cospi32);
  v0 = _mm_add_epi32(x, y);
  v0 = _mm_add_epi32(v0, rnding);
  v0 = _mm_srai_epi32(v0, bit);

  v1 = _mm_sub_epi32(x, y);
  v1 = _mm_add_epi32(v1, rnding);
  v1 = _mm_srai_epi32(v1, bit);

  x = _mm_mullo_epi32(u2, cospi48);
  y = _mm_mullo_epi32(u3, cospim16);
  v2 = _mm_add_epi32(x, y);
  v2 = _mm_add_epi32(v2, rnding);
  v2 = _mm_srai_epi32(v2, bit);

  x = _mm_mullo_epi32(u2, cospi16);
  y = _mm_mullo_epi32(u3, cospi48);
  v3 = _mm_add_epi32(x, y);
  v3 = _mm_add_epi32(v3, rnding);
  v3 = _mm_srai_epi32(v3, bit);

  addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

  // stage 4
  addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
  u4 = v4;
  u7 = v7;

  x = _mm_mullo_epi32(v5, cospi32);
  y = _mm_mullo_epi32(v6, cospi32);
  u6 = _mm_add_epi32(y, x);
  u6 = _mm_add_epi32(u6, rnding);
  u6 = _mm_srai_epi32(u6, bit);

  u5 = _mm_sub_epi32(y, x);
  u5 = _mm_add_epi32(u5, rnding);
  u5 = _mm_srai_epi32(u5, bit);

  // stage 5
  if (do_cols) {
    addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7);
    addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6);
    addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5);
    addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
                        out_shift);
    addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
                        out_shift);
    addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
                        out_shift);
    addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
                        out_shift);
  }
}

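// 8-point inverse ADST specialized for a single nonzero input (in[0]): only
// the stages that produce nonzero terms are evaluated, and no intermediate
// clamping is performed.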
static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                 int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  __m128i u[8], x;

  // stage 0
  // stage 1
  // stage 2

  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(x, rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(kZero, x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // stage 3
  // stage 4
  __m128i temp1, temp2;
  temp1 = _mm_mullo_epi32(u[0], cospi16);
  x = _mm_mullo_epi32(u[1], cospi48);
  temp1 = _mm_add_epi32(temp1, x);
  temp1 = _mm_add_epi32(temp1, rnding);
  temp1 = _mm_srai_epi32(temp1, bit);
  u[4] = temp1;

  temp2 = _mm_mullo_epi32(u[0], cospi48);
  x = _mm_mullo_epi32(u[1], cospi16);
  u[5] = _mm_sub_epi32(temp2, x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // stage 5
  // stage 6
  temp1 = _mm_mullo_epi32(u[0], cospi32);
  x = _mm_mullo_epi32(u[1], cospi32);
  u[2] = _mm_add_epi32(temp1, x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(temp1, x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  temp1 = _mm_mullo_epi32(u[4], cospi32);
  x = _mm_mullo_epi32(u[5], cospi32);
  u[6] = _mm_add_epi32(temp1, x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(temp1, x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm_sub_epi32(kZero, u[4]);
    out[2] = u[6];
    out[3] = _mm_sub_epi32(kZero, u[2]);
    out[4] = u[3];
    out[5] = _mm_sub_epi32(kZero, u[7]);
    out[6] = u[5];
    out[7] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
  }
}

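// Full 8-point inverse ADST. Stage 2 applies the input rotations, stages 3-6
// alternate add/sub butterflies with cospi16/cospi48 and cospi32 rotations,
// and stage 7 writes the outputs with the alternating sign pattern the ADST
// requires (negated lanes interleaved with positive ones).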
static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                                int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[8], v[8], x;

  // stage 0
  // stage 1
  // stage 2

  u[0] = _mm_mullo_epi32(in[7], cospi4);
  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[7], cospi60);
  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[5], cospi20);
  x = _mm_mullo_epi32(in[2], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[5], cospi44);
  x = _mm_mullo_epi32(in[2], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[3], cospi36);
  x = _mm_mullo_epi32(in[4], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[3], cospi28);
  x = _mm_mullo_epi32(in[4], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[1], cospi52);
  x = _mm_mullo_epi32(in[6], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[1], cospi12);
  x = _mm_mullo_epi32(in[6], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm_sub_epi32(kZero, u[4]);
    out[2] = u[6];
    out[3] = _mm_sub_epi32(kZero, u[2]);
    out[4] = u[3];
    out[5] = _mm_sub_epi32(kZero, u[7]);
    out[6] = u[5];
    out[7] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
  }
}

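// 16-point inverse DCT fast path for the DC-only case: the single product
// in[0] * cospi[32] is clamped (column pass) or round-shifted and clamped
// (row pass), then broadcast to all 16 outputs.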
static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  {
    // stage 0
    // stage 1
    // stage 2
    // stage 3
    // stage 4
    in[0] = _mm_mullo_epi32(in[0], cospi32);
    in[0] = _mm_add_epi32(in[0], rnding);
    in[0] = _mm_srai_epi32(in[0], bit);

    // stage 5
    // stage 6
    // stage 7
    if (do_cols) {
      in[0] = _mm_max_epi32(in[0], clamp_lo);
      in[0] = _mm_min_epi32(in[0], clamp_hi);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
      in[0] = _mm_add_epi32(in[0], offset);
      in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
      in[0] = _mm_max_epi32(in[0], clamp_lo_out);
      in[0] = _mm_min_epi32(in[0], clamp_hi_out);
    }

    out[0] = in[0];
    out[1] = in[0];
    out[2] = in[0];
    out[3] = in[0];
    out[4] = in[0];
    out[5] = in[0];
    out[6] = in[0];
    out[7] = in[0];
    out[8] = in[0];
    out[9] = in[0];
    out[10] = in[0];
    out[11] = in[0];
    out[12] = in[0];
    out[13] = in[0];
    out[14] = in[0];
    out[15] = in[0];
  }
}

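// 16-point inverse DCT for inputs with at most the first eight coefficients
// nonzero: half_btf_0_sse4_1 replaces the full butterflies wherever one
// operand is known to be zero.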
static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[16], x, y;

  {
    // stage 0
    // stage 1
    u[0] = in[0];
    u[2] = in[4];
    u[4] = in[2];
    u[6] = in[6];
    u[8] = in[1];
    u[10] = in[5];
    u[12] = in[3];
    u[14] = in[7];

    // stage 2
    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);

    u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
    u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);

    u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
    u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);

    u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
    u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);

    // stage 3
    u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
    u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
    u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
    u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);

    addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);

    // stage 4
    x = _mm_mullo_epi32(u[0], cospi32);
    u[0] = _mm_add_epi32(x, rnding);
    u[0] = _mm_srai_epi32(u[0], bit);
    u[1] = u[0];

    u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
    u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);

    addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);

    x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = x;
    y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    u[10] = y;

    // stage 5
    addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);

    x = _mm_mullo_epi32(u[5], cospi32);
    y = _mm_mullo_epi32(u[6], cospi32);
    u[5] = _mm_sub_epi32(y, x);
    u[5] = _mm_add_epi32(u[5], rnding);
    u[5] = _mm_srai_epi32(u[5], bit);

    u[6] = _mm_add_epi32(y, x);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    // stage 6
    addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);

    x = _mm_mullo_epi32(u[10], cospi32);
    y = _mm_mullo_epi32(u[13], cospi32);
    u[10] = _mm_sub_epi32(y, x);
    u[10] = _mm_add_epi32(u[10], rnding);
    u[10] = _mm_srai_epi32(u[10], bit);

    u[13] = _mm_add_epi32(x, y);
    u[13] = _mm_add_epi32(u[13], rnding);
    u[13] = _mm_srai_epi32(u[13], bit);

    x = _mm_mullo_epi32(u[11], cospi32);
    y = _mm_mullo_epi32(u[12], cospi32);
    u[11] = _mm_sub_epi32(y, x);
    u[11] = _mm_add_epi32(u[11], rnding);
    u[11] = _mm_srai_epi32(u[11], bit);

    u[12] = _mm_add_epi32(x, y);
    u[12] = _mm_add_epi32(u[12], rnding);
    u[12] = _mm_srai_epi32(u[12], bit);

    // stage 7
    if (do_cols) {
      addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15);
      addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14);
      addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13);
      addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12);
      addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11);
      addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10);
      addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9);
      addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

      addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
                          &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
                          &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
                          &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
                          &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
                          &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
                          &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
                          &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
                          &clamp_hi_out, out_shift);
    }
  }
}

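// 16-point inverse ADST for a single nonzero input (in[0]): most butterflies
// degenerate to copies, leaving only the cospi rotations below, and no
// intermediate clamping is performed.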
static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i zero = _mm_setzero_si128();
  __m128i v[16], x, y, temp1, temp2;

  // Calculate columns 0, 1, 2, 3
  {
    // stage 0
    // stage 1
    // stage 2
    x = _mm_mullo_epi32(in[0], cospi62);
    v[0] = _mm_add_epi32(x, rnding);
    v[0] = _mm_srai_epi32(v[0], bit);

    x = _mm_mullo_epi32(in[0], cospi2);
    v[1] = _mm_sub_epi32(zero, x);
    v[1] = _mm_add_epi32(v[1], rnding);
    v[1] = _mm_srai_epi32(v[1], bit);

    // stage 3
    v[8] = v[0];
    v[9] = v[1];

    // stage 4
    temp1 = _mm_mullo_epi32(v[8], cospi8);
    x = _mm_mullo_epi32(v[9], cospi56);
    temp1 = _mm_add_epi32(temp1, x);
    temp1 = _mm_add_epi32(temp1, rnding);
    temp1 = _mm_srai_epi32(temp1, bit);

    temp2 = _mm_mullo_epi32(v[8], cospi56);
    x = _mm_mullo_epi32(v[9], cospi8);
    temp2 = _mm_sub_epi32(temp2, x);
    temp2 = _mm_add_epi32(temp2, rnding);
    temp2 = _mm_srai_epi32(temp2, bit);
    v[8] = temp1;
    v[9] = temp2;

    // stage 5
    v[4] = v[0];
    v[5] = v[1];
    v[12] = v[8];
    v[13] = v[9];

    // stage 6
    temp1 = _mm_mullo_epi32(v[4], cospi16);
    x = _mm_mullo_epi32(v[5], cospi48);
    temp1 = _mm_add_epi32(temp1, x);
    temp1 = _mm_add_epi32(temp1, rnding);
    temp1 = _mm_srai_epi32(temp1, bit);

    temp2 = _mm_mullo_epi32(v[4], cospi48);
    x = _mm_mullo_epi32(v[5], cospi16);
    temp2 = _mm_sub_epi32(temp2, x);
    temp2 = _mm_add_epi32(temp2, rnding);
    temp2 = _mm_srai_epi32(temp2, bit);
    v[4] = temp1;
    v[5] = temp2;

    temp1 = _mm_mullo_epi32(v[12], cospi16);
    x = _mm_mullo_epi32(v[13], cospi48);
    temp1 = _mm_add_epi32(temp1, x);
    temp1 = _mm_add_epi32(temp1, rnding);
    temp1 = _mm_srai_epi32(temp1, bit);

    temp2 = _mm_mullo_epi32(v[12], cospi48);
    x = _mm_mullo_epi32(v[13], cospi16);
    temp2 = _mm_sub_epi32(temp2, x);
    temp2 = _mm_add_epi32(temp2, rnding);
    temp2 = _mm_srai_epi32(temp2, bit);
    v[12] = temp1;
    v[13] = temp2;

    // stage 7
    v[2] = v[0];
    v[3] = v[1];
    v[6] = v[4];
    v[7] = v[5];
    v[10] = v[8];
    v[11] = v[9];
    v[14] = v[12];
    v[15] = v[13];

    // stage 8
    y = _mm_mullo_epi32(v[2], cospi32);
    x = _mm_mullo_epi32(v[3], cospi32);
    v[2] = _mm_add_epi32(y, x);
    v[2] = _mm_add_epi32(v[2], rnding);
    v[2] = _mm_srai_epi32(v[2], bit);

    v[3] = _mm_sub_epi32(y, x);
    v[3] = _mm_add_epi32(v[3], rnding);
    v[3] = _mm_srai_epi32(v[3], bit);

    y = _mm_mullo_epi32(v[6], cospi32);
    x = _mm_mullo_epi32(v[7], cospi32);
    v[6] = _mm_add_epi32(y, x);
    v[6] = _mm_add_epi32(v[6], rnding);
    v[6] = _mm_srai_epi32(v[6], bit);

    v[7] = _mm_sub_epi32(y, x);
    v[7] = _mm_add_epi32(v[7], rnding);
    v[7] = _mm_srai_epi32(v[7], bit);

    y = _mm_mullo_epi32(v[10], cospi32);
    x = _mm_mullo_epi32(v[11], cospi32);
    v[10] = _mm_add_epi32(y, x);
    v[10] = _mm_add_epi32(v[10], rnding);
    v[10] = _mm_srai_epi32(v[10], bit);

    v[11] = _mm_sub_epi32(y, x);
    v[11] = _mm_add_epi32(v[11], rnding);
    v[11] = _mm_srai_epi32(v[11], bit);

    y = _mm_mullo_epi32(v[14], cospi32);
    x = _mm_mullo_epi32(v[15], cospi32);
    v[14] = _mm_add_epi32(y, x);
    v[14] = _mm_add_epi32(v[14], rnding);
    v[14] = _mm_srai_epi32(v[14], bit);

    v[15] = _mm_sub_epi32(y, x);
    v[15] = _mm_add_epi32(v[15], rnding);
    v[15] = _mm_srai_epi32(v[15], bit);

    // stage 9
    if (do_cols) {
      out[0] = v[0];
      out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
      out[2] = v[12];
      out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
      out[4] = v[6];
      out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
      out[6] = v[10];
      out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
      out[8] = v[3];
      out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
      out[10] = v[15];
      out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
      out[12] = v[5];
      out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
      out[14] = v[9];
      out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
      const __m128i clamp_hi_out =
          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

      neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
    }
  }
}

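// 16-point inverse ADST for inputs with at most the first eight coefficients
// nonzero: stage 2 degenerates to single multiplies, after which the full
// butterfly network runs with intermediate clamping.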
static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[16], x, y;

  // Calculate columns 0, 1, 2, 3
  {
    // stage 0
    // stage 1
    // stage 2
    __m128i zero = _mm_setzero_si128();
    x = _mm_mullo_epi32(in[0], cospi62);
    u[0] = _mm_add_epi32(x, rnding);
    u[0] = _mm_srai_epi32(u[0], bit);

    x = _mm_mullo_epi32(in[0], cospi2);
    u[1] = _mm_sub_epi32(zero, x);
    u[1] = _mm_add_epi32(u[1], rnding);
    u[1] = _mm_srai_epi32(u[1], bit);

    x = _mm_mullo_epi32(in[2], cospi54);
    u[2] = _mm_add_epi32(x, rnding);
    u[2] = _mm_srai_epi32(u[2], bit);

    x = _mm_mullo_epi32(in[2], cospi10);
    u[3] = _mm_sub_epi32(zero, x);
    u[3] = _mm_add_epi32(u[3], rnding);
    u[3] = _mm_srai_epi32(u[3], bit);

    x = _mm_mullo_epi32(in[4], cospi46);
    u[4] = _mm_add_epi32(x, rnding);
    u[4] = _mm_srai_epi32(u[4], bit);

    x = _mm_mullo_epi32(in[4], cospi18);
    u[5] = _mm_sub_epi32(zero, x);
    u[5] = _mm_add_epi32(u[5], rnding);
    u[5] = _mm_srai_epi32(u[5], bit);

    x = _mm_mullo_epi32(in[6], cospi38);
    u[6] = _mm_add_epi32(x, rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    x = _mm_mullo_epi32(in[6], cospi26);
    u[7] = _mm_sub_epi32(zero, x);
    u[7] = _mm_add_epi32(u[7], rnding);
    u[7] = _mm_srai_epi32(u[7], bit);

    u[8] = _mm_mullo_epi32(in[7], cospi34);
    u[8] = _mm_add_epi32(u[8], rnding);
    u[8] = _mm_srai_epi32(u[8], bit);

    u[9] = _mm_mullo_epi32(in[7], cospi30);
    u[9] = _mm_add_epi32(u[9], rnding);
    u[9] = _mm_srai_epi32(u[9], bit);

    u[10] = _mm_mullo_epi32(in[5], cospi42);
    u[10] = _mm_add_epi32(u[10], rnding);
    u[10] = _mm_srai_epi32(u[10], bit);

    u[11] = _mm_mullo_epi32(in[5], cospi22);
    u[11] = _mm_add_epi32(u[11], rnding);
    u[11] = _mm_srai_epi32(u[11], bit);

    u[12] = _mm_mullo_epi32(in[3], cospi50);
    u[12] = _mm_add_epi32(u[12], rnding);
    u[12] = _mm_srai_epi32(u[12], bit);

    u[13] = _mm_mullo_epi32(in[3], cospi14);
    u[13] = _mm_add_epi32(u[13], rnding);
    u[13] = _mm_srai_epi32(u[13], bit);

    u[14] = _mm_mullo_epi32(in[1], cospi58);
    u[14] = _mm_add_epi32(u[14], rnding);
    u[14] = _mm_srai_epi32(u[14], bit);

    u[15] = _mm_mullo_epi32(in[1], cospi6);
    u[15] = _mm_add_epi32(u[15], rnding);
    u[15] = _mm_srai_epi32(u[15], bit);

    // stage 3
    addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);

    // stage 4
    y = _mm_mullo_epi32(u[8], cospi56);
    x = _mm_mullo_epi32(u[9], cospi56);
    u[8] = _mm_mullo_epi32(u[8], cospi8);
    u[8] = _mm_add_epi32(u[8], x);
    u[8] = _mm_add_epi32(u[8], rnding);
    u[8] = _mm_srai_epi32(u[8], bit);

    x = _mm_mullo_epi32(u[9], cospi8);
    u[9] = _mm_sub_epi32(y, x);
    u[9] = _mm_add_epi32(u[9], rnding);
    u[9] = _mm_srai_epi32(u[9], bit);

    x = _mm_mullo_epi32(u[11], cospi24);
    y = _mm_mullo_epi32(u[10], cospi24);
    u[10] = _mm_mullo_epi32(u[10], cospi40);
    u[10] = _mm_add_epi32(u[10], x);
    u[10] = _mm_add_epi32(u[10], rnding);
    u[10] = _mm_srai_epi32(u[10], bit);

    x = _mm_mullo_epi32(u[11], cospi40);
    u[11] = _mm_sub_epi32(y, x);
    u[11] = _mm_add_epi32(u[11], rnding);
    u[11] = _mm_srai_epi32(u[11], bit);

    x = _mm_mullo_epi32(u[13], cospi8);
    y = _mm_mullo_epi32(u[12], cospi8);
    u[12] = _mm_mullo_epi32(u[12], cospim56);
    u[12] = _mm_add_epi32(u[12], x);
    u[12] = _mm_add_epi32(u[12], rnding);
    u[12] = _mm_srai_epi32(u[12], bit);

    x = _mm_mullo_epi32(u[13], cospim56);
    u[13] = _mm_sub_epi32(y, x);
    u[13] = _mm_add_epi32(u[13], rnding);
    u[13] = _mm_srai_epi32(u[13], bit);

    x = _mm_mullo_epi32(u[15], cospi40);
    y = _mm_mullo_epi32(u[14], cospi40);
    u[14] = _mm_mullo_epi32(u[14], cospim24);
    u[14] = _mm_add_epi32(u[14], x);
    u[14] = _mm_add_epi32(u[14], rnding);
    u[14] = _mm_srai_epi32(u[14], bit);

    x = _mm_mullo_epi32(u[15], cospim24);
    u[15] = _mm_sub_epi32(y, x);
    u[15] = _mm_add_epi32(u[15], rnding);
    u[15] = _mm_srai_epi32(u[15], bit);

    // stage 5
    addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);

    // stage 6
    x = _mm_mullo_epi32(u[5], cospi48);
    y = _mm_mullo_epi32(u[4], cospi48);
    u[4] = _mm_mullo_epi32(u[4], cospi16);
    u[4] = _mm_add_epi32(u[4], x);
    u[4] = _mm_add_epi32(u[4], rnding);
    u[4] = _mm_srai_epi32(u[4], bit);

    x = _mm_mullo_epi32(u[5], cospi16);
    u[5] = _mm_sub_epi32(y, x);
    u[5] = _mm_add_epi32(u[5], rnding);
    u[5] = _mm_srai_epi32(u[5], bit);

    x = _mm_mullo_epi32(u[7], cospi16);
    y = _mm_mullo_epi32(u[6], cospi16);
    u[6] = _mm_mullo_epi32(u[6], cospim48);
    u[6] = _mm_add_epi32(u[6], x);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    x = _mm_mullo_epi32(u[7], cospim48);
    u[7] = _mm_sub_epi32(y, x);
    u[7] = _mm_add_epi32(u[7], rnding);
    u[7] = _mm_srai_epi32(u[7], bit);

    x = _mm_mullo_epi32(u[13], cospi48);
    y = _mm_mullo_epi32(u[12], cospi48);
    u[12] = _mm_mullo_epi32(u[12], cospi16);
    u[12] = _mm_add_epi32(u[12], x);
    u[12] = _mm_add_epi32(u[12], rnding);
    u[12] = _mm_srai_epi32(u[12], bit);

    x = _mm_mullo_epi32(u[13], cospi16);
    u[13] = _mm_sub_epi32(y, x);
    u[13] = _mm_add_epi32(u[13], rnding);
    u[13] = _mm_srai_epi32(u[13], bit);

    x = _mm_mullo_epi32(u[15], cospi16);
    y = _mm_mullo_epi32(u[14], cospi16);
    u[14] = _mm_mullo_epi32(u[14], cospim48);
    u[14] = _mm_add_epi32(u[14], x);
    u[14] = _mm_add_epi32(u[14], rnding);
    u[14] = _mm_srai_epi32(u[14], bit);

    x = _mm_mullo_epi32(u[15], cospim48);
    u[15] = _mm_sub_epi32(y, x);
    u[15] = _mm_add_epi32(u[15], rnding);
    u[15] = _mm_srai_epi32(u[15], bit);

    // stage 7
    addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);

    // stage 8
    y = _mm_mullo_epi32(u[2], cospi32);
    x = _mm_mullo_epi32(u[3], cospi32);
    u[2] = _mm_add_epi32(y, x);
    u[2] = _mm_add_epi32(u[2], rnding);
    u[2] = _mm_srai_epi32(u[2], bit);

    u[3] = _mm_sub_epi32(y, x);
    u[3] = _mm_add_epi32(u[3], rnding);
    u[3] = _mm_srai_epi32(u[3], bit);

    y = _mm_mullo_epi32(u[6], cospi32);
    x = _mm_mullo_epi32(u[7], cospi32);
    u[6] = _mm_add_epi32(y, x);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    u[7] = _mm_sub_epi32(y, x);
    u[7] = _mm_add_epi32(u[7], rnding);
    u[7] = _mm_srai_epi32(u[7], bit);

    y = _mm_mullo_epi32(u[10], cospi32);
    x = _mm_mullo_epi32(u[11], cospi32);
    u[10] = _mm_add_epi32(y, x);
    u[10] = _mm_add_epi32(u[10], rnding);
    u[10] = _mm_srai_epi32(u[10], bit);

    u[11] = _mm_sub_epi32(y, x);
    u[11] = _mm_add_epi32(u[11], rnding);
    u[11] = _mm_srai_epi32(u[11], bit);

    y = _mm_mullo_epi32(u[14], cospi32);
    x = _mm_mullo_epi32(u[15], cospi32);
    u[14] = _mm_add_epi32(y, x);
    u[14] = _mm_add_epi32(u[14], rnding);
    u[14] = _mm_srai_epi32(u[14], bit);

    u[15] = _mm_sub_epi32(y, x);
    u[15] = _mm_add_epi32(u[15], rnding);
    u[15] = _mm_srai_epi32(u[15], bit);

    // stage 9
    if (do_cols) {
      out[0] = u[0];
      out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]);
      out[2] = u[12];
      out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]);
      out[4] = u[6];
      out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]);
      out[6] = u[10];
      out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]);
      out[8] = u[3];
      out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]);
      out[10] = u[15];
      out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]);
      out[12] = u[5];
      out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]);
      out[14] = u[9];
      out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
      const __m128i clamp_hi_out =
          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

      neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
    }
  }
}

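// Full 16-point inverse DCT on sixteen __m128i columns: stage 2 rotates the
// odd inputs, stage 3 the odd half of the even inputs, and the later stages
// combine add/sub butterflies with cospi32 rotations.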
idct16x16_sse4_1(__m128i * in,__m128i * out,int bit,int do_cols,int bd,int out_shift)2582 static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
2583                              int bd, int out_shift) {
2584   const int32_t *cospi = cospi_arr(bit);
2585   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
2586   const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
2587   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
2588   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
2589   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
2590   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
2591   const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
2592   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
2593   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
2594   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
2595   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
2596   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
2597   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2598   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
2599   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2600   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
2601   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2602   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2603   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2604   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2605   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2606   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
2607   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2608   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2609   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2610   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2611   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2612   __m128i u[16], v[16], x, y;
2613 
2614   {
2615     // stage 0
2616     // stage 1
2617     u[0] = in[0];
2618     u[1] = in[8];
2619     u[2] = in[4];
2620     u[3] = in[12];
2621     u[4] = in[2];
2622     u[5] = in[10];
2623     u[6] = in[6];
2624     u[7] = in[14];
2625     u[8] = in[1];
2626     u[9] = in[9];
2627     u[10] = in[5];
2628     u[11] = in[13];
2629     u[12] = in[3];
2630     u[13] = in[11];
2631     u[14] = in[7];
2632     u[15] = in[15];
2633 
2634     // stage 2
2635     v[0] = u[0];
2636     v[1] = u[1];
2637     v[2] = u[2];
2638     v[3] = u[3];
2639     v[4] = u[4];
2640     v[5] = u[5];
2641     v[6] = u[6];
2642     v[7] = u[7];
2643 
2644     v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
2645     v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
2646     v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
2647     v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
2648     v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
2649     v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
2650     v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
2651     v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
2652 
2653     // stage 3
2654     u[0] = v[0];
2655     u[1] = v[1];
2656     u[2] = v[2];
2657     u[3] = v[3];
2658     u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
2659     u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
2660     u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
2661     u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
2662     addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
2663     addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
2664     addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
2665     addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
2666 
2667     // stage 4
2668     x = _mm_mullo_epi32(u[0], cospi32);
2669     y = _mm_mullo_epi32(u[1], cospi32);
2670     v[0] = _mm_add_epi32(x, y);
2671     v[0] = _mm_add_epi32(v[0], rnding);
2672     v[0] = _mm_srai_epi32(v[0], bit);
2673 
2674     v[1] = _mm_sub_epi32(x, y);
2675     v[1] = _mm_add_epi32(v[1], rnding);
2676     v[1] = _mm_srai_epi32(v[1], bit);
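    // Editor's note: both outputs share the cospi32 weight, so the products
    // x and y are formed once and reused for the sum (v[0]) and difference
    // (v[1]), saving two multiplies versus two half_btf_sse4_1() calls.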
2677 
2678     v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
2679     v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
2680     addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
2681     addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
2682     v[8] = u[8];
2683     v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
2684     v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
2685     v[11] = u[11];
2686     v[12] = u[12];
2687     v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
2688     v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
2689     v[15] = u[15];
2690 
2691     // stage 5
2692     addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
2693     addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
2694     u[4] = v[4];
2695 
2696     x = _mm_mullo_epi32(v[5], cospi32);
2697     y = _mm_mullo_epi32(v[6], cospi32);
2698     u[5] = _mm_sub_epi32(y, x);
2699     u[5] = _mm_add_epi32(u[5], rnding);
2700     u[5] = _mm_srai_epi32(u[5], bit);
2701 
2702     u[6] = _mm_add_epi32(y, x);
2703     u[6] = _mm_add_epi32(u[6], rnding);
2704     u[6] = _mm_srai_epi32(u[6], bit);
2705 
2706     u[7] = v[7];
2707     addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
2708     addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
2709     addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
2710     addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
2711 
2712     // stage 6
2713     addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
2714     addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
2715     addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
2716     addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
2717     v[8] = u[8];
2718     v[9] = u[9];
2719 
2720     x = _mm_mullo_epi32(u[10], cospi32);
2721     y = _mm_mullo_epi32(u[13], cospi32);
2722     v[10] = _mm_sub_epi32(y, x);
2723     v[10] = _mm_add_epi32(v[10], rnding);
2724     v[10] = _mm_srai_epi32(v[10], bit);
2725 
2726     v[13] = _mm_add_epi32(x, y);
2727     v[13] = _mm_add_epi32(v[13], rnding);
2728     v[13] = _mm_srai_epi32(v[13], bit);
2729 
2730     x = _mm_mullo_epi32(u[11], cospi32);
2731     y = _mm_mullo_epi32(u[12], cospi32);
2732     v[11] = _mm_sub_epi32(y, x);
2733     v[11] = _mm_add_epi32(v[11], rnding);
2734     v[11] = _mm_srai_epi32(v[11], bit);
2735 
2736     v[12] = _mm_add_epi32(x, y);
2737     v[12] = _mm_add_epi32(v[12], rnding);
2738     v[12] = _mm_srai_epi32(v[12], bit);
2739 
2740     v[14] = u[14];
2741     v[15] = u[15];
2742 
2743     // stage 7
2744     if (do_cols) {
2745       addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15);
2746       addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14);
2747       addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13);
2748       addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12);
2749       addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11);
2750       addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10);
2751       addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9);
2752       addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8);
2753     } else {
2754       const int log_range_out = AOMMAX(16, bd + 6);
2755       const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
2756           -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2757       const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
2758           (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2759 
2760       addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
2761                           &clamp_hi_out, out_shift);
2762       addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
2763                           &clamp_hi_out, out_shift);
2764       addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
2765                           &clamp_hi_out, out_shift);
2766       addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
2767                           &clamp_hi_out, out_shift);
2768       addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
2769                           &clamp_hi_out, out_shift);
2770       addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
2771                           &clamp_hi_out, out_shift);
2772       addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
2773                           &clamp_hi_out, out_shift);
2774       addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
2775                           &clamp_hi_out, out_shift);
2776     }
2777   }
2778 }
2779 
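/*
 * Editor's note: in idct16x16_sse4_1() above, intermediates are kept within
 * log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)) bits; e.g. bd = 10 row
 * transforms use 18 bits, i.e. clamp_lo/clamp_hi = [-131072, 131071]. A
 * minimal scalar sketch of the clamped butterfly performed per lane by
 * addsub_sse4_1() (illustrative helper name, not part of libaom):
 */
static INLINE void addsub_scalar_sketch(int32_t in0, int32_t in1, int32_t *out0,
                                        int32_t *out1, int32_t clamp_lo,
                                        int32_t clamp_hi) {
  int32_t a = in0 + in1;  // butterfly sum (range-limited by earlier clamps)
  int32_t s = in0 - in1;  // butterfly difference
  a = a < clamp_lo ? clamp_lo : (a > clamp_hi ? clamp_hi : a);
  s = s < clamp_lo ? clamp_lo : (s > clamp_hi ? clamp_hi : s);
  *out0 = a;
  *out1 = s;
}
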
2780 static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
2781                               int bd, int out_shift) {
2782   const int32_t *cospi = cospi_arr(bit);
2783   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
2784   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
2785   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
2786   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
2787   const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
2788   const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
2789   const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
2790   const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
2791   const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
2792   const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
2793   const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
2794   const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
2795   const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
2796   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
2797   const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
2798   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
2799   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2800   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2801   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2802   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2803   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
2804   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
2805   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2806   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2807   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2808   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2809   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2810   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2811   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2812   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2813   __m128i u[16], v[16], x, y;
2814 
2815   // Calculate columns 0, 1, 2, 3
2816   {
2817     // stage 0
2818     // stage 1
2819     // stage 2
2820     v[0] = _mm_mullo_epi32(in[15], cospi2);
2821     x = _mm_mullo_epi32(in[0], cospi62);
2822     v[0] = _mm_add_epi32(v[0], x);
2823     v[0] = _mm_add_epi32(v[0], rnding);
2824     v[0] = _mm_srai_epi32(v[0], bit);
2825 
2826     v[1] = _mm_mullo_epi32(in[15], cospi62);
2827     x = _mm_mullo_epi32(in[0], cospi2);
2828     v[1] = _mm_sub_epi32(v[1], x);
2829     v[1] = _mm_add_epi32(v[1], rnding);
2830     v[1] = _mm_srai_epi32(v[1], bit);
2831 
2832     v[2] = _mm_mullo_epi32(in[13], cospi10);
2833     x = _mm_mullo_epi32(in[2], cospi54);
2834     v[2] = _mm_add_epi32(v[2], x);
2835     v[2] = _mm_add_epi32(v[2], rnding);
2836     v[2] = _mm_srai_epi32(v[2], bit);
2837 
2838     v[3] = _mm_mullo_epi32(in[13], cospi54);
2839     x = _mm_mullo_epi32(in[2], cospi10);
2840     v[3] = _mm_sub_epi32(v[3], x);
2841     v[3] = _mm_add_epi32(v[3], rnding);
2842     v[3] = _mm_srai_epi32(v[3], bit);
2843 
2844     v[4] = _mm_mullo_epi32(in[11], cospi18);
2845     x = _mm_mullo_epi32(in[4], cospi46);
2846     v[4] = _mm_add_epi32(v[4], x);
2847     v[4] = _mm_add_epi32(v[4], rnding);
2848     v[4] = _mm_srai_epi32(v[4], bit);
2849 
2850     v[5] = _mm_mullo_epi32(in[11], cospi46);
2851     x = _mm_mullo_epi32(in[4], cospi18);
2852     v[5] = _mm_sub_epi32(v[5], x);
2853     v[5] = _mm_add_epi32(v[5], rnding);
2854     v[5] = _mm_srai_epi32(v[5], bit);
2855 
2856     v[6] = _mm_mullo_epi32(in[9], cospi26);
2857     x = _mm_mullo_epi32(in[6], cospi38);
2858     v[6] = _mm_add_epi32(v[6], x);
2859     v[6] = _mm_add_epi32(v[6], rnding);
2860     v[6] = _mm_srai_epi32(v[6], bit);
2861 
2862     v[7] = _mm_mullo_epi32(in[9], cospi38);
2863     x = _mm_mullo_epi32(in[6], cospi26);
2864     v[7] = _mm_sub_epi32(v[7], x);
2865     v[7] = _mm_add_epi32(v[7], rnding);
2866     v[7] = _mm_srai_epi32(v[7], bit);
2867 
2868     v[8] = _mm_mullo_epi32(in[7], cospi34);
2869     x = _mm_mullo_epi32(in[8], cospi30);
2870     v[8] = _mm_add_epi32(v[8], x);
2871     v[8] = _mm_add_epi32(v[8], rnding);
2872     v[8] = _mm_srai_epi32(v[8], bit);
2873 
2874     v[9] = _mm_mullo_epi32(in[7], cospi30);
2875     x = _mm_mullo_epi32(in[8], cospi34);
2876     v[9] = _mm_sub_epi32(v[9], x);
2877     v[9] = _mm_add_epi32(v[9], rnding);
2878     v[9] = _mm_srai_epi32(v[9], bit);
2879 
2880     v[10] = _mm_mullo_epi32(in[5], cospi42);
2881     x = _mm_mullo_epi32(in[10], cospi22);
2882     v[10] = _mm_add_epi32(v[10], x);
2883     v[10] = _mm_add_epi32(v[10], rnding);
2884     v[10] = _mm_srai_epi32(v[10], bit);
2885 
2886     v[11] = _mm_mullo_epi32(in[5], cospi22);
2887     x = _mm_mullo_epi32(in[10], cospi42);
2888     v[11] = _mm_sub_epi32(v[11], x);
2889     v[11] = _mm_add_epi32(v[11], rnding);
2890     v[11] = _mm_srai_epi32(v[11], bit);
2891 
2892     v[12] = _mm_mullo_epi32(in[3], cospi50);
2893     x = _mm_mullo_epi32(in[12], cospi14);
2894     v[12] = _mm_add_epi32(v[12], x);
2895     v[12] = _mm_add_epi32(v[12], rnding);
2896     v[12] = _mm_srai_epi32(v[12], bit);
2897 
2898     v[13] = _mm_mullo_epi32(in[3], cospi14);
2899     x = _mm_mullo_epi32(in[12], cospi50);
2900     v[13] = _mm_sub_epi32(v[13], x);
2901     v[13] = _mm_add_epi32(v[13], rnding);
2902     v[13] = _mm_srai_epi32(v[13], bit);
2903 
2904     v[14] = _mm_mullo_epi32(in[1], cospi58);
2905     x = _mm_mullo_epi32(in[14], cospi6);
2906     v[14] = _mm_add_epi32(v[14], x);
2907     v[14] = _mm_add_epi32(v[14], rnding);
2908     v[14] = _mm_srai_epi32(v[14], bit);
2909 
2910     v[15] = _mm_mullo_epi32(in[1], cospi6);
2911     x = _mm_mullo_epi32(in[14], cospi58);
2912     v[15] = _mm_sub_epi32(v[15], x);
2913     v[15] = _mm_add_epi32(v[15], rnding);
2914     v[15] = _mm_srai_epi32(v[15], bit);
2915 
2916     // stage 3
2917     addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2918     addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2919     addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2920     addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2921     addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2922     addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2923     addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2924     addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2925 
2926     // stage 4
2927     v[0] = u[0];
2928     v[1] = u[1];
2929     v[2] = u[2];
2930     v[3] = u[3];
2931     v[4] = u[4];
2932     v[5] = u[5];
2933     v[6] = u[6];
2934     v[7] = u[7];
2935 
2936     v[8] = _mm_mullo_epi32(u[8], cospi8);
2937     x = _mm_mullo_epi32(u[9], cospi56);
2938     v[8] = _mm_add_epi32(v[8], x);
2939     v[8] = _mm_add_epi32(v[8], rnding);
2940     v[8] = _mm_srai_epi32(v[8], bit);
2941 
2942     v[9] = _mm_mullo_epi32(u[8], cospi56);
2943     x = _mm_mullo_epi32(u[9], cospi8);
2944     v[9] = _mm_sub_epi32(v[9], x);
2945     v[9] = _mm_add_epi32(v[9], rnding);
2946     v[9] = _mm_srai_epi32(v[9], bit);
2947 
2948     v[10] = _mm_mullo_epi32(u[10], cospi40);
2949     x = _mm_mullo_epi32(u[11], cospi24);
2950     v[10] = _mm_add_epi32(v[10], x);
2951     v[10] = _mm_add_epi32(v[10], rnding);
2952     v[10] = _mm_srai_epi32(v[10], bit);
2953 
2954     v[11] = _mm_mullo_epi32(u[10], cospi24);
2955     x = _mm_mullo_epi32(u[11], cospi40);
2956     v[11] = _mm_sub_epi32(v[11], x);
2957     v[11] = _mm_add_epi32(v[11], rnding);
2958     v[11] = _mm_srai_epi32(v[11], bit);
2959 
2960     v[12] = _mm_mullo_epi32(u[12], cospim56);
2961     x = _mm_mullo_epi32(u[13], cospi8);
2962     v[12] = _mm_add_epi32(v[12], x);
2963     v[12] = _mm_add_epi32(v[12], rnding);
2964     v[12] = _mm_srai_epi32(v[12], bit);
2965 
2966     v[13] = _mm_mullo_epi32(u[12], cospi8);
2967     x = _mm_mullo_epi32(u[13], cospim56);
2968     v[13] = _mm_sub_epi32(v[13], x);
2969     v[13] = _mm_add_epi32(v[13], rnding);
2970     v[13] = _mm_srai_epi32(v[13], bit);
2971 
2972     v[14] = _mm_mullo_epi32(u[14], cospim24);
2973     x = _mm_mullo_epi32(u[15], cospi40);
2974     v[14] = _mm_add_epi32(v[14], x);
2975     v[14] = _mm_add_epi32(v[14], rnding);
2976     v[14] = _mm_srai_epi32(v[14], bit);
2977 
2978     v[15] = _mm_mullo_epi32(u[14], cospi40);
2979     x = _mm_mullo_epi32(u[15], cospim24);
2980     v[15] = _mm_sub_epi32(v[15], x);
2981     v[15] = _mm_add_epi32(v[15], rnding);
2982     v[15] = _mm_srai_epi32(v[15], bit);
2983 
2984     // stage 5
2985     addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2986     addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2987     addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2988     addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2989     addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2990     addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2991     addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2992     addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2993 
2994     // stage 6
2995     v[0] = u[0];
2996     v[1] = u[1];
2997     v[2] = u[2];
2998     v[3] = u[3];
2999 
3000     v[4] = _mm_mullo_epi32(u[4], cospi16);
3001     x = _mm_mullo_epi32(u[5], cospi48);
3002     v[4] = _mm_add_epi32(v[4], x);
3003     v[4] = _mm_add_epi32(v[4], rnding);
3004     v[4] = _mm_srai_epi32(v[4], bit);
3005 
3006     v[5] = _mm_mullo_epi32(u[4], cospi48);
3007     x = _mm_mullo_epi32(u[5], cospi16);
3008     v[5] = _mm_sub_epi32(v[5], x);
3009     v[5] = _mm_add_epi32(v[5], rnding);
3010     v[5] = _mm_srai_epi32(v[5], bit);
3011 
3012     v[6] = _mm_mullo_epi32(u[6], cospim48);
3013     x = _mm_mullo_epi32(u[7], cospi16);
3014     v[6] = _mm_add_epi32(v[6], x);
3015     v[6] = _mm_add_epi32(v[6], rnding);
3016     v[6] = _mm_srai_epi32(v[6], bit);
3017 
3018     v[7] = _mm_mullo_epi32(u[6], cospi16);
3019     x = _mm_mullo_epi32(u[7], cospim48);
3020     v[7] = _mm_sub_epi32(v[7], x);
3021     v[7] = _mm_add_epi32(v[7], rnding);
3022     v[7] = _mm_srai_epi32(v[7], bit);
3023 
3024     v[8] = u[8];
3025     v[9] = u[9];
3026     v[10] = u[10];
3027     v[11] = u[11];
3028 
3029     v[12] = _mm_mullo_epi32(u[12], cospi16);
3030     x = _mm_mullo_epi32(u[13], cospi48);
3031     v[12] = _mm_add_epi32(v[12], x);
3032     v[12] = _mm_add_epi32(v[12], rnding);
3033     v[12] = _mm_srai_epi32(v[12], bit);
3034 
3035     v[13] = _mm_mullo_epi32(u[12], cospi48);
3036     x = _mm_mullo_epi32(u[13], cospi16);
3037     v[13] = _mm_sub_epi32(v[13], x);
3038     v[13] = _mm_add_epi32(v[13], rnding);
3039     v[13] = _mm_srai_epi32(v[13], bit);
3040 
3041     v[14] = _mm_mullo_epi32(u[14], cospim48);
3042     x = _mm_mullo_epi32(u[15], cospi16);
3043     v[14] = _mm_add_epi32(v[14], x);
3044     v[14] = _mm_add_epi32(v[14], rnding);
3045     v[14] = _mm_srai_epi32(v[14], bit);
3046 
3047     v[15] = _mm_mullo_epi32(u[14], cospi16);
3048     x = _mm_mullo_epi32(u[15], cospim48);
3049     v[15] = _mm_sub_epi32(v[15], x);
3050     v[15] = _mm_add_epi32(v[15], rnding);
3051     v[15] = _mm_srai_epi32(v[15], bit);
3052 
3053     // stage 7
3054     addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
3055     addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
3056     addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
3057     addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
3058     addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
3059     addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
3060     addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
3061     addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
3062 
3063     // stage 8
3064     v[0] = u[0];
3065     v[1] = u[1];
3066 
3067     y = _mm_mullo_epi32(u[2], cospi32);
3068     x = _mm_mullo_epi32(u[3], cospi32);
3069     v[2] = _mm_add_epi32(y, x);
3070     v[2] = _mm_add_epi32(v[2], rnding);
3071     v[2] = _mm_srai_epi32(v[2], bit);
3072 
3073     v[3] = _mm_sub_epi32(y, x);
3074     v[3] = _mm_add_epi32(v[3], rnding);
3075     v[3] = _mm_srai_epi32(v[3], bit);
3076 
3077     v[4] = u[4];
3078     v[5] = u[5];
3079 
3080     y = _mm_mullo_epi32(u[6], cospi32);
3081     x = _mm_mullo_epi32(u[7], cospi32);
3082     v[6] = _mm_add_epi32(y, x);
3083     v[6] = _mm_add_epi32(v[6], rnding);
3084     v[6] = _mm_srai_epi32(v[6], bit);
3085 
3086     v[7] = _mm_sub_epi32(y, x);
3087     v[7] = _mm_add_epi32(v[7], rnding);
3088     v[7] = _mm_srai_epi32(v[7], bit);
3089 
3090     v[8] = u[8];
3091     v[9] = u[9];
3092 
3093     y = _mm_mullo_epi32(u[10], cospi32);
3094     x = _mm_mullo_epi32(u[11], cospi32);
3095     v[10] = _mm_add_epi32(y, x);
3096     v[10] = _mm_add_epi32(v[10], rnding);
3097     v[10] = _mm_srai_epi32(v[10], bit);
3098 
3099     v[11] = _mm_sub_epi32(y, x);
3100     v[11] = _mm_add_epi32(v[11], rnding);
3101     v[11] = _mm_srai_epi32(v[11], bit);
3102 
3103     v[12] = u[12];
3104     v[13] = u[13];
3105 
3106     y = _mm_mullo_epi32(u[14], cospi32);
3107     x = _mm_mullo_epi32(u[15], cospi32);
3108     v[14] = _mm_add_epi32(y, x);
3109     v[14] = _mm_add_epi32(v[14], rnding);
3110     v[14] = _mm_srai_epi32(v[14], bit);
3111 
3112     v[15] = _mm_sub_epi32(y, x);
3113     v[15] = _mm_add_epi32(v[15], rnding);
3114     v[15] = _mm_srai_epi32(v[15], bit);
3115 
3116     // stage 9
3117     if (do_cols) {
3118       out[0] = v[0];
3119       out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
3120       out[2] = v[12];
3121       out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
3122       out[4] = v[6];
3123       out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
3124       out[6] = v[10];
3125       out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
3126       out[8] = v[3];
3127       out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
3128       out[10] = v[15];
3129       out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
3130       out[12] = v[5];
3131       out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
3132       out[14] = v[9];
3133       out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
3134     } else {
3135       const int log_range_out = AOMMAX(16, bd + 6);
3136       const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
3137       const __m128i clamp_hi_out =
3138           _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
3139 
3140       neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
3141                        &clamp_hi_out, out_shift);
3142       neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
3143                        &clamp_hi_out, out_shift);
3144       neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
3145                        &clamp_hi_out, out_shift);
3146       neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
3147                        &clamp_hi_out, out_shift);
3148       neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
3149                        &clamp_hi_out, out_shift);
3150       neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
3151                        &clamp_hi_out, out_shift);
3152       neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
3153                        &clamp_hi_out, out_shift);
3154       neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
3155                        &clamp_hi_out, out_shift);
3156     }
3157   }
3158 }
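/*
 * Editor's note: each weight pair in iadst16x16_sse4_1() above applies a
 * Givens-style rotation, with cospi[i] ~ cos(i * PI / 128) scaled to Q(bit)
 * fixed point (bit is the cos_bit precision); the 32-bit _mm_mullo_epi32
 * products stay in range for the clamped intermediates this file maintains.
 * A scalar sketch of one rotation term (illustrative name, not a libaom
 * helper):
 */
static INLINE int32_t rotate_term_sketch(int32_t w0, int32_t x0, int32_t w1,
                                         int32_t x1, int bit) {
  // w0 and w1 are Q(bit) cospi[] weights; round half up, then shift back.
  const int64_t acc = (int64_t)w0 * x0 + (int64_t)w1 * x1;
  return (int32_t)((acc + ((int64_t)1 << (bit - 1))) >> bit);
}
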
3159 static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
3160                                int bd, int out_shift) {
3161   (void)bit;
3162   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3163   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3164   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3165   __m128i v[16];
3166   __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
3167   __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
3168   __m128i a0, a1, a2, a3;
3169 
3170   for (int i = 0; i < 16; i += 8) {
3171     a0 = _mm_mullo_epi32(in[i], fact);
3172     a1 = _mm_mullo_epi32(in[i + 1], fact);
3173     a0 = _mm_add_epi32(a0, offset);
3174     a1 = _mm_add_epi32(a1, offset);
3175     v[i] = _mm_srai_epi32(a0, NewSqrt2Bits);
3176     v[i + 1] = _mm_srai_epi32(a1, NewSqrt2Bits);
3177 
3178     a2 = _mm_mullo_epi32(in[i + 2], fact);
3179     a3 = _mm_mullo_epi32(in[i + 3], fact);
3180     a2 = _mm_add_epi32(a2, offset);
3181     a3 = _mm_add_epi32(a3, offset);
3182     v[i + 2] = _mm_srai_epi32(a2, NewSqrt2Bits);
3183     v[i + 3] = _mm_srai_epi32(a3, NewSqrt2Bits);
3184 
3185     a0 = _mm_mullo_epi32(in[i + 4], fact);
3186     a1 = _mm_mullo_epi32(in[i + 5], fact);
3187     a0 = _mm_add_epi32(a0, offset);
3188     a1 = _mm_add_epi32(a1, offset);
3189     v[i + 4] = _mm_srai_epi32(a0, NewSqrt2Bits);
3190     v[i + 5] = _mm_srai_epi32(a1, NewSqrt2Bits);
3191 
3192     a2 = _mm_mullo_epi32(in[i + 6], fact);
3193     a3 = _mm_mullo_epi32(in[i + 7], fact);
3194     a2 = _mm_add_epi32(a2, offset);
3195     a3 = _mm_add_epi32(a3, offset);
3196     v[i + 6] = _mm_srai_epi32(a2, NewSqrt2Bits);
3197     v[i + 7] = _mm_srai_epi32(a3, NewSqrt2Bits);
3198   }
3199 
3200   if (!do_cols) {
3201     const int log_range_out = AOMMAX(16, bd + 6);
3202     const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
3203         -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
3204     const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
3205         (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
3206 
3207     shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 16);
3208   } else {
3209     highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 16);
3210   }
3211 }
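/*
 * Editor's note: iidentity16_sse4_1() above only rescales: each coefficient
 * is multiplied by 2 * sqrt(2) in fixed point, using NewSqrt2 = 5793 and
 * NewSqrt2Bits = 12 from av1_txfm.h. Worked example, assuming those
 * constants: in = 100 gives (100 * 2 * 5793 + 2048) >> 12 = 283, close to
 * 100 * 2 * sqrt(2) = 282.84.
 */
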
3212 static INLINE void idct64_stage8_sse4_1(
3213     __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
3214     const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
3215     const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
3216     const __m128i *rnding, int bit) {
3217   int i;
3218   __m128i temp1, temp2, temp3, temp4;
3219   temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
3220   u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
3221   u[10] = temp1;
3222   temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
3223   u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
3224   u[11] = temp2;
3225 
3226   for (i = 16; i < 20; ++i) {
3227     addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
3228     addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
3229                   clamp_hi);
3230   }
3231 
3232   temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
3233   temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
3234   temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
3235   temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
3236   u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
3237   u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
3238   u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
3239   u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
3240   u[36] = temp1;
3241   u[37] = temp2;
3242   u[38] = temp3;
3243   u[39] = temp4;
3244 
3245   temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
3246   temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
3247   temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
3248   temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
3249   u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
3250   u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
3251   u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
3252   u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
3253   u[40] = temp1;
3254   u[41] = temp2;
3255   u[42] = temp3;
3256   u[43] = temp4;
3257 }
3258 
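/*
 * Editor's note: the XOR indexing in idct64_stage8_sse4_1() above walks
 * mirrored butterfly pairs with a single counter. For i = 16..19 it expands
 * to
 *   addsub(u[16], u[23])  addsub(u[17], u[22])  addsub(u[18], u[21])
 *   addsub(u[19], u[20])                                  // via i ^ 7
 *   addsub(u[31], u[24])  addsub(u[30], u[25])  addsub(u[29], u[26])
 *   addsub(u[28], u[27])                                  // via i ^ 15, i ^ 8
 * so each of u[16..31] is read and written exactly once per pass.
 */
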
3259 static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
3260                                         const __m128i *cospi32,
3261                                         const __m128i *clamp_lo,
3262                                         const __m128i *clamp_hi,
3263                                         const __m128i *rnding, int bit) {
3264   int i;
3265   __m128i temp1, temp2, temp3, temp4;
3266   for (i = 0; i < 8; ++i) {
3267     addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
3268   }
3269 
3270   temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
3271   temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
3272   temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
3273   temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
3274   u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
3275   u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
3276   u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
3277   u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
3278   u[20] = temp1;
3279   u[21] = temp2;
3280   u[22] = temp3;
3281   u[23] = temp4;
3282   for (i = 32; i < 40; i++) {
3283     addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
3284   }
3285 
3286   for (i = 48; i < 56; i++) {
3287     addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
3288   }
3289 }
3290 
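/*
 * Editor's note: in idct64_stage9_sse4_1() the argument order flips between
 * the two loops: for i = 32..39 the sum lands in the low index
 * (u[i] = u[i] + u[i ^ 15]), while for i = 48..55 it lands in the high index
 * (u[i ^ 15] = u[i ^ 15] + u[i]). Sums therefore stay in u[32..39] and
 * u[56..63] and differences in u[40..55], the layout the stage-10 rotations
 * expect.
 */
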
3291 static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
3292                                          const __m128i *cospi32,
3293                                          const __m128i *clamp_lo,
3294                                          const __m128i *clamp_hi,
3295                                          const __m128i *rnding, int bit) {
3296   __m128i temp1, temp2, temp3, temp4;
3297   for (int i = 0; i < 16; i++) {
3298     addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
3299   }
3300 
3301   temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
3302   temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
3303   temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
3304   temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
3305   u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
3306   u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
3307   u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
3308   u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
3309   u[40] = temp1;
3310   u[41] = temp2;
3311   u[42] = temp3;
3312   u[43] = temp4;
3313 
3314   temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
3315   temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
3316   temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
3317   temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
3318   u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
3319   u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
3320   u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
3321   u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
3322   u[44] = temp1;
3323   u[45] = temp2;
3324   u[46] = temp3;
3325   u[47] = temp4;
3326 }
3327 
3328 static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
3329                                          int bd, int out_shift,
3330                                          const int log_range) {
3331   if (do_cols) {
3332     for (int i = 0; i < 32; i++) {
3333       addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]);
3334     }
3335   } else {
3336     const int log_range_out = AOMMAX(16, bd + 6);
3337     const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
3338         -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
3339     const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
3340         (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
3341 
3342     for (int i = 0; i < 32; i++) {
3343       addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)],
3344                           &clamp_lo_out, &clamp_hi_out, out_shift);
3345     }
3346   }
3347 }
3348 
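/*
 * Editor's note: the output clamp in idct64_stage11_sse4_1() intersects two
 * ranges: the post-shift storage range, +/- 1 << (log_range_out - 1), and
 * the largest value the pre-shift intermediates can even produce after
 * >> out_shift, i.e. 1 << (log_range - 1 - out_shift). Worked example for
 * bd = 8 rows (log_range = 16, log_range_out = 16) with out_shift = 2:
 *   clamp_hi_out = AOMMIN((1 << 15) - 1, 1 << 13) = 8192.
 */
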
3349 static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
3350                                   int do_cols, int bd, int out_shift) {
3351   const int32_t *cospi = cospi_arr(bit);
3352   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3353   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3354   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3355   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3356 
3357   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3358 
3359   {
3360     __m128i x;
3361 
3362     // stage 1
3363     // stage 2
3364     // stage 3
3365     // stage 4
3366     // stage 5
3367     // stage 6
3368     x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
3369 
3370     // stage 8
3371     // stage 9
3372     // stage 10
3373     // stage 11
3374     if (do_cols) {
3375       x = _mm_max_epi32(x, clamp_lo);
3376       x = _mm_min_epi32(x, clamp_hi);
3377     } else {
3378       const int log_range_out = AOMMAX(16, bd + 6);
3379       const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
3380           -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
3381       const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
3382           (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
3383 
3384       __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
3385       x = _mm_add_epi32(x, offset);
3386       x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
3387 
3388       x = _mm_max_epi32(x, clamp_lo_out);
3389       x = _mm_min_epi32(x, clamp_hi_out);
3390     }
3391 
3392     for (int i = 0; i < 64; ++i) {
3393       out[i] = x;
3394     }
3456   }
3457 }
3458 
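/*
 * Editor's note: with only the DC coefficient nonzero, every butterfly in
 * idct64x64_low1_sse4_1() above degenerates to a copy, so a single
 * half_btf_0 multiply by cospi[32] (i.e. 1/sqrt(2) in Q(bit)) feeds all 64
 * outputs. A scalar sketch of that one-input butterfly, mirroring the
 * rounding used above (illustrative name, not part of libaom):
 */
static INLINE int32_t half_btf_0_scalar_sketch(int32_t w0, int32_t in0,
                                               int bit) {
  const int64_t acc = (int64_t)w0 * in0;  // single Q(bit) product
  return (int32_t)((acc + ((int64_t)1 << (bit - 1))) >> bit);  // round, shift
}
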
3459 static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
3460                                   int do_cols, int bd, int out_shift) {
3461   int i, j;
3462   const int32_t *cospi = cospi_arr(bit);
3463   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3464   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3465   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3466   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3467 
3468   const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
3469   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3470   const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
3471   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3472   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
3473   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3474   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3475   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3476   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3477   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3478   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3479   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3480   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3481   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3482   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3483   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3484   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3485   const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
3486   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
3487   const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
3488   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
3489   const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
3490   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
3491   const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
3492   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
3493   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
3494   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
3495   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
3496   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
3497   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
3498   const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
3499   const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
3500   const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
3501   const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
3502   const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
3503   const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
3504   const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
3505   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3506 
3507   {
3508     __m128i u[64];
3509 
3510     // stage 1
3511     u[0] = in[0];
3512     u[8] = in[4];
3513     u[16] = in[2];
3514     u[24] = in[6];
3515     u[32] = in[1];
3516     u[40] = in[5];
3517     u[48] = in[3];
3518     u[56] = in[7];
3519 
3520     // stage 2
3521     u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
3522     u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
3523     u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
3524     u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
3525     u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
3526     u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
3527     u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
3528     u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
3529 
3530     // stage 3
3531     u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
3532     u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
3533     u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
3534     u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
3535     u[33] = u[32];
3536     u[38] = u[39];
3537     u[41] = u[40];
3538     u[46] = u[47];
3539     u[49] = u[48];
3540     u[54] = u[55];
3541     u[57] = u[56];
3542     u[62] = u[63];
3543 
3544     // stage 4
3545     __m128i temp1, temp2;
3546     u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
3547     u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
3548     u[17] = u[16];
3549     u[22] = u[23];
3550     u[25] = u[24];
3551     u[30] = u[31];
3552 
3553     temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3554     u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3555     u[33] = temp1;
3556 
3557     temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3558     u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3559     u[57] = temp2;
3560 
3561     temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3562     u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3563     u[41] = temp1;
3564 
3565     temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3566     u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3567     u[46] = temp2;
3568 
3569     // stage 5
3570     u[9] = u[8];
3571     u[14] = u[15];
3572 
3573     temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3574     u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3575     u[17] = temp1;
3576 
3577     temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3578     u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3579     u[22] = temp2;
3580 
3581     u[35] = u[32];
3582     u[34] = u[33];
3583     u[36] = u[39];
3584     u[37] = u[38];
3585     u[43] = u[40];
3586     u[42] = u[41];
3587     u[44] = u[47];
3588     u[45] = u[46];
3589     u[51] = u[48];
3590     u[50] = u[49];
3591     u[52] = u[55];
3592     u[53] = u[54];
3593     u[59] = u[56];
3594     u[58] = u[57];
3595     u[60] = u[63];
3596     u[61] = u[62];
3597 
3598     // stage 6
3599     temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3600     u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3601     u[0] = temp1;
3602 
3603     temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3604     u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3605     u[9] = temp2;
3606     u[19] = u[16];
3607     u[18] = u[17];
3608     u[20] = u[23];
3609     u[21] = u[22];
3610     u[27] = u[24];
3611     u[26] = u[25];
3612     u[28] = u[31];
3613     u[29] = u[30];
3614 
3615     temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3616     u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3617     u[34] = temp1;
3618     temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3619     u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3620     u[35] = temp2;
3621     temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3622     u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3623     u[36] = temp1;
3624     temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3625     u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3626     u[37] = temp2;
3627     temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3628     u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3629     u[42] = temp1;
3630     temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3631     u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3632     u[43] = temp2;
3633     temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3634     u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3635     u[44] = temp1;
3636     temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3637     u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3638     u[45] = temp2;
3639 
3640     // stage 7
3641     u[3] = u[0];
3642     u[2] = u[1];
3643     u[11] = u[8];
3644     u[10] = u[9];
3645     u[12] = u[15];
3646     u[13] = u[14];
3647 
3648     temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3649     u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3650     u[18] = temp1;
3651     temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3652     u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3653     u[19] = temp2;
3654     temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3655     u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3656     u[20] = temp1;
3657     temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3658     u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3659     u[21] = temp2;
3660     for (i = 32; i < 64; i += 16) {
3661       for (j = i; j < i + 4; j++) {
3662         addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3663         addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3664                       &clamp_hi);
3665       }
3666     }
3667 
3668     // stage 8
3669     u[7] = u[0];
3670     u[6] = u[1];
3671     u[5] = u[2];
3672     u[4] = u[3];
3674 
3675     idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3676                          &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3677 
3678     // stage 9
3679     idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3680                          bit);
3681 
3682     // stage 10
3683     idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3684                           bit);
3685 
3686     // stage 11
3687     idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
3688   }
3689 }
3690 
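/*
 * Editor's note: idct64x64_low8_sse4_1() above assumes all but the first 8
 * coefficients of each column are zero. With one input of each early-stage
 * butterfly known to be zero, the two-multiply rotations (half_btf) collapse
 * to one-multiply forms (half_btf_0) plus plain lane copies (e.g.
 * u[33] = u[32] in stage 3), which is where the savings over the full
 * 64-point path come from.
 */
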
3691 static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
3692                                    int do_cols, int bd, int out_shift) {
3693   int i, j;
3694   const int32_t *cospi = cospi_arr(bit);
3695   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3696   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3697   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3698   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3699 
3700   const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
3701   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3702   const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
3703   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3704   const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
3705   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
3706   const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
3707   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3708   const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
3709   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
3710   const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
3711   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3712   const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
3713   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
3714   const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
3715   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3716   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3717   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3718   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3719   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3720   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
3721   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3722   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3723   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3724   const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
3725   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
3726   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
3727   const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
3728   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3729   const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
3730   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3731   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3732   const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
3733 
3734   const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
3735   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
3736   const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
3737   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
3738   const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
3739   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
3740   const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
3741   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
3742   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
3743   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
3744   const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
3745   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
3746   const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
3747   const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
3748   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
3749   const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
3750   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
3751   const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
3752   const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
3753   const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
3754   const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
3755 
3756   {
3757     __m128i u[64];
3758     __m128i tmp1, tmp2, tmp3, tmp4;
3759     // stage 1
3760     u[0] = in[0];
3761     u[32] = in[1];
3762     u[36] = in[9];
3763     u[40] = in[5];
3764     u[44] = in[13];
3765     u[48] = in[3];
3766     u[52] = in[11];
3767     u[56] = in[7];
3768     u[60] = in[15];
3769     u[16] = in[2];
3770     u[20] = in[10];
3771     u[24] = in[6];
3772     u[28] = in[14];
3773     u[4] = in[8];
3774     u[8] = in[4];
3775     u[12] = in[12];
3776 
3777     // stage 2
3778     u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
3779     u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
3780     u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
3781     u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
3782     u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
3783     u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
3784     u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
3785     u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
3786     u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
3787     u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
3788     u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
3789     u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
3790     u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
3791     u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
3792     u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
3793     u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
3794 
3795     // stage 3
3796     u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
3797     u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
3798     u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
3799     u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
3800     u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
3801     u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
3802     u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
3803     u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
3804     u[33] = u[32];
3805     u[34] = u[35];
3806     u[37] = u[36];
3807     u[38] = u[39];
3808     u[41] = u[40];
3809     u[42] = u[43];
3810     u[45] = u[44];
3811     u[46] = u[47];
3812     u[49] = u[48];
3813     u[50] = u[51];
3814     u[53] = u[52];
3815     u[54] = u[55];
3816     u[57] = u[56];
3817     u[58] = u[59];
3818     u[61] = u[60];
3819     u[62] = u[63];
3820 
3821     // stage 4
3822     u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
3823     u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
3824     u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
3825     u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
3826 
3827     u[17] = u[16];
3828     u[18] = u[19];
3829     u[21] = u[20];
3830     u[22] = u[23];
3831     u[25] = u[24];
3832     u[26] = u[27];
3833     u[29] = u[28];
3834     u[30] = u[31];
3835 
3836     tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3837     tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3838     tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3839     tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3840     u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3841     u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3842     u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3843     u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3844     u[33] = tmp1;
3845     u[34] = tmp2;
3846     u[37] = tmp3;
3847     u[38] = tmp4;
3848 
3849     tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3850     tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3851     tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3852     tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3853     u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3854     u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3855     u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3856     u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3857     u[41] = tmp1;
3858     u[42] = tmp2;
3859     u[45] = tmp3;
3860     u[46] = tmp4;
3861 
    // stage 5
    u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
    u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);

    u[9] = u[8];
    u[10] = u[11];
    u[13] = u[12];
    u[14] = u[15];

    tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    u[17] = tmp1;
    u[18] = tmp2;
    u[21] = tmp3;
    u[22] = tmp4;

    for (i = 32; i < 64; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    // stage 6
    tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[0] = tmp1;
    u[5] = u[4];
    u[6] = u[7];

    tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = tmp1;
    tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    u[10] = tmp2;

    for (i = 16; i < 32; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    u[34] = tmp1;
    u[35] = tmp2;
    u[36] = tmp3;
    u[37] = tmp4;

    tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    u[42] = tmp1;
    u[43] = tmp2;
    u[44] = tmp3;
    u[45] = tmp4;

    // stage 7
    u[3] = u[0];
    u[2] = u[1];
    tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
    u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
    u[5] = tmp1;
    addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    u[18] = tmp1;
    u[19] = tmp2;
    u[20] = tmp3;
    u[21] = tmp4;

    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                      &clamp_hi);
      }
    }

    // stage 8
    for (i = 0; i < 4; ++i) {
      addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
    }

    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);

    // stage 9
    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                         bit);

    // stage 10
    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                          bit);

    // stage 11
    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
  }
}

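// 64-point inverse DCT over four 32-bit coefficients per __m128i lane.
// Only in[0..31] are read: AV1 keeps just the lowest 32 of the 64 transform
// coefficients, so the upper half is implicitly zero. `do_cols` selects the
// column pass (wider intermediate clamp, no output shift) versus the row
// pass, which rounds by `out_shift` at stage 11.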
static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

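  // `rnding` is the rounding term added before each `>> bit` in the
  // butterflies. Intermediates are clamped to a signed range of `log_range`
  // bits: bd + 8 bits for the row pass, bd + 6 for the column pass, never
  // narrower than 16 bits.
  //
  // The cospiN constants below broadcast cospi[N], which (per cospi_arr())
  // is cos(N * PI / 128) scaled to `bit` fractional bits; cospimN holds the
  // negated value.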
  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);

  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
  const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);

  {
    __m128i u[64], v[64];

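    // u[] and v[] are used in ping-pong fashion: each stage reads one array
    // and writes the other. Stage 1 is a pure permutation of the inputs;
    // each subband is deposited directly into whichever array its first
    // butterfly stage will read from.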
    // stage 1
    u[32] = in[1];
    u[34] = in[17];
    u[36] = in[9];
    u[38] = in[25];
    u[40] = in[5];
    u[42] = in[21];
    u[44] = in[13];
    u[46] = in[29];
    u[48] = in[3];
    u[50] = in[19];
    u[52] = in[11];
    u[54] = in[27];
    u[56] = in[7];
    u[58] = in[23];
    u[60] = in[15];
    u[62] = in[31];

    v[16] = in[2];
    v[18] = in[18];
    v[20] = in[10];
    v[22] = in[26];
    v[24] = in[6];
    v[26] = in[22];
    v[28] = in[14];
    v[30] = in[30];

    u[8] = in[4];
    u[10] = in[20];
    u[12] = in[12];
    u[14] = in[28];

    v[4] = in[8];
    v[6] = in[24];

    u[0] = in[0];
    u[2] = in[16];

    // stage 2
    v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
    v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
    v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
    v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
    v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
    v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
    v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
    v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
    v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
    v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
    v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
    v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
    v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
    v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
    v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
    v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
    v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
    v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
    v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
    v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
    v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
    v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
    v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
    v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
    v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
    v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
    v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
    v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
    v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
    v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
    v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
    v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);

    // stage 3
    u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
    u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
    u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
    u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
    u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
    u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
    u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
    u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
    u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
    u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
    u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
    u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
    u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
    u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
    u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
    u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);

    for (i = 32; i < 64; i += 4) {
      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
                    &clamp_hi);
    }

    // stage 4
    v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
    v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
    v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
    v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
    v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
    v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
    v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
    v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);

    for (i = 16; i < 32; i += 4) {
      addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 32; i < 64; i += 4) {
      v[i + 0] = u[i + 0];
      v[i + 3] = u[i + 3];
    }

    v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
    v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
    v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
    v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
    v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
    v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
    v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
    v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);

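    // Stages 5 through 10 repeat the same shape one level up each time:
    // short butterflies on the low band, add/sub recombination in the
    // middle bands, and pass-throughs plus paired rotations on the longest
    // band.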
    // stage 5
    u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
    u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
    u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
    u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);

    for (i = 8; i < 16; i += 4) {
      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 16; i < 32; i += 4) {
      u[i + 0] = v[i + 0];
      u[i + 3] = v[i + 3];
    }

    u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
    u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
    u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
    u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
    u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);

    for (i = 32; i < 64; i += 8) {
      addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    // stage 6
    v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
    v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);

    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);

    for (i = 8; i < 16; i += 4) {
      v[i + 0] = u[i + 0];
      v[i + 3] = u[i + 3];
    }

    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);

    for (i = 16; i < 32; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 32; i < 64; i += 8) {
      v[i + 0] = u[i + 0];
      v[i + 1] = u[i + 1];
      v[i + 6] = u[i + 6];
      v[i + 7] = u[i + 7];
    }

    v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);

    // stage 7
    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);

    u[4] = v[4];
    u[7] = v[7];
    u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
    u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);

    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    for (i = 16; i < 32; i += 8) {
      u[i + 0] = v[i + 0];
      u[i + 1] = v[i + 1];
      u[i + 6] = v[i + 6];
      u[i + 7] = v[i + 7];
    }

    u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
    u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
    u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
    u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
    u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);

    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                      &clamp_hi);
      }
    }

    // stage 8
    for (i = 0; i < 4; ++i) {
      addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
    }

    v[8] = u[8];
    v[9] = u[9];
    v[14] = u[14];
    v[15] = u[15];

    v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
    v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
    v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
    v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);

    for (i = 16; i < 20; ++i) {
      addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
      addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 32; i < 36; ++i) {
      v[i] = u[i];
      v[i + 12] = u[i + 12];
      v[i + 16] = u[i + 16];
      v[i + 28] = u[i + 28];
    }

    v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
    v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
    v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
    v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
    v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
    v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
    v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
    v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
    v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
    v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
    v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
    v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
    v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
    v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);

    // stage 9
    for (i = 0; i < 8; ++i) {
      addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
    }

    for (i = 16; i < 20; ++i) {
      u[i] = v[i];
      u[i + 12] = v[i + 12];
    }

    u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
    u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
    u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
    u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
    u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);

    for (i = 32; i < 40; i++) {
      addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
    }

    for (i = 48; i < 56; i++) {
      addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
    }

    // stage 10
    for (i = 0; i < 16; i++) {
      addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
    }

    for (i = 32; i < 40; i++) v[i] = u[i];

    v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
    v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);

    for (i = 56; i < 64; i++) v[i] = u[i];

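    // Stage 11 produces out[i] = v[i] + v[63 - i] and
    // out[63 - i] = v[i] - v[63 - i]. The column pass writes the sums and
    // differences unclamped; the row pass rounds them by `out_shift` and
    // clamps to the output range.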
    // stage 11
    if (do_cols) {
      for (i = 0; i < 32; i++) {
        addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
      }
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

      for (i = 0; i < 32; i++) {
        addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
                            &clamp_lo_out, &clamp_hi_out, out_shift);
      }
    }
  }
}

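// 32-point inverse DCT for the DC-only case: with in[0] the sole nonzero
// coefficient, the whole butterfly network collapses to scaling in[0] by
// cospi32 (cos(PI/4), i.e. 1/sqrt(2)), rounding, clamping, and broadcasting
// the result to all 32 outputs.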
static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1;

  // stage 0
  // stage 1
  bf1 = in[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);

  // stage 6
  // stage 7
  // stage 8
  // stage 9
  if (do_cols) {
    bf1 = _mm_max_epi32(bf1, clamp_lo);
    bf1 = _mm_min_epi32(bf1, clamp_hi);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
    bf1 = _mm_add_epi32(bf1, offset);
    bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
    bf1 = _mm_max_epi32(bf1, clamp_lo_out);
    bf1 = _mm_min_epi32(bf1, clamp_hi_out);
  }
  for (int i = 0; i < 32; ++i) out[i] = bf1;
}

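// 32-point inverse DCT when at most the first eight coefficients of each
// column (in[0..7]) are nonzero. Stage-2/3 butterflies whose second operand
// is zero reduce to single multiplies, and the shared heavy stages are
// factored into the idct32_stageN_sse4_1 helpers.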
static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32];

  // stage 0
  // stage 1
  bf1[0] = in[0];
  bf1[4] = in[4];
  bf1[8] = in[2];
  bf1[12] = in[6];
  bf1[16] = in[1];
  bf1[20] = in[5];
  bf1[24] = in[3];
  bf1[28] = in[7];

  // stage 2
  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);

  // stage 3
  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);

  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
  bf1[17] = bf1[16];
  bf1[18] = bf1[19];
  bf1[21] = bf1[20];
  bf1[22] = bf1[23];
  bf1[25] = bf1[24];
  bf1[26] = bf1[27];
  bf1[29] = bf1[28];
  bf1[30] = bf1[31];

  // stage 4
  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);

  bf1[9] = bf1[8];
  bf1[10] = bf1[11];
  bf1[13] = bf1[12];
  bf1[14] = bf1[15];

  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
                       &cospi24, &cospi40, &cospim24, &rounding, bit);

  // stage 5
  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
  bf1[1] = bf1[0];
  bf1[5] = bf1[4];
  bf1[6] = bf1[7];

  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
                       &clamp_hi, &rounding, bit);

  // stage 6
  bf1[3] = bf1[0];
  bf1[2] = bf1[1];

  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);

  // stage 7
  idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 8
  idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 9
  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
}

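// 32-point inverse DCT when at most the first sixteen coefficients of each
// column (in[0..15]) are nonzero; half_btf_0_sse4_1 stands in for the full
// butterfly wherever the partner coefficient is known to be zero.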
static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32];

  // stage 0
  // stage 1

  bf1[0] = in[0];
  bf1[2] = in[8];
  bf1[4] = in[4];
  bf1[6] = in[12];
  bf1[8] = in[2];
  bf1[10] = in[10];
  bf1[12] = in[6];
  bf1[14] = in[14];
  bf1[16] = in[1];
  bf1[18] = in[9];
  bf1[20] = in[5];
  bf1[22] = in[13];
  bf1[24] = in[3];
  bf1[26] = in[11];
  bf1[28] = in[7];
  bf1[30] = in[15];

  // stage 2
  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
  bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
  bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
  bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
  bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
  bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
  bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
  bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
  bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);

  // stage 3
  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
  bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
  bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
  bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
  bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);

  addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
  // stage 4
  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
  bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
  bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);

  addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);

  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
                       &cospi24, &cospi40, &cospim24, &rounding, bit);

  // stage 5
  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
  bf1[1] = bf1[0];
  bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
  bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);

  addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);

  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
                       &clamp_hi, &rounding, bit);

  // stage 6
  addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);

  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);

  // stage 7
  idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 8
  idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 9
  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
}

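// General 32-point inverse DCT with no zero-coefficient shortcuts: every
// stage runs the full butterfly network, ping-ponging between the bf1 and
// bf0 arrays.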
idct32x32_sse4_1(__m128i * in,__m128i * out,int bit,int do_cols,int bd,int out_shift)4812 static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
4813                              int bd, int out_shift) {
4814   const int32_t *cospi = cospi_arr(bit);
4815   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
4816   const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
4817   const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
4818   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
4819   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
4820   const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
4821   const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
4822   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
4823   const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
4824   const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
4825   const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
4826   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
4827   const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
4828   const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
4829   const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
4830   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
4831   const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
4832   const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
4833   const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
4834   const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
4835   const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
4836   const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
4837   const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
4838   const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
4839   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
4840   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
4841   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
4842   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
4843   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
4844   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
4845   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
4846   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
4847   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
4848   const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
4849   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
4850   const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
4851   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
4852   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
4853   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
4854   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
4855   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
4856   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
4857   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4858   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4859   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4860   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
4861   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
4862   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4863   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
4864   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
4865   const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
4866   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4867   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4868   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4869   __m128i bf1[32], bf0[32];
4870 
4871   // stage 0
4872   // stage 1
4873   bf1[0] = in[0];
4874   bf1[1] = in[16];
4875   bf1[2] = in[8];
4876   bf1[3] = in[24];
4877   bf1[4] = in[4];
4878   bf1[5] = in[20];
4879   bf1[6] = in[12];
4880   bf1[7] = in[28];
4881   bf1[8] = in[2];
4882   bf1[9] = in[18];
4883   bf1[10] = in[10];
4884   bf1[11] = in[26];
4885   bf1[12] = in[6];
4886   bf1[13] = in[22];
4887   bf1[14] = in[14];
4888   bf1[15] = in[30];
4889   bf1[16] = in[1];
4890   bf1[17] = in[17];
4891   bf1[18] = in[9];
4892   bf1[19] = in[25];
4893   bf1[20] = in[5];
4894   bf1[21] = in[21];
4895   bf1[22] = in[13];
4896   bf1[23] = in[29];
4897   bf1[24] = in[3];
4898   bf1[25] = in[19];
4899   bf1[26] = in[11];
4900   bf1[27] = in[27];
4901   bf1[28] = in[7];
4902   bf1[29] = in[23];
4903   bf1[30] = in[15];
4904   bf1[31] = in[31];
4905 
4906   // stage 2
4907   bf0[0] = bf1[0];
4908   bf0[1] = bf1[1];
4909   bf0[2] = bf1[2];
4910   bf0[3] = bf1[3];
4911   bf0[4] = bf1[4];
4912   bf0[5] = bf1[5];
4913   bf0[6] = bf1[6];
4914   bf0[7] = bf1[7];
4915   bf0[8] = bf1[8];
4916   bf0[9] = bf1[9];
4917   bf0[10] = bf1[10];
4918   bf0[11] = bf1[11];
4919   bf0[12] = bf1[12];
4920   bf0[13] = bf1[13];
4921   bf0[14] = bf1[14];
4922   bf0[15] = bf1[15];
4923   bf0[16] =
4924       half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
4925   bf0[17] =
4926       half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
4927   bf0[18] =
4928       half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
4929   bf0[19] =
4930       half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
4931   bf0[20] =
4932       half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
4933   bf0[21] =
4934       half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
4935   bf0[22] =
4936       half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
4937   bf0[23] =
4938       half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
4939   bf0[24] =
4940       half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
4941   bf0[25] =
4942       half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
4943   bf0[26] =
4944       half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
4945   bf0[27] =
4946       half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
4947   bf0[28] =
4948       half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
4949   bf0[29] =
4950       half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
4951   bf0[30] =
4952       half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
4953   bf0[31] =
4954       half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
4955 
4956   // stage 3
4957   bf1[0] = bf0[0];
4958   bf1[1] = bf0[1];
4959   bf1[2] = bf0[2];
4960   bf1[3] = bf0[3];
4961   bf1[4] = bf0[4];
4962   bf1[5] = bf0[5];
4963   bf1[6] = bf0[6];
4964   bf1[7] = bf0[7];
4965   bf1[8] =
4966       half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
4967   bf1[9] =
4968       half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
4969   bf1[10] =
4970       half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
4971   bf1[11] =
4972       half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
4973   bf1[12] =
4974       half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
4975   bf1[13] =
4976       half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
4977   bf1[14] =
4978       half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
4979   bf1[15] =
4980       half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
4981 
4982   addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4983   addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4984   addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4985   addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4986   addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4987   addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4988   addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4989   addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4990 
4991   // stage 4
4992   bf0[0] = bf1[0];
4993   bf0[1] = bf1[1];
4994   bf0[2] = bf1[2];
4995   bf0[3] = bf1[3];
4996   bf0[4] =
4997       half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
4998   bf0[5] =
4999       half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
5000   bf0[6] =
5001       half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
5002   bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
5003 
5004   addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
5005   addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
5006   addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
5007   addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
5008 
5009   bf0[16] = bf1[16];
5010   bf0[17] =
5011       half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
5012   bf0[18] =
5013       half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
5014   bf0[19] = bf1[19];
5015   bf0[20] = bf1[20];
5016   bf0[21] =
5017       half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
5018   bf0[22] =
5019       half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
5020   bf0[23] = bf1[23];
5021   bf0[24] = bf1[24];
5022   bf0[25] =
5023       half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
5024   bf0[26] =
5025       half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
5026   bf0[27] = bf1[27];
5027   bf0[28] = bf1[28];
5028   bf0[29] =
5029       half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
5030   bf0[30] =
5031       half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
5032   bf0[31] = bf1[31];
5033 
5034   // stage 5
  bf1[0] =
      half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
  bf1[1] =
      half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
  bf1[2] =
      half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
  bf1[3] =
      half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
  addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
  bf1[8] = bf0[8];
  bf1[9] =
      half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
  bf1[10] =
      half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] =
      half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
  bf1[14] =
      half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
  bf1[15] = bf0[15];
  addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);

  // stage 6
  addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
  bf0[4] = bf1[4];
  bf0[5] =
      half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
  bf0[6] =
      half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
  bf0[7] = bf1[7];
  addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
  bf0[16] = bf1[16];
  bf0[17] = bf1[17];
  bf0[18] =
      half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
  bf0[19] =
      half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
  bf0[20] =
      half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
  bf0[21] =
      half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
  bf0[22] = bf1[22];
  bf0[23] = bf1[23];
  bf0[24] = bf1[24];
  bf0[25] = bf1[25];
  bf0[26] =
      half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
  bf0[27] =
      half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
  bf0[28] =
      half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
  bf0[29] =
      half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
  bf0[30] = bf1[30];
  bf0[31] = bf1[31];

  // stage 7
  addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] =
      half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
  bf1[11] =
      half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
  bf1[12] =
      half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
  bf1[13] =
      half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);

  // stage 8
  addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
  bf0[16] = bf1[16];
  bf0[17] = bf1[17];
  bf0[18] = bf1[18];
  bf0[19] = bf1[19];
  bf0[20] =
      half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
  bf0[21] =
      half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
  bf0[22] =
      half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
  bf0[23] =
      half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
  bf0[24] =
      half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
  bf0[25] =
      half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
  bf0[26] =
      half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
  bf0[27] =
      half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
  bf0[28] = bf1[28];
  bf0[29] = bf1[29];
  bf0[30] = bf1[30];
  bf0[31] = bf1[31];

  // stage 9
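  // Column pass (do_cols): this is the final stage, so the mirrored pairs are
  // added/subtracted without further clamping. Row pass: the results are
  // shifted by out_shift and clamped to the intermediate range instead.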
  if (do_cols) {
    addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31);
    addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30);
    addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29);
    addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28);
    addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27);
    addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26);
    addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25);
    addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24);
    addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23);
    addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22);
    addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21);
    addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20);
    addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19);
    addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18);
    addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17);
    addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
  }
}

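// Identity-hybrid transform types are routed through the generic highbd
// "universe" dispatcher; all other 8x8 types use the dedicated 2-D path.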
void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const int32_t *src = cast_to_int32(input);
  switch (tx_type) {
    case IDTX:
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
                                                txfm_param->tx_size,
                                                txfm_param->eob, bd);
      break;
    default:
      av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, bd);
      break;
  }
}

void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  int eob = txfm_param->eob;
  int bd = txfm_param->bd;
  int lossless = txfm_param->lossless;
  const int32_t *src = cast_to_int32(input);
  const TX_TYPE tx_type = txfm_param->tx_type;
  if (lossless) {
    assert(tx_type == DCT_DCT);
    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
    return;
  }
  av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                                bd);
}

static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  (void)bit;
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i v[32];
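  // The 32-point identity transform scales every coefficient by 4, so each
  // lane is shifted left by 2; the loop below is unrolled 16x by hand.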
  for (int i = 0; i < 32; i += 16) {
    v[i] = _mm_slli_epi32(in[i], 2);
    v[i + 1] = _mm_slli_epi32(in[i + 1], 2);
    v[i + 2] = _mm_slli_epi32(in[i + 2], 2);
    v[i + 3] = _mm_slli_epi32(in[i + 3], 2);
    v[i + 4] = _mm_slli_epi32(in[i + 4], 2);
    v[i + 5] = _mm_slli_epi32(in[i + 5], 2);
    v[i + 6] = _mm_slli_epi32(in[i + 6], 2);
    v[i + 7] = _mm_slli_epi32(in[i + 7], 2);
    v[i + 8] = _mm_slli_epi32(in[i + 8], 2);
    v[i + 9] = _mm_slli_epi32(in[i + 9], 2);
    v[i + 10] = _mm_slli_epi32(in[i + 10], 2);
    v[i + 11] = _mm_slli_epi32(in[i + 11], 2);
    v[i + 12] = _mm_slli_epi32(in[i + 12], 2);
    v[i + 13] = _mm_slli_epi32(in[i + 13], 2);
    v[i + 14] = _mm_slli_epi32(in[i + 14], 2);
    v[i + 15] = _mm_slli_epi32(in[i + 15], 2);
  }

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 32);
  } else {
    highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 32);
  }
}
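
// 1-D inverse kernels, indexed by [tx size][1-D type (DCT/ADST/identity)]
// [eob bucket]. Later buckets handle progressively more nonzero coefficients
// (low1/low8/low16/full); NULL marks variants with no SSE4.1 implementation.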
static const transform_1d_sse4_1
    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { idct4x4_sse4_1, NULL, NULL, NULL },
          { iadst4x4_sse4_1, NULL, NULL, NULL },
          { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL },
      },
      { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
        { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
        { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } },
      {
          { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
            NULL },
          { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
            NULL },
          { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL },
      },
      { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
          idct32x32_sse4_1 },
        { NULL, NULL, NULL, NULL },
        { iidentity32_sse4_1, NULL, NULL, NULL } },
      { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
          idct64x64_sse4_1 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };

static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m128i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int buf_size_w_div4 = input_stride >> 2;
  const int buf_size_h_div8 = (eoby + 8) >> 3;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

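  // Row pass: the horizontal transform is an identity, so no transpose is
  // needed; only rows up to eoby can carry nonzero coefficients.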
  for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
    __m128i buf0[16];
    const int32_t *input_row = input + i * input_stride * 4;
    for (int j = 0; j < buf_size_w_div4; ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
    }
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
                                           NewInvSqrt2);
    }
    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;

    for (int j = 0; j < buf_size_w_div4; ++j) {
      _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
      _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
      _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
      _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
    }
  }
  for (int i = 0; i < buf_size_w_div4; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  for (int i = 0; i < (txfm_size_col >> 3); i++) {
    highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i,
                                   stride, ud_flip, txfm_size_row, bd);
  }
}

static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m128i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int buf_size_w_div8 = input_stride >> 2;
  const int row_max = AOMMIN(32, txfm_size_row);
  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

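  // Row pass: the vertical transform is the identity, so the real work is the
  // horizontal kernel; each 4x4 tile of the nonzero (eobx) region is first
  // transposed into transform order.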
  for (int i = 0; i < (row_max >> 2); ++i) {
    __m128i buf0[16];
    const int32_t *input_row = input + i * input_stride * 4;
    for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);

      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
    }
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(
          buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
    }
    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                      buf0[4 * j],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        TRANSPOSE_4X4(
            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
      }
    }
  }
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  {
    for (int i = 0; i < (txfm_size_col >> 3); i++) {
      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                     output + 8 * i, stride, ud_flip,
                                     txfm_size_row, bd);
    }
  }
}

static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[64 * 4];
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int row_max = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

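  // Both passes are identity transforms (pure scaling), so no transposes are
  // needed; rows are scaled in place and copied into column-major order.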
  for (int i = 0; i < (row_max >> 2); ++i) {
    __m128i buf0[32];
    const int32_t *input_row = input + i * input_stride * 4;
    for (int j = 0; j < (input_stride >> 2); ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
    }
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
                                           NewInvSqrt2);
    }
    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    for (int j = 0; j < (input_stride >> 2); ++j) {
      _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
      _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
      _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
      _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
    }
  }
  for (int i = 0; i < (input_stride >> 2); i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  {
    for (int i = 0; i < (txfm_size_col >> 3); i++) {
      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                     output + 8 * i, stride, 0, txfm_size_row,
                                     bd);
    }
  }
}

static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m128i buf1[64 * 16];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 2;
  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
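  // (eobx/eoby + 8) >> 3 rounds the nonzero extent up to whole groups of
  // eight samples, so partially occupied groups are still transformed.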
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
    __m128i buf0[64];
    const int32_t *input_row = input + i * input_stride * 4;
    for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);

      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
    }
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(
          buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
    }
    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                      buf0[4 * j],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        TRANSPOSE_4X4(
            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
      }
    }
  }
  // 2nd stage: column transform
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  {
    for (int i = 0; i < (txfm_size_col >> 3); i++) {
      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                     output + 8 * i, stride, ud_flip,
                                     txfm_size_row, bd);
    }
  }
}

static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input,
                                            uint16_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size,
                                            int eob, const int bd) {
  (void)eob;
  __m128i buf1[8];
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
  const int input_stride = AOMMIN(32, txfm_size_col);

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[8];
  const int32_t *input_row = input;
  __m128i *buf0_cur = buf0;
  load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
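  // 4x8 is a 1:2 rectangle, so the coefficients are pre-scaled by 1/sqrt(2)
  // (NewInvSqrt2) to compensate for the extra sqrt(2) gain of the 2-D pair.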
  av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0,
                                       NewInvSqrt2);
  row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
  row_txfm(buf0 + 4, buf0 + 4, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
           -shift[0]);

  if (lr_flip) {
    TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
                  buf1[3]);

    TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
                  buf1[7]);
  } else {
    TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
                  buf1[3]);

    TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
                  buf1[7]);
  }

  // 2nd stage: column transform
  col_txfm(buf1, buf1, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);

  // write to buffer
  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
                                 bd);
}

static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input,
                                            uint16_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size,
                                            int eob, const int bd) {
  (void)eob;
  __m128i buf1[8];
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[8];
  const int32_t *input_row = input;
  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);

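  // Each 8-wide row loads as a pair of registers; de-interleave them into two
  // 4x4 tiles so the 8-point row kernel sees one coefficient per register.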
  TRANSPOSE_4X4(buf0[0], buf0[2], buf0[4], buf0[6], buf1[0], buf1[1], buf1[2],
                buf1[3]);
  TRANSPOSE_4X4(buf0[1], buf0[3], buf0[5], buf0[7], buf1[4], buf1[5], buf1[6],
                buf1[7]);

  av1_round_shift_rect_array_32_sse4_1(buf1, buf0, txfm_size_col, 0,
                                       NewInvSqrt2);
  row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);

  __m128i *buf1_ptr;
  if (lr_flip) {
    flip_buf_sse2(buf0, buf1, txfm_size_col);
    buf1_ptr = buf1;
  } else {
    buf1_ptr = buf0;
  }

  // 2nd stage: column transform
  for (int i = 0; i < 2; i++) {
    col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
  }
  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
  // write to buffer
  highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip,
                                 txfm_size_row, bd);
}

static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[16];
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_h_div8 = txfm_size_row >> 2;
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
  const int input_stride = AOMMIN(32, txfm_size_col);

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[16];
  const int32_t *input_row = input;
  __m128i *buf0_cur = buf0;
  load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
  for (int i = 0; i < (txfm_size_row >> 2); i++) {
    row_txfm(buf0 + (i << 2), buf0 + (i << 2),
             inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
  }

  av1_round_shift_array_32_sse4_1(buf0, buf0, txfm_size_row, -shift[0]);

  if (lr_flip) {
    for (int j = 0; j < buf_size_h_div8; ++j) {
      TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                    buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
                    buf1[4 * j + 3]);
    }
  } else {
    for (int j = 0; j < buf_size_h_div8; ++j) {
      TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
                    buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
                    buf1[4 * j + 2], buf1[4 * j + 3]);
    }
  }

  // 2nd stage: column transform
  col_txfm(buf1, buf1, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);

  // write to buffer
  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
                                 bd);
}

static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[16];
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 2;
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[16];
  const int32_t *input_row = input;
  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);

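  // Each 16-wide row spans four registers; transpose the 4x4 tiles so the
  // 16-point row kernel sees one coefficient position per register.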
  for (int j = 0; j < buf_size_w_div8; j++) {
    TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j],
                  buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]);
  }
  row_txfm(buf1, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);

  __m128i *buf1_ptr;
  if (lr_flip) {
    flip_buf_sse2(buf0, buf1, txfm_size_col);
    buf1_ptr = buf1;
  } else {
    buf1_ptr = buf0;
  }

  // 2nd stage: column transform
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
  }
  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);

  // write to buffer
  for (int i = 0; i < (txfm_size_col >> 3); i++) {
    highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2,
                                   output + 8 * i, stride, ud_flip,
                                   txfm_size_row, bd);
  }
}

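// Top-level 2-D highbd dispatcher: the kernel family is chosen by how many
// identity passes the transform type implies (none, one, or both).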
void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob, const int bd) {
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
    case FLIPADST_DCT:
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      highbd_inv_txfm2d_add_no_identity_sse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      highbd_inv_txfm2d_add_h_identity_ssse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
      highbd_inv_txfm2d_add_v_identity_ssse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    case IDTX:
      highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output),
                                        stride, tx_type, tx_size, eob, bd);
      break;
    default: assert(0); break;
  }
}

void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, tx_size, eob, bd);
}

void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, tx_size, eob, bd);
}

void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest,
                                         int stride,
                                         const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, tx_size, eob, bd);
}

void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest,
                                         int stride,
                                         const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, tx_size, eob, bd);
}

void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
                                    int stride, const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  const TX_SIZE tx_size = txfm_param->tx_size;
  switch (tx_size) {
    case TX_4X8:
      av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_8X4:
      av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
      break;
    default:
      // TODO(http://crbug.com/aomedia/2350): the remaining sse4_1 versions
      // cause test vector mismatches.
      av1_highbd_inv_txfm_add_c(input, dest, stride, txfm_param);
      break;
  }
}