• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include "config/aom_config.h"
13 #include "config/av1_rtcd.h"
14 
15 #include "av1/common/av1_inv_txfm1d_cfg.h"
16 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
17 #include "av1/common/x86/av1_txfm_sse2.h"
18 
19 // TODO(venkatsanampudi@ittiam.com): move this to header file
20 
21 // Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
22 static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
23                                           4 * 5793 };
24 
25 // TODO(binpengsmail@gmail.com): replace some for loop with do {} while
26 
// 4-point inverse DCT on 16-bit coefficients, 8 lanes per __m128i row.
// input/output: arrays of 4 rows.
static void idct4_sse2(const __m128i *input, __m128i *output) {
  // NOTE: cos_bit and __rounding appear unused here but are referenced by
  // the btf_16_sse2() butterfly macros from the enclosing scope — see
  // av1_txfm_sse2.h.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Cosine constants packed as 16-bit pairs for the butterfly rotations.
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

  // stage 1: gather coefficients in bit-reversed order (0, 2, 1, 3).
  __m128i x[4];
  x[0] = input[0];
  x[1] = input[2];
  x[2] = input[1];
  x[3] = input[3];

  // stage 2: rotation butterflies.
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);

  // stage 3: final add/sub butterflies straight into the output rows.
  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
}
52 
// 4-point inverse DCT for 4-wide blocks: identical flow to idct4_sse2()
// but uses the btf_16_4p_sse2() 4-pixel butterfly variant.
static void idct4_w4_sse2(const __m128i *input, __m128i *output) {
  // NOTE: cos_bit and __rounding are consumed inside the btf_16_4p_sse2()
  // macros (av1_txfm_sse2.h), not referenced directly below.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit cosine constant pairs for the butterflies.
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

  // stage 1: gather coefficients in bit-reversed order (0, 2, 1, 3).
  __m128i x[4];
  x[0] = input[0];
  x[1] = input[2];
  x[2] = input[1];
  x[3] = input[3];

  // stage 2: rotation butterflies (4-pixel-wide variant).
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);

  // stage 3: final add/sub butterflies into the output rows.
  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
}
78 
idct8_low1_ssse3(const __m128i * input,__m128i * output)79 static void idct8_low1_ssse3(const __m128i *input, __m128i *output) {
80   const int32_t *cospi = cospi_arr(INV_COS_BIT);
81 
82   // stage 1
83   __m128i x[2];
84   x[0] = input[0];
85 
86   // stage 2
87   // stage 3
88   btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
89 
90   // stage 4
91   // stage 5
92   output[0] = x[0];
93   output[7] = x[0];
94   output[1] = x[1];
95   output[6] = x[1];
96   output[2] = x[1];
97   output[5] = x[1];
98   output[3] = x[0];
99   output[4] = x[0];
100 }
101 
// Full 8-point inverse DCT on 16-bit coefficients, 8 lanes per row.
static void idct8_sse2(const __m128i *input, __m128i *output) {
  // NOTE: cos_bit and __rounding are referenced inside the btf_16_sse2()
  // macros (av1_txfm_sse2.h), not directly in this function.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit cosine constant pairs for the butterflies.
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: gather coefficients in bit-reversed order
  // (0, 4, 2, 6, 1, 5, 3, 7).
  __m128i x[8];
  x[0] = input[0];
  x[1] = input[4];
  x[2] = input[2];
  x[3] = input[6];
  x[4] = input[1];
  x[5] = input[5];
  x[6] = input[3];
  x[7] = input[7];

  // stage 2: rotations on the odd half.
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);

  // stage 3: rotations on the even half, add/sub on the odd half.
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);

  // stage 4
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);

  // stage 5: final mirror add/sub butterflies (i with 7 - i) into output.
  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}
149 
// 8-point inverse DCT for 4-wide blocks: same dataflow as idct8_sse2()
// but with the btf_16_4p_sse2() 4-pixel butterfly variant.
static void idct8_w4_sse2(const __m128i *input, __m128i *output) {
  // NOTE: cos_bit and __rounding are referenced inside the btf_16_4p_sse2()
  // macros (av1_txfm_sse2.h), not directly in this function.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit cosine constant pairs for the butterflies.
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: gather coefficients in bit-reversed order
  // (0, 4, 2, 6, 1, 5, 3, 7).
  __m128i x[8];
  x[0] = input[0];
  x[1] = input[4];
  x[2] = input[2];
  x[3] = input[6];
  x[4] = input[1];
  x[5] = input[5];
  x[6] = input[3];
  x[7] = input[7];

  // stage 2: rotations on the odd half.
  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);

  // stage 3: rotations on the even half, add/sub on the odd half.
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);

  // stage 4
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);

  // stage 5: final mirror add/sub butterflies (i with 7 - i) into output.
  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}
197 
// Stage 5 of the 16-point iDCT, shared by the full and low-coefficient
// variants.  __rounding and cos_bit are consumed by the btf_16_sse2()
// macro (av1_txfm_sse2.h), not referenced directly.
static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  // Even half: add/sub butterflies plus a cos(pi/4) rotation on x[5]/x[6].
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  // Odd half (x[8..15]): add/sub butterflies.
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
}
211 
// Stage 6 of the 16-point iDCT, shared by the full and low-coefficient
// variants.  __rounding and cos_bit are consumed by the btf_16_sse2()
// macro (av1_txfm_sse2.h), not referenced directly.
static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  // Even half: mirror add/sub butterflies (i with 7 - i).
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  // Odd half: cos(pi/4) rotations on the middle pairs.
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
}
224 
idct16_stage7_sse2(__m128i * output,__m128i * x)225 static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
226   btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
227   btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
228   btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
229   btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
230   btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
231   btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
232   btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
233   btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
234 }
235 
idct16_low1_ssse3(const __m128i * input,__m128i * output)236 static void idct16_low1_ssse3(const __m128i *input, __m128i *output) {
237   const int32_t *cospi = cospi_arr(INV_COS_BIT);
238 
239   // stage 1
240   __m128i x[2];
241   x[0] = input[0];
242 
243   // stage 2
244   // stage 3
245   // stage 4
246   btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
247 
248   // stage 5
249   // stage 6
250   // stage 7
251   output[0] = x[0];
252   output[15] = x[0];
253   output[1] = x[1];
254   output[14] = x[1];
255   output[2] = x[1];
256   output[13] = x[1];
257   output[3] = x[0];
258   output[12] = x[0];
259   output[4] = x[0];
260   output[11] = x[0];
261   output[5] = x[1];
262   output[10] = x[1];
263   output[6] = x[1];
264   output[9] = x[1];
265   output[7] = x[0];
266   output[8] = x[0];
267 }
268 
// 16-point inverse DCT specialized for when only the 8 lowest-frequency
// coefficients (input[0..7]) are non-zero.  The zero inputs let stages 2-4
// use single-input btf_16_ssse3() butterflies instead of full rotations.
static void idct16_low8_ssse3(const __m128i *input, __m128i *output) {
  // cos_bit and __rounding are consumed by the btf_16_sse2() macros and the
  // shared stage helpers, not referenced directly below.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

  // stage 1: scatter the 8 coefficients to their bit-reversed positions.
  __m128i x[16];
  x[0] = input[0];
  x[2] = input[4];
  x[4] = input[2];
  x[6] = input[6];
  x[8] = input[1];
  x[10] = input[5];
  x[12] = input[3];
  x[14] = input[7];

  // stage 2: single-input butterflies on the odd half.
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);

  // stage 3
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stages 5-7: shared with the full-precision idct16 path.
  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage7_sse2(output, x);
}
314 
// Full 16-point inverse DCT on 16-bit coefficients, 8 lanes per row.
static void idct16_sse2(const __m128i *input, __m128i *output) {
  // cos_bit and __rounding are consumed by the btf_16_sse2() macros and
  // the shared stage helpers, not referenced directly below.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit cosine constant pairs for the butterflies.
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

  // stage 1: gather coefficients in bit-reversed order.
  __m128i x[16];
  x[0] = input[0];
  x[1] = input[8];
  x[2] = input[4];
  x[3] = input[12];
  x[4] = input[2];
  x[5] = input[10];
  x[6] = input[6];
  x[7] = input[14];
  x[8] = input[1];
  x[9] = input[9];
  x[10] = input[5];
  x[11] = input[13];
  x[12] = input[3];
  x[13] = input[11];
  x[14] = input[7];
  x[15] = input[15];

  // stage 2: rotations on the odd half (x[8..15]).
  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);

  // stage 3
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stage 5~7: shared helpers.
  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage7_sse2(output, x);
}
386 
// 16-point inverse DCT for 4-wide blocks: same dataflow as idct16_sse2()
// but uses btf_16_4p_sse2() butterflies, so stages 5 and 6 are inlined
// instead of calling the 8-wide stage helpers.
static void idct16_w4_sse2(const __m128i *input, __m128i *output) {
  // cos_bit and __rounding are consumed by the btf_16_4p_sse2() macros
  // (av1_txfm_sse2.h), not referenced directly below.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit cosine constant pairs for the butterflies.
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: gather coefficients in bit-reversed order.
  __m128i x[16];
  x[0] = input[0];
  x[1] = input[8];
  x[2] = input[4];
  x[3] = input[12];
  x[4] = input[2];
  x[5] = input[10];
  x[6] = input[6];
  x[7] = input[14];
  x[8] = input[1];
  x[9] = input[9];
  x[10] = input[5];
  x[11] = input[13];
  x[12] = input[3];
  x[13] = input[11];
  x[14] = input[7];
  x[15] = input[15];

  // stage 2: rotations on the odd half (x[8..15]).
  btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);

  // stage 3
  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stage 5 (4-wide equivalent of idct16_stage5_sse2)
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);

  // stage 6 (4-wide equivalent of idct16_stage6_sse2)
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);

  // stage 7: add/sub-only, so the 8-wide helper is reused as-is.
  idct16_stage7_sse2(output, x);
}
474 
idct32_high16_stage3_sse2(__m128i * x)475 static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
476   btf_16_adds_subs_sse2(x[16], x[17]);
477   btf_16_subs_adds_sse2(x[19], x[18]);
478   btf_16_adds_subs_sse2(x[20], x[21]);
479   btf_16_subs_adds_sse2(x[23], x[22]);
480   btf_16_adds_subs_sse2(x[24], x[25]);
481   btf_16_subs_adds_sse2(x[27], x[26]);
482   btf_16_adds_subs_sse2(x[28], x[29]);
483   btf_16_subs_adds_sse2(x[31], x[30]);
484 }
485 
// Stage 4 of the 32-point iDCT, odd half only (x[16..31]).  __rounding and
// cos_bit are consumed by the btf_16_sse2() macro (av1_txfm_sse2.h).
static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  // Rotations pairing element i with its mirror 47 - i.
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
}
500 
// Stage 5 of the 32-point iDCT on x[8..31].  __rounding and cos_bit are
// consumed by the btf_16_sse2() macro (av1_txfm_sse2.h).
static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  // Rotations on x[8..15].
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  // Add/sub butterflies on x[16..31].
  btf_16_adds_subs_sse2(x[16], x[19]);
  btf_16_adds_subs_sse2(x[17], x[18]);
  btf_16_subs_adds_sse2(x[23], x[20]);
  btf_16_subs_adds_sse2(x[22], x[21]);
  btf_16_adds_subs_sse2(x[24], x[27]);
  btf_16_adds_subs_sse2(x[25], x[26]);
  btf_16_subs_adds_sse2(x[31], x[28]);
  btf_16_subs_adds_sse2(x[30], x[29]);
}
518 
// Stage 6 of the 32-point iDCT on x[4..31].  __rounding and cos_bit are
// consumed by the btf_16_sse2() macro (av1_txfm_sse2.h).
static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  // cos(pi/4) rotation on x[5]/x[6], add/sub butterflies on x[8..15].
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  // Rotations pairing element i with its mirror 47 - i on x[18..21]/[26..29].
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
}
537 
// Stage 7 of the 32-point iDCT.  __rounding and cos_bit are consumed by
// the btf_16_sse2() macro (av1_txfm_sse2.h).
static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  // Mirror add/sub butterflies on x[0..7] (i with 7 - i).
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  // cos(pi/4) rotations on the middle pairs of x[8..15].
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  // Mirror add/sub butterflies on x[16..31] (i with 39 - i).
  btf_16_adds_subs_sse2(x[16], x[23]);
  btf_16_adds_subs_sse2(x[17], x[22]);
  btf_16_adds_subs_sse2(x[18], x[21]);
  btf_16_adds_subs_sse2(x[19], x[20]);
  btf_16_subs_adds_sse2(x[31], x[24]);
  btf_16_subs_adds_sse2(x[30], x[25]);
  btf_16_subs_adds_sse2(x[29], x[26]);
  btf_16_subs_adds_sse2(x[28], x[27]);
}
558 
idct32_stage8_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)559 static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
560                                       const __m128i __rounding,
561                                       int8_t cos_bit) {
562   const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
563   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
564   btf_16_adds_subs_sse2(x[0], x[15]);
565   btf_16_adds_subs_sse2(x[1], x[14]);
566   btf_16_adds_subs_sse2(x[2], x[13]);
567   btf_16_adds_subs_sse2(x[3], x[12]);
568   btf_16_adds_subs_sse2(x[4], x[11]);
569   btf_16_adds_subs_sse2(x[5], x[10]);
570   btf_16_adds_subs_sse2(x[6], x[9]);
571   btf_16_adds_subs_sse2(x[7], x[8]);
572   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
573   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
574   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
575   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
576 }
577 
idct32_stage9_sse2(__m128i * output,__m128i * x)578 static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
579   btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
580   btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
581   btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
582   btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
583   btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
584   btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
585   btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
586   btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
587   btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
588   btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
589   btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
590   btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
591   btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
592   btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
593   btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
594   btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
595 }
596 
idct32_low1_ssse3(const __m128i * input,__m128i * output)597 static void idct32_low1_ssse3(const __m128i *input, __m128i *output) {
598   const int32_t *cospi = cospi_arr(INV_COS_BIT);
599 
600   // stage 1
601   __m128i x[2];
602   x[0] = input[0];
603 
604   // stage 2
605   // stage 3
606   // stage 4
607   // stage 5
608   btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
609 
610   // stage 6
611   // stage 7
612   // stage 8
613   // stage 9
614   output[0] = x[0];
615   output[31] = x[0];
616   output[1] = x[1];
617   output[30] = x[1];
618   output[2] = x[1];
619   output[29] = x[1];
620   output[3] = x[0];
621   output[28] = x[0];
622   output[4] = x[0];
623   output[27] = x[0];
624   output[5] = x[1];
625   output[26] = x[1];
626   output[6] = x[1];
627   output[25] = x[1];
628   output[7] = x[0];
629   output[24] = x[0];
630   output[8] = x[0];
631   output[23] = x[0];
632   output[9] = x[1];
633   output[22] = x[1];
634   output[10] = x[1];
635   output[21] = x[1];
636   output[11] = x[0];
637   output[20] = x[0];
638   output[12] = x[0];
639   output[19] = x[0];
640   output[13] = x[1];
641   output[18] = x[1];
642   output[14] = x[1];
643   output[17] = x[1];
644   output[15] = x[0];
645   output[16] = x[0];
646 }
647 
// 32-point inverse DCT specialized for when only the 8 lowest-frequency
// coefficients (input[0..7]) are non-zero.  Zero inputs turn the stage-2/3
// rotations into single-input btf_16_ssse3() butterflies and the add/sub
// stages into plain copies.
static void idct32_low8_ssse3(const __m128i *input, __m128i *output) {
  // cos_bit and __rounding are consumed by the shared stage helpers and
  // butterfly macros, not referenced directly below.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: scatter the 8 coefficients to their bit-reversed positions.
  __m128i x[32];
  x[0] = input[0];
  x[4] = input[4];
  x[8] = input[2];
  x[12] = input[6];
  x[16] = input[1];
  x[20] = input[5];
  x[24] = input[3];
  x[28] = input[7];

  // stage 2: single-input butterflies on the odd half.
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3: the zero partners reduce the add/sub butterflies to copies.
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];

  // stage 4: copies again, then the shared rotation helper.
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
  // stage 6
  x[3] = x[0];
  x[2] = x[1];
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stages 7-9: shared with the full-precision idct32 path.
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}
704 
// 32-point inverse DCT specialized for when only the 16 lowest-frequency
// coefficients (input[0..15]) are non-zero: stages 2-5 use single-input
// btf_16_ssse3() butterflies instead of full two-input rotations.
static void idct32_low16_ssse3(const __m128i *input, __m128i *output) {
  // cos_bit and __rounding are consumed by the shared stage helpers and
  // butterfly macros, not referenced directly below.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: scatter the 16 coefficients to their bit-reversed positions.
  __m128i x[32];
  x[0] = input[0];
  x[2] = input[8];
  x[4] = input[4];
  x[6] = input[12];
  x[8] = input[2];
  x[10] = input[10];
  x[12] = input[6];
  x[14] = input[14];
  x[16] = input[1];
  x[18] = input[9];
  x[20] = input[5];
  x[22] = input[13];
  x[24] = input[3];
  x[26] = input[11];
  x[28] = input[7];
  x[30] = input[15];

  // stage 2: single-input butterflies on the odd half (x[16..31]).
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  idct32_high16_stage3_sse2(x);

  // stage 4
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stages 7-9: shared with the full-precision idct32 path.
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}
770 
idct32_sse2(const __m128i * input,__m128i * output)771 static void idct32_sse2(const __m128i *input, __m128i *output) {
772   const int8_t cos_bit = INV_COS_BIT;
773   const int32_t *cospi = cospi_arr(INV_COS_BIT);
774   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
775 
776   const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
777   const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
778   const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
779   const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
780   const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
781   const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
782   const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
783   const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
784   const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
785   const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
786   const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
787   const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
788   const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
789   const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
790   const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
791   const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
792   const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
793   const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
794   const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
795   const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
796   const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
797   const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
798   const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
799   const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
800   const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
801   const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
802   const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
803   const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
804   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
805   const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
806   const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
807   const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
808 
809   // stage 1
810   __m128i x[32];
811   x[0] = input[0];
812   x[1] = input[16];
813   x[2] = input[8];
814   x[3] = input[24];
815   x[4] = input[4];
816   x[5] = input[20];
817   x[6] = input[12];
818   x[7] = input[28];
819   x[8] = input[2];
820   x[9] = input[18];
821   x[10] = input[10];
822   x[11] = input[26];
823   x[12] = input[6];
824   x[13] = input[22];
825   x[14] = input[14];
826   x[15] = input[30];
827   x[16] = input[1];
828   x[17] = input[17];
829   x[18] = input[9];
830   x[19] = input[25];
831   x[20] = input[5];
832   x[21] = input[21];
833   x[22] = input[13];
834   x[23] = input[29];
835   x[24] = input[3];
836   x[25] = input[19];
837   x[26] = input[11];
838   x[27] = input[27];
839   x[28] = input[7];
840   x[29] = input[23];
841   x[30] = input[15];
842   x[31] = input[31];
843 
844   // stage 2
845   btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
846   btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
847   btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
848   btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
849   btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
850   btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
851   btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
852   btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
853 
854   // stage 3
855   btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
856   btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
857   btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
858   btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
859   idct32_high16_stage3_sse2(x);
860 
861   // stage 4
862   btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
863   btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
864   btf_16_adds_subs_sse2(x[8], x[9]);
865   btf_16_subs_adds_sse2(x[11], x[10]);
866   btf_16_adds_subs_sse2(x[12], x[13]);
867   btf_16_subs_adds_sse2(x[15], x[14]);
868   idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
869 
870   // stage 5
871   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
872   btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
873   btf_16_adds_subs_sse2(x[4], x[5]);
874   btf_16_adds_subs_sse2(x[7], x[6]);
875   idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
876 
877   // stage 6
878   btf_16_adds_subs_sse2(x[0], x[3]);
879   btf_16_adds_subs_sse2(x[1], x[2]);
880   idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
881 
882   // stage 7~8
883   idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
884   idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
885   idct32_stage9_sse2(output, x);
886 }
887 
// Stage 4 of the 64-point inverse DCT for the odd-index half x[32..63]:
// applies the +/-cospi rotations to the mirrored pairs (33,62), (34,61),
// (37,58), (38,57), (41,54), (42,53), (45,50) and (46,49).  The other pairs
// in this range pass through this stage untouched.
// NOTE(review): __rounding and cos_bit appear unused here but are expected
// to be referenced from inside the btf_16_sse2() macro — confirm in
// av1/common/x86/av1_txfm_sse2.h.
static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
  const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
  const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
  btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
  btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
}
912 
// Stage 5 of the 64-point inverse DCT for x[16..63]: rotates the pairs
// (17,30), (18,29), (21,26), (22,25) and butterfly add/subs the x[32..63]
// range in mirrored groups of four.
static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  // Group-of-four butterflies over x[32..63]; adds_subs vs. subs_adds
  // selects which operand keeps the sum.
  btf_16_adds_subs_sse2(x[32], x[35]);
  btf_16_adds_subs_sse2(x[33], x[34]);
  btf_16_subs_adds_sse2(x[39], x[36]);
  btf_16_subs_adds_sse2(x[38], x[37]);
  btf_16_adds_subs_sse2(x[40], x[43]);
  btf_16_adds_subs_sse2(x[41], x[42]);
  btf_16_subs_adds_sse2(x[47], x[44]);
  btf_16_subs_adds_sse2(x[46], x[45]);
  btf_16_adds_subs_sse2(x[48], x[51]);
  btf_16_adds_subs_sse2(x[49], x[50]);
  btf_16_subs_adds_sse2(x[55], x[52]);
  btf_16_subs_adds_sse2(x[54], x[53]);
  btf_16_adds_subs_sse2(x[56], x[59]);
  btf_16_adds_subs_sse2(x[57], x[58]);
  btf_16_subs_adds_sse2(x[63], x[60]);
  btf_16_subs_adds_sse2(x[62], x[61]);
}
943 
// Stage 6 of the 64-point inverse DCT for the odd half x[32..63] only:
// applies the cospi8/cospi24 rotations to adjacent mirrored pairs
// (34,61)..(37,58) and (42,53)..(45,50).
static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
}
962 
// Stage 6 of the 64-point inverse DCT for x[16..63]: butterfly add/subs the
// x[16..31] range in mirrored groups of four, then delegates the x[32..63]
// rotations to idct64_stage6_high32_sse2().
static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  btf_16_adds_subs_sse2(x[16], x[19]);
  btf_16_adds_subs_sse2(x[17], x[18]);
  btf_16_subs_adds_sse2(x[23], x[20]);
  btf_16_subs_adds_sse2(x[22], x[21]);
  btf_16_adds_subs_sse2(x[24], x[27]);
  btf_16_adds_subs_sse2(x[25], x[26]);
  btf_16_subs_adds_sse2(x[31], x[28]);
  btf_16_subs_adds_sse2(x[30], x[29]);
  idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
}
976 
// Stage 7 of the 64-point inverse DCT for x[16..63]: cospi16/cospi48
// rotations on the (18,29), (19,28), (20,27), (21,26) pairs, and mirrored
// group-of-eight butterflies over x[32..63].
static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
  btf_16_adds_subs_sse2(x[32], x[39]);
  btf_16_adds_subs_sse2(x[33], x[38]);
  btf_16_adds_subs_sse2(x[34], x[37]);
  btf_16_adds_subs_sse2(x[35], x[36]);
  btf_16_subs_adds_sse2(x[47], x[40]);
  btf_16_subs_adds_sse2(x[46], x[41]);
  btf_16_subs_adds_sse2(x[45], x[42]);
  btf_16_subs_adds_sse2(x[44], x[43]);
  btf_16_adds_subs_sse2(x[48], x[55]);
  btf_16_adds_subs_sse2(x[49], x[54]);
  btf_16_adds_subs_sse2(x[50], x[53]);
  btf_16_adds_subs_sse2(x[51], x[52]);
  btf_16_subs_adds_sse2(x[63], x[56]);
  btf_16_subs_adds_sse2(x[62], x[57]);
  btf_16_subs_adds_sse2(x[61], x[58]);
  btf_16_subs_adds_sse2(x[60], x[59]);
}
1004 
// Stage 8 of the 64-point inverse DCT for x[16..63]: mirrored
// group-of-eight butterflies over x[16..31], then cospi16/cospi48 rotations
// on the x[36..43]/x[52..59] pairs.
static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_adds_subs_sse2(x[16], x[23]);
  btf_16_adds_subs_sse2(x[17], x[22]);
  btf_16_adds_subs_sse2(x[18], x[21]);
  btf_16_adds_subs_sse2(x[19], x[20]);
  btf_16_subs_adds_sse2(x[31], x[24]);
  btf_16_subs_adds_sse2(x[30], x[25]);
  btf_16_subs_adds_sse2(x[29], x[26]);
  btf_16_subs_adds_sse2(x[28], x[27]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
}
1028 
// Stage 9 of the 64-point inverse DCT: mirrored butterflies over x[0..15],
// +/-cospi32 rotations on the (20,27)..(23,24) pairs, and mirrored
// group-of-sixteen butterflies over x[32..63].
static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[15]);
  btf_16_adds_subs_sse2(x[1], x[14]);
  btf_16_adds_subs_sse2(x[2], x[13]);
  btf_16_adds_subs_sse2(x[3], x[12]);
  btf_16_adds_subs_sse2(x[4], x[11]);
  btf_16_adds_subs_sse2(x[5], x[10]);
  btf_16_adds_subs_sse2(x[6], x[9]);
  btf_16_adds_subs_sse2(x[7], x[8]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
  btf_16_adds_subs_sse2(x[32], x[47]);
  btf_16_adds_subs_sse2(x[33], x[46]);
  btf_16_adds_subs_sse2(x[34], x[45]);
  btf_16_adds_subs_sse2(x[35], x[44]);
  btf_16_adds_subs_sse2(x[36], x[43]);
  btf_16_adds_subs_sse2(x[37], x[42]);
  btf_16_adds_subs_sse2(x[38], x[41]);
  btf_16_adds_subs_sse2(x[39], x[40]);
  btf_16_subs_adds_sse2(x[63], x[48]);
  btf_16_subs_adds_sse2(x[62], x[49]);
  btf_16_subs_adds_sse2(x[61], x[50]);
  btf_16_subs_adds_sse2(x[60], x[51]);
  btf_16_subs_adds_sse2(x[59], x[52]);
  btf_16_subs_adds_sse2(x[58], x[53]);
  btf_16_subs_adds_sse2(x[57], x[54]);
  btf_16_subs_adds_sse2(x[56], x[55]);
}
1063 
idct64_stage10_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1064 static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
1065                                        const __m128i __rounding,
1066                                        int8_t cos_bit) {
1067   const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1068   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1069   btf_16_adds_subs_sse2(x[0], x[31]);
1070   btf_16_adds_subs_sse2(x[1], x[30]);
1071   btf_16_adds_subs_sse2(x[2], x[29]);
1072   btf_16_adds_subs_sse2(x[3], x[28]);
1073   btf_16_adds_subs_sse2(x[4], x[27]);
1074   btf_16_adds_subs_sse2(x[5], x[26]);
1075   btf_16_adds_subs_sse2(x[6], x[25]);
1076   btf_16_adds_subs_sse2(x[7], x[24]);
1077   btf_16_adds_subs_sse2(x[8], x[23]);
1078   btf_16_adds_subs_sse2(x[9], x[22]);
1079   btf_16_adds_subs_sse2(x[10], x[21]);
1080   btf_16_adds_subs_sse2(x[11], x[20]);
1081   btf_16_adds_subs_sse2(x[12], x[19]);
1082   btf_16_adds_subs_sse2(x[13], x[18]);
1083   btf_16_adds_subs_sse2(x[14], x[17]);
1084   btf_16_adds_subs_sse2(x[15], x[16]);
1085   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
1086   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
1087   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
1088   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
1089   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
1090   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
1091   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
1092   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
1093 }
1094 
idct64_stage11_sse2(__m128i * output,__m128i * x)1095 static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
1096   btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
1097   btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
1098   btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
1099   btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
1100   btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
1101   btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
1102   btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
1103   btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
1104   btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
1105   btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
1106   btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
1107   btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
1108   btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
1109   btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
1110   btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
1111   btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
1112   btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
1113   btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
1114   btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
1115   btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
1116   btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
1117   btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
1118   btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
1119   btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
1120   btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
1121   btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
1122   btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
1123   btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
1124   btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
1125   btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
1126   btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
1127   btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
1128 }
1129 
idct64_low1_ssse3(const __m128i * input,__m128i * output)1130 static void idct64_low1_ssse3(const __m128i *input, __m128i *output) {
1131   const int32_t *cospi = cospi_arr(INV_COS_BIT);
1132 
1133   // stage 1
1134   __m128i x[32];
1135   x[0] = input[0];
1136 
1137   // stage 2
1138   // stage 3
1139   // stage 4
1140   // stage 5
1141   // stage 6
1142   btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
1143 
1144   // stage 7
1145   // stage 8
1146   // stage 9
1147   // stage 10
1148   // stage 11
1149   output[0] = x[0];
1150   output[63] = x[0];
1151   output[1] = x[1];
1152   output[62] = x[1];
1153   output[2] = x[1];
1154   output[61] = x[1];
1155   output[3] = x[0];
1156   output[60] = x[0];
1157   output[4] = x[0];
1158   output[59] = x[0];
1159   output[5] = x[1];
1160   output[58] = x[1];
1161   output[6] = x[1];
1162   output[57] = x[1];
1163   output[7] = x[0];
1164   output[56] = x[0];
1165   output[8] = x[0];
1166   output[55] = x[0];
1167   output[9] = x[1];
1168   output[54] = x[1];
1169   output[10] = x[1];
1170   output[53] = x[1];
1171   output[11] = x[0];
1172   output[52] = x[0];
1173   output[12] = x[0];
1174   output[51] = x[0];
1175   output[13] = x[1];
1176   output[50] = x[1];
1177   output[14] = x[1];
1178   output[49] = x[1];
1179   output[15] = x[0];
1180   output[48] = x[0];
1181   output[16] = x[0];
1182   output[47] = x[0];
1183   output[17] = x[1];
1184   output[46] = x[1];
1185   output[18] = x[1];
1186   output[45] = x[1];
1187   output[19] = x[0];
1188   output[44] = x[0];
1189   output[20] = x[0];
1190   output[43] = x[0];
1191   output[21] = x[1];
1192   output[42] = x[1];
1193   output[22] = x[1];
1194   output[41] = x[1];
1195   output[23] = x[0];
1196   output[40] = x[0];
1197   output[24] = x[0];
1198   output[39] = x[0];
1199   output[25] = x[1];
1200   output[38] = x[1];
1201   output[26] = x[1];
1202   output[37] = x[1];
1203   output[27] = x[0];
1204   output[36] = x[0];
1205   output[28] = x[0];
1206   output[35] = x[0];
1207   output[29] = x[1];
1208   output[34] = x[1];
1209   output[30] = x[1];
1210   output[33] = x[1];
1211   output[31] = x[0];
1212   output[32] = x[0];
1213 }
1214 
// 64-point inverse DCT specialized for inputs whose only nonzero
// coefficients are input[0]..input[7]: full btf rotations collapse to the
// single-input btf_16_ssse3() form or to plain copies wherever one operand
// is known zero.
// NOTE(review): assumes input[8..63] are zero — implied by the _low8 name;
// confirm against the dispatching caller.
static void idct64_low8_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset consumed (together with cos_bit) inside the btf_16_*
  // macros.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1
  // Load the 8 live rows into their bit-reversed slots; the rest of x[] is
  // populated by later stages.
  __m128i x[64];
  x[0] = input[0];
  x[8] = input[4];
  x[16] = input[2];
  x[24] = input[6];
  x[32] = input[1];
  x[40] = input[5];
  x[48] = input[3];
  x[56] = input[7];

  // stage 2
  // Single-input rotations: each live odd coefficient produces both halves
  // of its pair.
  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  // Butterflies with a zero partner degenerate to copies.
  x[33] = x[32];
  x[38] = x[39];
  x[41] = x[40];
  x[46] = x[47];
  x[49] = x[48];
  x[54] = x[55];
  x[57] = x[56];
  x[62] = x[63];

  // stage 4
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  x[17] = x[16];
  x[22] = x[23];
  x[25] = x[24];
  x[30] = x[31];
  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);

  // stage 5
  x[9] = x[8];
  x[14] = x[15];
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  x[35] = x[32];
  x[34] = x[33];
  x[36] = x[39];
  x[37] = x[38];
  x[43] = x[40];
  x[42] = x[41];
  x[44] = x[47];
  x[45] = x[46];
  x[51] = x[48];
  x[50] = x[49];
  x[52] = x[55];
  x[53] = x[54];
  x[59] = x[56];
  x[58] = x[57];
  x[60] = x[63];
  x[61] = x[62];

  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  x[19] = x[16];
  x[18] = x[17];
  x[20] = x[23];
  x[21] = x[22];
  x[27] = x[24];
  x[26] = x[25];
  x[28] = x[31];
  x[29] = x[30];
  idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  x[11] = x[8];
  x[10] = x[9];
  x[12] = x[15];
  x[13] = x[14];
  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 8
  x[7] = x[0];
  x[6] = x[1];
  x[5] = x[2];
  x[4] = x[3];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

  // stages 9-11: shared tails with the other idct64 variants.
  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage11_sse2(output, x);
}
1333 
// 64-point inverse DCT specialized for inputs whose only nonzero
// coefficients are input[0]..input[15]: early-stage rotations use the
// single-input btf_16_ssse3() form and zero-partner butterflies become
// copies.
// NOTE(review): assumes input[16..63] are zero — implied by the _low16
// name; confirm against the dispatching caller.
static void idct64_low16_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset consumed (together with cos_bit) inside the btf_16_*
  // macros.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1
  // Load the 16 live rows into their bit-reversed slots.
  __m128i x[64];
  x[0] = input[0];
  x[4] = input[8];
  x[8] = input[4];
  x[12] = input[12];
  x[16] = input[2];
  x[20] = input[10];
  x[24] = input[6];
  x[28] = input[14];
  x[32] = input[1];
  x[36] = input[9];
  x[40] = input[5];
  x[44] = input[13];
  x[48] = input[3];
  x[52] = input[11];
  x[56] = input[7];
  x[60] = input[15];

  // stage 2
  // Single-input rotations producing both halves of each odd-frequency pair.
  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  // Butterflies with a zero partner degenerate to copies.
  x[33] = x[32];
  x[34] = x[35];
  x[37] = x[36];
  x[38] = x[39];
  x[41] = x[40];
  x[42] = x[43];
  x[45] = x[44];
  x[46] = x[47];
  x[49] = x[48];
  x[50] = x[51];
  x[53] = x[52];
  x[54] = x[55];
  x[57] = x[56];
  x[58] = x[59];
  x[61] = x[60];
  x[62] = x[63];

  // stage 4
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];
  idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 8
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

  // stages 9-11: shared tails with the other idct64 variants.
  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage11_sse2(output, x);
}
1448 
// 64-point inverse DCT (SSSE3) specialized for the case where only the first
// 32 input coefficients can be nonzero: stage 1 reads only input[0..31].
// Each __m128i carries 8 16-bit coefficient lanes; x[] holds the 64-entry
// butterfly state.  Stages 4-11 are shared with the other idct64 variants via
// the idct64_stage*_sse2 helpers.
static void idct64_low32_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE: cos_bit/__rounding are not referenced textually below; they are
  // picked up by name inside the btf_16_* macro expansions.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: scatter the 32 input rows into the even slots of x[]; the odd
  // slots are produced by the single-input rotations/copies in later stages.
  __m128i x[64];
  x[0] = input[0];
  x[2] = input[16];
  x[4] = input[8];
  x[6] = input[24];
  x[8] = input[4];
  x[10] = input[20];
  x[12] = input[12];
  x[14] = input[28];
  x[16] = input[2];
  x[18] = input[18];
  x[20] = input[10];
  x[22] = input[26];
  x[24] = input[6];
  x[26] = input[22];
  x[28] = input[14];
  x[30] = input[30];
  x[32] = input[1];
  x[34] = input[17];
  x[36] = input[9];
  x[38] = input[25];
  x[40] = input[5];
  x[42] = input[21];
  x[44] = input[13];
  x[46] = input[29];
  x[48] = input[3];
  x[50] = input[19];
  x[52] = input[11];
  x[54] = input[27];
  x[56] = input[7];
  x[58] = input[23];
  x[60] = input[15];
  x[62] = input[31];

  // stage 2: single-input rotations (the partner coefficient of each pair is
  // known to be zero in this low-coefficient path).
  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
  btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
  btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
  btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
  btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
  btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
  btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3: rotations on x[16..31] plus add/sub butterflies on x[32..63].
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  btf_16_adds_subs_sse2(x[32], x[33]);
  btf_16_subs_adds_sse2(x[35], x[34]);
  btf_16_adds_subs_sse2(x[36], x[37]);
  btf_16_subs_adds_sse2(x[39], x[38]);
  btf_16_adds_subs_sse2(x[40], x[41]);
  btf_16_subs_adds_sse2(x[43], x[42]);
  btf_16_adds_subs_sse2(x[44], x[45]);
  btf_16_subs_adds_sse2(x[47], x[46]);
  btf_16_adds_subs_sse2(x[48], x[49]);
  btf_16_subs_adds_sse2(x[51], x[50]);
  btf_16_adds_subs_sse2(x[52], x[53]);
  btf_16_subs_adds_sse2(x[55], x[54]);
  btf_16_adds_subs_sse2(x[56], x[57]);
  btf_16_subs_adds_sse2(x[59], x[58]);
  btf_16_adds_subs_sse2(x[60], x[61]);
  btf_16_subs_adds_sse2(x[63], x[62]);

  // stage 4
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  btf_16_adds_subs_sse2(x[16], x[17]);
  btf_16_subs_adds_sse2(x[19], x[18]);
  btf_16_adds_subs_sse2(x[20], x[21]);
  btf_16_subs_adds_sse2(x[23], x[22]);
  btf_16_adds_subs_sse2(x[24], x[25]);
  btf_16_subs_adds_sse2(x[27], x[26]);
  btf_16_adds_subs_sse2(x[28], x[29]);
  btf_16_subs_adds_sse2(x[31], x[30]);
  idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);
  idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 7
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 8
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 9~11: final butterflies and write-out, shared with other variants.
  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage11_sse2(output, x);
}
1596 
// 4-point inverse ADST over 8 columns (8 16-bit lanes per __m128i row).
// Products are widened to 32 bits with _mm_madd_epi16, summed, then rounded
// back to 16 bits with saturation.  In the comments below, x0..x3 denote the
// four input rows and sinN = sinpi[N].
static void iadst4_sse2(const __m128i *input, __m128i *output) {
  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
  __m128i x0[4];
  x0[0] = input[0];
  x0[1] = input[1];
  x0[2] = input[2];
  x0[3] = input[3];

  // Interleave rows (0,2) and (1,3) so _mm_madd_epi16 can form a*row_even +
  // b*row_odd per 32-bit lane; lo/hi pairs cover all 8 columns.
  __m128i u[4];
  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
  u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
  u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
  u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);

  __m128i x1[16];
  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
  x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
  x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
  x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
  x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02);  // x1*sin3 + x3*sin2
  x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
  x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04);  // x1*sin3 - x3*sin4
  x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
  x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
  x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
  x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x3*sin3
  x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
  x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
  x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
  x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
  x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);

  __m128i x2[8];
  x2[0] = _mm_add_epi32(x1[0], x1[4]);  // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
  x2[1] = _mm_add_epi32(x1[1], x1[5]);
  x2[2] = _mm_add_epi32(x1[2], x1[6]);  // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
  x2[3] = _mm_add_epi32(x1[3], x1[7]);
  x2[4] = _mm_add_epi32(x1[8], x1[10]);  // x0*sin3 -x2*sin3 +x3*sin3
  x2[5] = _mm_add_epi32(x1[9], x1[11]);
  x2[6] = _mm_add_epi32(x1[12], x1[14]);  // x0*sin4 +x2*sin2 -x1*sin3 -x3*sin1
  x2[7] = _mm_add_epi32(x1[13], x1[15]);

  // Round-shift the 32-bit sums back to saturated 16-bit outputs.
  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  for (int i = 0; i < 4; ++i) {
    __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
    __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
    out1 = _mm_srai_epi32(out1, INV_COS_BIT);
    output[i] = _mm_packs_epi32(out0, out1);
  }
}
1656 
// 4-point inverse ADST over 4 columns (only the low 4 lanes of each __m128i
// are meaningful).  Same math as iadst4_sse2 but with half the unpack/madd
// work; outputs duplicate the packed result into both halves of the register.
static void iadst4_w4_sse2(const __m128i *input, __m128i *output) {
  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
  __m128i x0[4];
  x0[0] = input[0];
  x0[1] = input[1];
  x0[2] = input[2];
  x0[3] = input[3];

  // Interleave rows (0,2) and (1,3) for _mm_madd_epi16; four columns fit in
  // the low halves, so only the unpacklo forms are needed.
  __m128i u[2];
  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
  u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);

  __m128i x1[8];
  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
  x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
  x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02);  // x1*sin3 + x3*sin2
  x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04);  // x1*sin3 - x3*sin4
  x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
  x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03);    // x3*sin3
  x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
  x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01);  // -x1*sin3 - x3*sin1

  __m128i x2[4];
  x2[0] = _mm_add_epi32(x1[0], x1[2]);  // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
  x2[1] = _mm_add_epi32(x1[1], x1[3]);  // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
  x2[2] = _mm_add_epi32(x1[4], x1[5]);  // x0*sin3 - x2*sin3 + x3*sin3
  x2[3] = _mm_add_epi32(x1[6], x1[7]);  // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1

  // Round-shift back to 16 bits; packing out0 with itself fills both halves.
  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  for (int i = 0; i < 4; ++i) {
    __m128i out0 = _mm_add_epi32(x2[i], rounding);
    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
    output[i] = _mm_packs_epi32(out0, out0);
  }
}
1700 
// 8-point inverse ADST specialized for the case where only input[0] is read
// (all other coefficients assumed zero), so several butterfly stages reduce
// to plain copies.
static void iadst8_low1_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __zero = _mm_setzero_si128();
  // NOTE: cos_bit/__rounding look unused here but are referenced by name
  // inside the btf_16_* macro expansions.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1: only the DC coefficient participates.
  __m128i x[8];
  x[1] = input[0];

  // stage 2: single-input rotation producing x[0]/x[1].
  btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);

  // stage 3: the add/sub butterfly degenerates to copies (partners are zero).
  x[4] = x[0];
  x[5] = x[1];

  // stage 4
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);

  // stage 5: again copies instead of butterflies.
  x[2] = x[0];
  x[3] = x[1];
  x[6] = x[4];
  x[7] = x[5];

  // stage 6: final pi/4 rotations.
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

  // stage 7: output permutation; odd outputs are negated.
  output[0] = x[0];
  output[1] = _mm_subs_epi16(__zero, x[4]);
  output[2] = x[6];
  output[3] = _mm_subs_epi16(__zero, x[2]);
  output[4] = x[3];
  output[5] = _mm_subs_epi16(__zero, x[7]);
  output[6] = x[5];
  output[7] = _mm_subs_epi16(__zero, x[1]);
}
1746 
// Full 8-point inverse ADST over 8 columns (8 16-bit lanes per __m128i).
static void iadst8_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __zero = _mm_setzero_si128();
  // NOTE: cos_bit/__rounding look unused here but are referenced by name
  // inside the btf_16_* macro expansions.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1: input permutation.
  __m128i x[8];
  x[0] = input[7];
  x[1] = input[0];
  x[2] = input[5];
  x[3] = input[2];
  x[4] = input[3];
  x[5] = input[4];
  x[6] = input[1];
  x[7] = input[6];

  // stage 2: pairwise rotations.
  btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);

  // stage 3: butterflies between the two halves.
  btf_16_adds_subs_sse2(x[0], x[4]);
  btf_16_adds_subs_sse2(x[1], x[5]);
  btf_16_adds_subs_sse2(x[2], x[6]);
  btf_16_adds_subs_sse2(x[3], x[7]);

  // stage 4: rotations on the upper half only.
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);

  // stage 5: butterflies within each half.
  btf_16_adds_subs_sse2(x[0], x[2]);
  btf_16_adds_subs_sse2(x[1], x[3]);
  btf_16_adds_subs_sse2(x[4], x[6]);
  btf_16_adds_subs_sse2(x[5], x[7]);

  // stage 6: final pi/4 rotations.
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

  // stage 7: output permutation; odd outputs are negated.
  output[0] = x[0];
  output[1] = _mm_subs_epi16(__zero, x[4]);
  output[2] = x[6];
  output[3] = _mm_subs_epi16(__zero, x[2]);
  output[4] = x[3];
  output[5] = _mm_subs_epi16(__zero, x[7]);
  output[6] = x[5];
  output[7] = _mm_subs_epi16(__zero, x[1]);
}
1814 
iadst8_w4_sse2(const __m128i * input,__m128i * output)1815 static void iadst8_w4_sse2(const __m128i *input, __m128i *output) {
1816   const int8_t cos_bit = INV_COS_BIT;
1817   const int32_t *cospi = cospi_arr(INV_COS_BIT);
1818   const __m128i __zero = _mm_setzero_si128();
1819   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1820 
1821   const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
1822   const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
1823   const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
1824   const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
1825   const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
1826   const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
1827   const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
1828   const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
1829   const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
1830   const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
1831   const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
1832   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1833   const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
1834 
1835   // stage 1
1836   __m128i x[8];
1837   x[0] = input[7];
1838   x[1] = input[0];
1839   x[2] = input[5];
1840   x[3] = input[2];
1841   x[4] = input[3];
1842   x[5] = input[4];
1843   x[6] = input[1];
1844   x[7] = input[6];
1845 
1846   // stage 2
1847   btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
1848   btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
1849   btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
1850   btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
1851 
1852   // stage 3
1853   btf_16_adds_subs_sse2(x[0], x[4]);
1854   btf_16_adds_subs_sse2(x[1], x[5]);
1855   btf_16_adds_subs_sse2(x[2], x[6]);
1856   btf_16_adds_subs_sse2(x[3], x[7]);
1857 
1858   // stage 4
1859   btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
1860   btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
1861 
1862   // stage 5
1863   btf_16_adds_subs_sse2(x[0], x[2]);
1864   btf_16_adds_subs_sse2(x[1], x[3]);
1865   btf_16_adds_subs_sse2(x[4], x[6]);
1866   btf_16_adds_subs_sse2(x[5], x[7]);
1867 
1868   // stage 6
1869   btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
1870   btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
1871 
1872   // stage 7
1873   output[0] = x[0];
1874   output[1] = _mm_subs_epi16(__zero, x[4]);
1875   output[2] = x[6];
1876   output[3] = _mm_subs_epi16(__zero, x[2]);
1877   output[4] = x[3];
1878   output[5] = _mm_subs_epi16(__zero, x[7]);
1879   output[6] = x[5];
1880   output[7] = _mm_subs_epi16(__zero, x[1]);
1881 }
1882 
iadst16_stage3_ssse3(__m128i * x)1883 static INLINE void iadst16_stage3_ssse3(__m128i *x) {
1884   btf_16_adds_subs_sse2(x[0], x[8]);
1885   btf_16_adds_subs_sse2(x[1], x[9]);
1886   btf_16_adds_subs_sse2(x[2], x[10]);
1887   btf_16_adds_subs_sse2(x[3], x[11]);
1888   btf_16_adds_subs_sse2(x[4], x[12]);
1889   btf_16_adds_subs_sse2(x[5], x[13]);
1890   btf_16_adds_subs_sse2(x[6], x[14]);
1891   btf_16_adds_subs_sse2(x[7], x[15]);
1892 }
1893 
// iadst16 stage 4: rotations applied to the upper half of the state
// (x[8..15]); the lower half passes through untouched.
// NOTE: __rounding/cos_bit appear unused but are referenced by name inside
// the btf_16_sse2 macro expansion.
static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
                                        const __m128i __rounding,
                                        int8_t cos_bit) {
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
  btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
  btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
}
1908 
iadst16_stage5_ssse3(__m128i * x)1909 static INLINE void iadst16_stage5_ssse3(__m128i *x) {
1910   btf_16_adds_subs_sse2(x[0], x[4]);
1911   btf_16_adds_subs_sse2(x[1], x[5]);
1912   btf_16_adds_subs_sse2(x[2], x[6]);
1913   btf_16_adds_subs_sse2(x[3], x[7]);
1914   btf_16_adds_subs_sse2(x[8], x[12]);
1915   btf_16_adds_subs_sse2(x[9], x[13]);
1916   btf_16_adds_subs_sse2(x[10], x[14]);
1917   btf_16_adds_subs_sse2(x[11], x[15]);
1918 }
1919 
// iadst16 stage 6: rotations on the (4,5)/(6,7) and (12,13)/(14,15) pairs;
// the other entries pass through untouched.
// NOTE: __rounding/cos_bit appear unused but are referenced by name inside
// the btf_16_sse2 macro expansion.
static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
                                        const __m128i __rounding,
                                        int8_t cos_bit) {
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
}
1931 
iadst16_stage7_ssse3(__m128i * x)1932 static INLINE void iadst16_stage7_ssse3(__m128i *x) {
1933   btf_16_adds_subs_sse2(x[0], x[2]);
1934   btf_16_adds_subs_sse2(x[1], x[3]);
1935   btf_16_adds_subs_sse2(x[4], x[6]);
1936   btf_16_adds_subs_sse2(x[5], x[7]);
1937   btf_16_adds_subs_sse2(x[8], x[10]);
1938   btf_16_adds_subs_sse2(x[9], x[11]);
1939   btf_16_adds_subs_sse2(x[12], x[14]);
1940   btf_16_adds_subs_sse2(x[13], x[15]);
1941 }
1942 
iadst16_stage8_ssse3(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1943 static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
1944                                         const __m128i __rounding,
1945                                         int8_t cos_bit) {
1946   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1947   const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
1948   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
1949   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
1950   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
1951   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
1952 }
1953 
iadst16_stage9_ssse3(__m128i * output,__m128i * x)1954 static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
1955   const __m128i __zero = _mm_setzero_si128();
1956   output[0] = x[0];
1957   output[1] = _mm_subs_epi16(__zero, x[8]);
1958   output[2] = x[12];
1959   output[3] = _mm_subs_epi16(__zero, x[4]);
1960   output[4] = x[6];
1961   output[5] = _mm_subs_epi16(__zero, x[14]);
1962   output[6] = x[10];
1963   output[7] = _mm_subs_epi16(__zero, x[2]);
1964   output[8] = x[3];
1965   output[9] = _mm_subs_epi16(__zero, x[11]);
1966   output[10] = x[15];
1967   output[11] = _mm_subs_epi16(__zero, x[7]);
1968   output[12] = x[5];
1969   output[13] = _mm_subs_epi16(__zero, x[13]);
1970   output[14] = x[9];
1971   output[15] = _mm_subs_epi16(__zero, x[1]);
1972 }
1973 
// 16-point inverse ADST specialized for the case where only input[0] is read
// (all other coefficients assumed zero), so butterfly stages with a zero
// partner reduce to plain copies.
static void iadst16_low1_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE: cos_bit/__rounding look unused here but are referenced by name
  // inside the btf_16_* macro expansions.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);

  // stage 1: only the DC coefficient participates.
  __m128i x[16];
  x[1] = input[0];

  // stage 2: single-input rotation producing x[0]/x[1].
  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);

  // stage 3: the butterfly degenerates to copies (partners are zero).
  x[8] = x[0];
  x[9] = x[1];

  // stage 4
  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);

  // stage 5: copies instead of butterflies.
  x[4] = x[0];
  x[5] = x[1];
  x[12] = x[8];
  x[13] = x[9];

  // stage 6
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);

  // stage 7: copies instead of butterflies.
  x[2] = x[0];
  x[3] = x[1];
  x[6] = x[4];
  x[7] = x[5];
  x[10] = x[8];
  x[11] = x[9];
  x[14] = x[12];
  x[15] = x[13];

  // stages 8-9: shared with the other iadst16 variants.
  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage9_ssse3(output, x);
}
2021 
// 16-point inverse ADST specialized for the case where only input[0..7] are
// read (upper coefficients assumed zero); stage 2 therefore uses the
// single-input btf_16_ssse3 rotations instead of full pair rotations.
static void iadst16_low8_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE: cos_bit/__rounding look unused here but are referenced by name
  // inside the btf_16_* macro expansions.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: scatter the 8 inputs; the remaining x[] slots are filled by the
  // stage-2 rotations below.
  __m128i x[16];
  x[1] = input[0];
  x[3] = input[2];
  x[5] = input[4];
  x[7] = input[6];
  x[8] = input[7];
  x[10] = input[5];
  x[12] = input[3];
  x[14] = input[1];

  // stage 2: single-input rotations (each partner coefficient is zero).
  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
  btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
  btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
  btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
  btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
  btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
  btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
  btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);

  // stage 3 onwards: shared with the other iadst16 variants.
  iadst16_stage3_ssse3(x);
  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage5_ssse3(x);
  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage7_ssse3(x);
  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage9_ssse3(output, x);
}
// Full 16-point inverse ADST over 8 columns (8 16-bit lanes per __m128i).
// Stages 3-9 are shared with the low-coefficient variants via the
// iadst16_stage*_ssse3 helpers.
static void iadst16_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE: cos_bit/__rounding look unused here but are referenced by name
  // inside the btf_16_* macro expansions.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);

  // stage 1: input permutation.
  __m128i x[16];
  x[0] = input[15];
  x[1] = input[0];
  x[2] = input[13];
  x[3] = input[2];
  x[4] = input[11];
  x[5] = input[4];
  x[6] = input[9];
  x[7] = input[6];
  x[8] = input[7];
  x[9] = input[8];
  x[10] = input[5];
  x[11] = input[10];
  x[12] = input[3];
  x[13] = input[12];
  x[14] = input[1];
  x[15] = input[14];

  // stage 2: pairwise rotations.
  btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
  btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
  btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
  btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
  btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);

  // stage 3~9
  iadst16_stage3_ssse3(x);
  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage5_ssse3(x);
  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage7_ssse3(x);
  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage9_ssse3(output, x);
}
2116 
// 16-point inverse ADST for 4-pixel-wide columns. Same flow graph as the
// 8-pixel-wide iadst16_sse2, but every rotation stage uses btf_16_4p_sse2,
// which processes 4 packed pixels per register.
static void iadst16_w4_sse2(const __m128i *input, __m128i *output) {
  // cos_bit and __rounding are not referenced by name below; presumably the
  // btf_16_4p_sse2 macro expands to code that picks them up from this
  // scope -- keep them in sync with INV_COS_BIT.
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed cosine-pair constants for the butterflies in stages 2, 4, 6, 8.
  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1: permute the inputs into the order the ADST lattice expects
  // (reversed odd coefficients interleaved with the even ones).
  __m128i x[16];
  x[0] = input[15];
  x[1] = input[0];
  x[2] = input[13];
  x[3] = input[2];
  x[4] = input[11];
  x[5] = input[4];
  x[6] = input[9];
  x[7] = input[6];
  x[8] = input[7];
  x[9] = input[8];
  x[10] = input[5];
  x[11] = input[10];
  x[12] = input[3];
  x[13] = input[12];
  x[14] = input[1];
  x[15] = input[14];

  // stage 2
  btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
  btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);

  // stage 3: add/subtract stage shared with the 8-wide version.
  iadst16_stage3_ssse3(x);

  // stage 4
  btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);

  // stage 5
  iadst16_stage5_ssse3(x);

  // stage 6
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);

  // stage 7
  iadst16_stage7_ssse3(x);

  // stage 8: final +/-32-degree rotations (multiplies by cos(pi/4)).
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);

  // stage 9: output permutation / sign flips.
  iadst16_stage9_ssse3(output, x);
}
2209 
iidentity4_ssse3(const __m128i * input,__m128i * output)2210 static void iidentity4_ssse3(const __m128i *input, __m128i *output) {
2211   const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
2212   const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
2213   for (int i = 0; i < 4; ++i) {
2214     __m128i x = _mm_mulhrs_epi16(input[i], scale);
2215     output[i] = _mm_adds_epi16(x, input[i]);
2216   }
2217 }
2218 
// 8-point inverse identity transform: the required scale factor is exactly
// 2, implemented as a saturating add of each input row with itself.
static void iidentity8_sse2(const __m128i *input, __m128i *output) {
  int r = 0;
  do {
    output[r] = _mm_adds_epi16(input[r], input[r]);
  } while (++r != 8);
}
2224 
iidentity16_ssse3(const __m128i * input,__m128i * output)2225 static void iidentity16_ssse3(const __m128i *input, __m128i *output) {
2226   const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
2227   const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
2228   for (int i = 0; i < 16; ++i) {
2229     __m128i x = _mm_mulhrs_epi16(input[i], scale);
2230     __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
2231     output[i] = _mm_adds_epi16(x, srcx2);
2232   }
2233 }
2234 
lowbd_get_recon_8x8_sse2(const __m128i pred,__m128i res)2235 static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
2236                                                __m128i res) {
2237   const __m128i zero = _mm_setzero_si128();
2238   __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
2239   return _mm_packus_epi16(x0, x0);
2240 }
2241 
lowbd_write_buffer_4xn_sse2(__m128i * in,uint8_t * output,int stride,int flipud,const int height)2242 static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
2243                                                int stride, int flipud,
2244                                                const int height) {
2245   int j = flipud ? (height - 1) : 0;
2246   const int step = flipud ? -1 : 1;
2247   const __m128i zero = _mm_setzero_si128();
2248   for (int i = 0; i < height; ++i, j += step) {
2249     const __m128i v = _mm_cvtsi32_si128(*((int *)(output + i * stride)));
2250     __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
2251     u = _mm_packus_epi16(u, zero);
2252     *((int *)(output + i * stride)) = _mm_cvtsi128_si32(u);
2253   }
2254 }
2255 
lowbd_write_buffer_8xn_sse2(__m128i * in,uint8_t * output,int stride,int flipud,const int height)2256 static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
2257                                                int stride, int flipud,
2258                                                const int height) {
2259   int j = flipud ? (height - 1) : 0;
2260   const int step = flipud ? -1 : 1;
2261   for (int i = 0; i < height; ++i, j += step) {
2262     const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
2263     const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
2264     _mm_storel_epi64((__m128i *)(output + i * stride), u);
2265   }
2266 }
2267 
// 1D functions process 8 pixels at one time.
// Full-range 1-D kernels indexed by [txfm size][1-D type], where the type
// axis is DCT / ADST / identity. NULL entries mark size/type combinations
// for which no kernel exists (only DCT is provided at 32 and 64 points).
static const transform_1d_ssse3
    lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
      { idct4_sse2, iadst4_sse2, iidentity4_ssse3 },
      { idct8_sse2, iadst8_sse2, iidentity8_sse2 },
      { idct16_sse2, iadst16_sse2, iidentity16_ssse3 },
      { idct32_sse2, NULL, NULL },
      { idct64_low32_ssse3, NULL, NULL },
    };
2277 
2278 // functions for blocks with eob at DC and within
2279 // topleft 8x8, 16x16, 32x32 corner
// Reduced-range 1-D kernels indexed by [txfm size][1-D type][range index],
// where the range index (derived from the eob via
// lowbd_txfm_all_1d_zeros_idx) selects among the low1/low8/low16/full
// variants, so fewer butterflies run when most coefficients are zero.
// NULL entries mark combinations that never occur.
static const transform_1d_ssse3
    lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { idct4_sse2, idct4_sse2, NULL, NULL },
          { iadst4_sse2, iadst4_sse2, NULL, NULL },
          { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL },
      },
      { { idct8_low1_ssse3, idct8_sse2, NULL, NULL },
        { iadst8_low1_ssse3, iadst8_sse2, NULL, NULL },
        { iidentity8_sse2, iidentity8_sse2, NULL, NULL } },
      {
          { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL },
          { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3,
          idct32_sse2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3,
          idct64_low32_ssse3 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
2304 
// 1D functions process 4 pixels at one time.
2306 // used in 4x4, 4x8, 4x16, 8x4, 16x4
// 4-pixel-wide 1-D kernels indexed by [txfm size][1-D type]
// (DCT / ADST / identity). Sizes above 16 points never use the
// 4-wide path, hence the NULL rows.
static const transform_1d_ssse3
    lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
      { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 },
      { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 },
      { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 },
      { NULL, NULL, NULL },
      { NULL, NULL, NULL },
    };
2315 
// Row identity transform over an 8-wide strip: loads `height` rows of
// 32-bit coefficients (narrowed to 16 bits), scales each by
// NewSqrt2list[txw_idx] (NewSqrt2Bits fixed point) and applies the row
// shift, writing 16-bit results to `out`. For rectangular blocks
// (rect_type +/-1) the coefficients are first multiplied by NewInvSqrt2
// to remove the extra sqrt(2) factor those blocks carry.
static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
                                           int stride, int shift, int height,
                                           int txw_idx, int rect_type) {
  const int32_t *input_row = input;
  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
  // Combined rounding term: fixed-point rounding plus the rounding for the
  // subsequent (NewSqrt2Bits - shift) arithmetic right shift.
  const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
                                          (1 << (NewSqrt2Bits - shift - 1)));
  const __m128i one = _mm_set1_epi16(1);
  // Interleaving (scale, rounding) against (src, 1) lets a single
  // _mm_madd_epi16 compute src * scale + rounding in 32 bits.
  const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
  if (rect_type != 1 && rect_type != -1) {
    for (int i = 0; i < height; ++i) {
      const __m128i src = load_32bit_to_16bit(input_row);
      input_row += stride;
      __m128i lo = _mm_unpacklo_epi16(src, one);
      __m128i hi = _mm_unpackhi_epi16(src, one);
      lo = _mm_madd_epi16(lo, scale_rounding);
      hi = _mm_madd_epi16(hi, scale_rounding);
      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm_packs_epi32(lo, hi);
    }
  } else {
    // Rectangular block: pre-scale by 1/sqrt(2) via the 15-bit mulhrs.
    const __m128i rect_scale =
        _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
    for (int i = 0; i < height; ++i) {
      __m128i src = load_32bit_to_16bit(input_row);
      src = _mm_mulhrs_epi16(src, rect_scale);
      input_row += stride;
      __m128i lo = _mm_unpacklo_epi16(src, one);
      __m128i hi = _mm_unpackhi_epi16(src, one);
      lo = _mm_madd_epi16(lo, scale_rounding);
      hi = _mm_madd_epi16(hi, scale_rounding);
      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm_packs_epi32(lo, hi);
    }
  }
}
2354 
// Column identity transform plus reconstruction for an 8-wide strip:
// scales each 16-bit residual row in `buf` by NewSqrt2list[txh_idx]
// (NewSqrt2Bits fixed point), applies the final right shift (`shift` is
// negative: the block shifts right by -shift with rounding), adds the
// predictor bytes from `output` and stores the clamped result in place.
static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
                                           __m128i *buf, int shift, int height,
                                           int txh_idx) {
  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
  const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
  const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
  const __m128i one = _mm_set1_epi16(1);
  // (scale, rounding) interleaved so one _mm_madd_epi16 against (src, 1)
  // computes src * scale + rounding in 32 bits.
  const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
  const __m128i zero = _mm_setzero_si128();
  for (int h = 0; h < height; ++h) {
    __m128i lo = _mm_unpacklo_epi16(buf[h], one);
    __m128i hi = _mm_unpackhi_epi16(buf[h], one);
    lo = _mm_madd_epi16(lo, scale_coeff);
    hi = _mm_madd_epi16(hi, scale_coeff);
    lo = _mm_srai_epi32(lo, NewSqrt2Bits);
    hi = _mm_srai_epi32(hi, NewSqrt2Bits);
    lo = _mm_add_epi32(lo, shift_rounding);
    hi = _mm_add_epi32(hi, shift_rounding);
    lo = _mm_srai_epi32(lo, -shift);
    hi = _mm_srai_epi32(hi, -shift);
    __m128i x = _mm_packs_epi32(lo, hi);

    // Add the 8 predictor bytes and store the clamped reconstruction.
    const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
    x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
    const __m128i u = _mm_packus_epi16(x, x);
    _mm_storel_epi64((__m128i *)(output), u);
    output += stride;
  }
}
2384 
// 2-D identity (IDTX) inverse transform + reconstruction. Processes the
// block one 8-column strip at a time: identity row pass into `buf`, then
// identity column pass which adds the prediction and stores pixels.
// Only the first min(32, w) columns and min(32, h) rows of coefficients
// are read -- coefficients outside that region are not coded.
static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
                                                   uint8_t *output, int stride,
                                                   TX_SIZE tx_size) {
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int row_max = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  __m128i buf[32];

  for (int i = 0; i < (input_stride >> 3); ++i) {
    iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max,
                            txw_idx, rect_type);
    iidentity_col_8xn_ssse3(output + 8 * i, stride, buf, shift[1], row_max,
                            txh_idx);
  }
}
2405 
// 4x4 inverse transform + reconstruction. Row pass on the transposed
// coefficients, transpose back (with an optional left-right flip), column
// pass, final round shift, then add to prediction. tx_size_ and eob are
// fixed by this kernel (TX_4X4, full block) and thus ignored.
static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[4];
  const TX_SIZE tx_size = TX_4X4;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
  transpose_16bit_4x4(buf, buf);
  row_txfm(buf, buf);
  // No round shift between the passes here (compare the 4x8/8x4 kernels,
  // where shift[0] is commented as 0 -- presumably 0 for 4x4 as well).
  if (lr_flip) {
    __m128i temp[4];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_4x4(temp, buf);
  } else {
    transpose_16bit_4x4(buf, buf);
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2441 
lowbd_get_recon_16x16_sse2(const __m128i pred,__m128i res0,__m128i res1)2442 static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
2443                                                  __m128i res0, __m128i res1) {
2444   const __m128i zero = _mm_setzero_si128();
2445   __m128i x0 = _mm_unpacklo_epi8(pred, zero);
2446   __m128i x1 = _mm_unpackhi_epi8(pred, zero);
2447   x0 = _mm_adds_epi16(res0, x0);
2448   x1 = _mm_adds_epi16(res1, x1);
2449   return _mm_packus_epi16(x0, x1);
2450 }
2451 
lowbd_write_buffer_16xn_sse2(__m128i * in,uint8_t * output,int stride,int flipud,int height)2452 static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output,
2453                                                 int stride, int flipud,
2454                                                 int height) {
2455   int j = flipud ? (height - 1) : 0;
2456   const int step = flipud ? -1 : 1;
2457   for (int i = 0; i < height; ++i, j += step) {
2458     __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
2459     __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
2460     _mm_storeu_si128((__m128i *)(output + i * stride), u);
2461   }
2462 }
2463 
round_shift_ssse3(const __m128i * input,__m128i * output,int size)2464 static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output,
2465                                      int size) {
2466   const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
2467   for (int i = 0; i < size; ++i) {
2468     output[i] = _mm_mulhrs_epi16(input[i], scale);
2469   }
2470 }
2471 
// Generic 2-D inverse transform + reconstruction for types with no
// identity pass. The eob is decomposed into per-direction extents
// (eobx, eoby) so reduced-range kernels can skip all-zero coefficient
// groups. Row pass results are stored transposed in buf1 (one 8-column
// strip per txfm_size_row entries), then the column pass runs in place
// and the result is added to the prediction.
static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m128i buf1[64 * 8];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Number of 8-wide / 8-tall coefficient groups that contain nonzeros.
  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Pick reduced-range 1-D kernels based on how far the eob reaches.
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    __m128i buf0[64];
    const int32_t *input_row = input + i * input_stride * 8;
    // Load and transpose each nonzero 8x8 coefficient tile of this row band.
    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
      __m128i *buf0_cur = buf0 + j * 8;
      load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
      transpose_16bit_8x8(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0);
    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    // Transpose into buf1; strip j of the output lands at offset
    // txfm_size_row * j (reversed when left-right flipped).
    __m128i *_buf1 = buf1 + i * 8;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        __m128i temp[8];
        flip_buf_sse2(buf0 + 8 * j, temp, 8);
        transpose_16bit_8x8(temp,
                            _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
      }
    }
  }
  // Column pass, in place, one 8-wide strip at a time.
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row);
    round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
  }

  // Reconstruction: 16-wide stores when possible, else 8-wide.
  // (4-wide blocks are handled by the dedicated 4xN kernels, not here.)
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
                                   output + 16 * i, stride, ud_flip,
                                   txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
  }
}
2542 
// 2-D inverse transform where the horizontal pass is the identity
// (V_DCT / V_ADST / V_FLIPADST). Each 8-column strip gets an identity row
// pass, the real column transform, then the final shift fused into the
// reconstruction via _mm_mulhrs_epi16.
static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  int eobx, eoby;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  // Only strips containing nonzero coefficients (per eobx) are processed.
  const int buf_size_w_div8 = (eobx + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Reduced-range column kernel selected from the vertical eob extent.
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
  assert(fun_idx < 5);
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];

  assert(col_txfm != NULL);

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_w_div8; i++) {
    __m128i buf0[64];
    iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0],
                            eoby + 1, txw_idx, rect_type);
    col_txfm(buf0, buf0);
    // shift[1] is <= 0; 1 << (15 + shift[1]) makes mulhrs scale by
    // 2^shift[1], i.e. the final rounded right shift.
    __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
    int k = ud_flip ? (txfm_size_row - 1) : 0;
    const int step = ud_flip ? -1 : 1;
    uint8_t *out = output + 8 * i;
    for (int j = 0; j < txfm_size_row; ++j, k += step) {
      const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
      __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
      const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
      _mm_storel_epi64((__m128i *)(out), u);
      out += stride;
    }
  }
}
2584 
// 2-D inverse transform where the vertical pass is the identity
// (H_DCT / H_ADST / H_FLIPADST). Each 8-row band gets the real row
// transform, then the identity column pass (which also adds the
// prediction and stores pixels) runs strip by strip.
static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m128i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Only row bands containing nonzero coefficients (per eoby) are run.
  const int buf_size_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Reduced-range row kernel selected from the horizontal eob extent.
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];

  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_h_div8; i++) {
    __m128i buf0[64];
    const int32_t *input_row = input + i * input_stride * 8;
    // At most 32 coefficient columns (4 tiles of 8) are ever coded.
    for (int j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) {
      __m128i *buf0_cur = buf0 + j * 8;
      load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
      transpose_16bit_8x8(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0);
    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    // Transpose each 8x8 tile back, reversing strip order on lr_flip.
    __m128i *_buf1 = buf1;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        __m128i temp[8];
        flip_buf_sse2(buf0 + 8 * j, temp, 8);
        transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
      }
    }

    for (int j = 0; j < buf_size_w_div8; ++j) {
      iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
                              buf1 + j * 8, shift[1], 8, txh_idx);
    }
  }
}
2640 
2641 // for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
lowbd_inv_txfm2d_add_universe_ssse3(const int32_t * input,uint8_t * output,int stride,TX_TYPE tx_type,TX_SIZE tx_size,int eob)2642 static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
2643     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
2644     TX_SIZE tx_size, int eob) {
2645   switch (tx_type) {
2646     case DCT_DCT:
2647       lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
2648                                              tx_size, eob);
2649       break;
2650     case IDTX:
2651       lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
2652       break;
2653     case V_DCT:
2654     case V_ADST:
2655     case V_FLIPADST:
2656       lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
2657                                             tx_size, eob);
2658       break;
2659     case H_DCT:
2660     case H_ADST:
2661     case H_FLIPADST:
2662       lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
2663                                             tx_size, eob);
2664       break;
2665     default:
2666       lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
2667                                              tx_size, eob);
2668       break;
2669   }
2670 }
2671 
// 4x8 inverse transform + reconstruction. Rectangular, so coefficients
// are first scaled by 1/sqrt(2) (round_shift_ssse3); the row pass uses
// the 8-wide kernels (rows are 8 long after transposition) and the
// column pass the 4-wide ones. tx_size_/eob are fixed for this kernel
// (TX_4X8, full block) and thus ignored.
static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[8];
  const TX_SIZE tx_size = TX_4X8;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
  transpose_16bit_4x8(buf, buf);
  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
  row_txfm(buf, buf);
  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
  if (lr_flip) {
    __m128i temp[4];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_8x4(temp, buf);
  } else {
    transpose_16bit_8x4(buf, buf);
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2709 
// 8x4 inverse transform + reconstruction: the transpose of the 4x8 case.
// Rectangular, so coefficients are first scaled by 1/sqrt(2); the row
// pass uses the 4-wide kernels (rows are 4 long after transposition) and
// the column pass the 8-wide ones. tx_size_/eob are fixed for this
// kernel (TX_8X4, full block) and thus ignored.
static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[8];
  const TX_SIZE tx_size = TX_8X4;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
  transpose_16bit_8x4(buf, buf);
  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
  row_txfm(buf, buf);
  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
  if (lr_flip) {
    __m128i temp[8];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_4x8(temp, buf);
  } else {
    transpose_16bit_4x8(buf, buf);
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2747 
// 4x16 inverse transform + reconstruction. The 16 rows are handled as two
// bands of 8; each band is transposed, row-transformed and transposed
// back before a single 16-point column pass. tx_size_/eob are fixed for
// this kernel (TX_4X16, full block) and thus ignored.
static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
                                            uint8_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size_,
                                            int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[16];
  const TX_SIZE tx_size = TX_4X16;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  const int row_one_loop = 8;
  for (int i = 0; i < 2; ++i) {
    const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
    __m128i *buf_cur = buf + i * row_one_loop;
    load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
                                  row_one_loop);
    transpose_16bit_4x8(buf_cur, buf_cur);
    if (row_txfm == iidentity4_ssse3) {
      // Identity rows: fold the sqrt(2) identity scale, the rectangular
      // correction and shift[0] into a single madd -- scale NewSqrt2 with
      // rounding 3 << (NewSqrt2Bits - 1), then >> (NewSqrt2Bits + 1).
      const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1));
      const __m128i ones = _mm_set1_epi16(1);
      for (int j = 0; j < 4; ++j) {
        const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones);
        const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones);
        const __m128i buf_32_lo =
            _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
        const __m128i buf_32_hi =
            _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
        buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
      }
    } else {
      row_txfm(buf_cur, buf_cur);
      round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
    }
    if (lr_flip) {
      __m128i temp[8];
      flip_buf_sse2(buf_cur, temp, txfm_size_col);
      transpose_16bit_8x4(temp, buf_cur);
    } else {
      transpose_16bit_8x4(buf_cur, buf_cur);
    }
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2805 
// Inverse 2-D transform + reconstruction for a 16x4 block (low bitdepth).
// Row pass: a 16-point 1-D transform across each of the 4 rows. Column
// pass: a 4-point 1-D transform run independently on the two 8-column
// halves. The final residual is added to `output` (with optional
// up/down flip). `tx_size_` and `eob` are unused here: the size is
// hard-coded to TX_16X4 and all 64 coefficients are always processed.
static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
                                            uint8_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size_,
                                            int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[16];
  const TX_SIZE tx_size = TX_16X4;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];  // per-pass round shifts
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];  // 16
  const int txfm_size_row = tx_size_high[tx_size];  // 4
  const int buf_size_w_div8 = txfm_size_col >> 3;   // 2 eight-column halves

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int row_one_loop = 8;
  // Narrow both halves of the 32-bit input to 16-bit registers and
  // transpose so the row transform sees contiguous rows.
  for (int i = 0; i < buf_size_w_div8; ++i) {
    const int32_t *input_cur = input + i * row_one_loop;
    __m128i *buf_cur = buf + i * row_one_loop;
    load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur,
                               txfm_size_row);
    transpose_16bit_8x4(buf_cur, buf_cur);
  }
  if (row_txfm == iidentity16_ssse3) {
    // Fast path for the identity row transform: fuse the 2*Sqrt2 scaling
    // and the rounding into a single madd per register. Interleaving with
    // `ones` makes the madd add the rounding constant
    // (3 << (NewSqrt2Bits - 1)) before the arithmetic shift by
    // NewSqrt2Bits + 1; the extra shift presumably also absorbs the
    // row-pass round shift the generic branch applies via shift[0] —
    // matches the structure of the generic path below.
    const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1));
    const __m128i ones = _mm_set1_epi16(1);
    for (int j = 0; j < 16; ++j) {
      const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones);
      const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones);
      const __m128i buf_32_lo =
          _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
      const __m128i buf_32_hi =
          _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
      // Saturating pack back to 16 bits.
      buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
    }
  } else {
    // Generic row transform followed by the row-pass round shift.
    row_txfm(buf, buf);
    round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
  }
  if (lr_flip) {
    // Horizontal flip: reverse the 16 row registers, then transpose each
    // half back to column order for the column pass.
    __m128i temp[16];
    flip_buf_sse2(buf, temp, 16);
    transpose_16bit_4x8(temp, buf);
    transpose_16bit_4x8(temp + 8, buf + 8);
  } else {
    transpose_16bit_4x8(buf, buf);
    transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
  }
  // Column transform + column-pass round shift on each 8-wide half.
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf + i * row_one_loop, buf + i * row_one_loop);
    round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
  }
  // Add the two halves of the residual into the destination block.
  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
  lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
}
2868 
// Dispatch a low-bitdepth inverse 2-D transform-and-add to the
// size-specialized kernel when one exists; every other transform size is
// handled by the generic "universe" implementation.
void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type,
                                    TX_SIZE tx_size, int eob) {
  if (tx_size == TX_4X4) {
    lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
                                   eob);
  } else if (tx_size == TX_4X8) {
    lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
                                   eob);
  } else if (tx_size == TX_8X4) {
    lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
                                   eob);
  } else if (tx_size == TX_4X16) {
    lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
                                    eob);
  } else if (tx_size == TX_16X4) {
    lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
                                    eob);
  } else {
    lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
                                        tx_size, eob);
  }
}
2899 
// Public inverse-transform entry point: lossless blocks fall back to the
// C reference implementation; everything else uses the SSSE3 low-bitdepth
// 2-D inverse transform.
void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                            const TxfmParam *txfm_param) {
  if (txfm_param->lossless) {
    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
    return;
  }
  av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, txfm_param->tx_type,
                                 txfm_param->tx_size, txfm_param->eob);
}
2911