/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/aom_config.h"

#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/common/x86/av1_inv_txfm_avx2.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"

// TODO(venkatsanampudi@ittiam.com): move this to header file

// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 in Q12 fixed point
// (5793 == round(4096 * sqrt(2))).
static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
                                          4 * 5793 };

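// Shared stage helpers for the idct16 variants below. As used throughout
// this file, btf_16_adds_subs_avx2(&a, &b) is the in-place add/sub butterfly
// on saturating 16-bit lanes (a' = a + b, b' = a - b), and btf_16_w16_avx2()
// rotates a register pair by the given cosine pair, rounding with _r and
// right-shifting by cos_bit.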
static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);

  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
}

static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
}

static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
  btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
  btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
  btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
  btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
  btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
  btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
  btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
  btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
}

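// Full 16-point inverse DCT over 16 lanes of 16-bit coefficients. Stage 1 is
// the usual bit-reversal input permutation; stages 2-4 are inlined and
// stages 5-7 use the shared helpers above.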
static void idct16_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);

  // stage 1
  __m256i x1[16];
  x1[0] = input[0];
  x1[1] = input[8];
  x1[2] = input[4];
  x1[3] = input[12];
  x1[4] = input[2];
  x1[5] = input[10];
  x1[6] = input[6];
  x1[7] = input[14];
  x1[8] = input[1];
  x1[9] = input[9];
  x1[10] = input[5];
  x1[11] = input[13];
  x1[12] = input[3];
  x1[13] = input[11];
  x1[14] = input[7];
  x1[15] = input[15];

  // stage 2
  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
                  INV_COS_BIT);

  // stage 3
  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
                  INV_COS_BIT);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);

  // stage 4
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
                  INV_COS_BIT);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
                  INV_COS_BIT);

  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  idct16_stage7_avx2(output, x1);
}

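// idct16 for the case where only the 8 lowest-frequency coefficient rows are
// nonzero: stage 1 loads input[0..7] only, and the early stages use
// btf_16_w16_0_avx2(), which (as used here) scales a single input by the two
// weights, the other butterfly input being known to be zero.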
static void idct16_low8_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);

  // stage 1
  __m256i x1[16];
  x1[0] = input[0];
  x1[2] = input[4];
  x1[4] = input[2];
  x1[6] = input[6];
  x1[8] = input[1];
  x1[10] = input[5];
  x1[12] = input[3];
  x1[14] = input[7];

  // stage 2
  btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);

  // stage 3
  btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);

  // stage 4
  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
                  INV_COS_BIT);

  idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
  idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  idct16_stage7_avx2(output, x1);
}

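// DC-only idct16: with just input[0] nonzero, every stage reduces to copies,
// so only the stage-4 rotation by cospi[32] is computed and its two results
// are fanned out to all 16 outputs.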
static void idct16_low1_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m256i x1[2];
  x1[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);

  // stage 5
  // stage 6
  output[0] = x1[0];
  output[1] = x1[1];
  output[2] = x1[1];
  output[3] = x1[0];
  output[4] = x1[0];
  output[5] = x1[1];
  output[6] = x1[1];
  output[7] = x1[0];
  output[8] = x1[0];
  output[9] = x1[1];
  output[10] = x1[1];
  output[11] = x1[0];
  output[12] = x1[0];
  output[13] = x1[1];
  output[14] = x1[1];
  output[15] = x1[0];
}

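// Stage helpers shared by the three iadst16 (16-point inverse ADST) variants
// below.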
static INLINE void iadst16_stage3_avx2(__m256i *x) {
  btf_16_adds_subs_avx2(&x[0], &x[8]);
  btf_16_adds_subs_avx2(&x[1], &x[9]);
  btf_16_adds_subs_avx2(&x[2], &x[10]);
  btf_16_adds_subs_avx2(&x[3], &x[11]);
  btf_16_adds_subs_avx2(&x[4], &x[12]);
  btf_16_adds_subs_avx2(&x[5], &x[13]);
  btf_16_adds_subs_avx2(&x[6], &x[14]);
  btf_16_adds_subs_avx2(&x[7], &x[15]);
}

static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
  const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
}

static INLINE void iadst16_stage5_avx2(__m256i *x) {
  btf_16_adds_subs_avx2(&x[0], &x[4]);
  btf_16_adds_subs_avx2(&x[1], &x[5]);
  btf_16_adds_subs_avx2(&x[2], &x[6]);
  btf_16_adds_subs_avx2(&x[3], &x[7]);
  btf_16_adds_subs_avx2(&x[8], &x[12]);
  btf_16_adds_subs_avx2(&x[9], &x[13]);
  btf_16_adds_subs_avx2(&x[10], &x[14]);
  btf_16_adds_subs_avx2(&x[11], &x[15]);
}

static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
}

static INLINE void iadst16_stage7_avx2(__m256i *x) {
  btf_16_adds_subs_avx2(&x[0], &x[2]);
  btf_16_adds_subs_avx2(&x[1], &x[3]);
  btf_16_adds_subs_avx2(&x[4], &x[6]);
  btf_16_adds_subs_avx2(&x[5], &x[7]);
  btf_16_adds_subs_avx2(&x[8], &x[10]);
  btf_16_adds_subs_avx2(&x[9], &x[11]);
  btf_16_adds_subs_avx2(&x[12], &x[14]);
  btf_16_adds_subs_avx2(&x[13], &x[15]);
}

static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
}

static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
  const __m256i __zero = _mm256_setzero_si256();
  output[0] = x1[0];
  output[1] = _mm256_subs_epi16(__zero, x1[8]);
  output[2] = x1[12];
  output[3] = _mm256_subs_epi16(__zero, x1[4]);
  output[4] = x1[6];
  output[5] = _mm256_subs_epi16(__zero, x1[14]);
  output[6] = x1[10];
  output[7] = _mm256_subs_epi16(__zero, x1[2]);
  output[8] = x1[3];
  output[9] = _mm256_subs_epi16(__zero, x1[11]);
  output[10] = x1[15];
  output[11] = _mm256_subs_epi16(__zero, x1[7]);
  output[12] = x1[5];
  output[13] = _mm256_subs_epi16(__zero, x1[13]);
  output[14] = x1[9];
  output[15] = _mm256_subs_epi16(__zero, x1[1]);
}

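// Full 16-point inverse ADST. Stage 1 reorders the input
// (x1[2i] = input[15 - 2i], x1[2i + 1] = input[2i]); stage 9 applies the
// final alternating sign flips.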
static void iadst16_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);

  // stage 1
  __m256i x1[16];
  x1[0] = input[15];
  x1[1] = input[0];
  x1[2] = input[13];
  x1[3] = input[2];
  x1[4] = input[11];
  x1[5] = input[4];
  x1[6] = input[9];
  x1[7] = input[6];
  x1[8] = input[7];
  x1[9] = input[8];
  x1[10] = input[5];
  x1[11] = input[10];
  x1[12] = input[3];
  x1[13] = input[12];
  x1[14] = input[1];
  x1[15] = input[14];

  // stage 2
  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r,
                  INV_COS_BIT);

  iadst16_stage3_avx2(x1);
  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
  iadst16_stage5_avx2(x1);
  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  iadst16_stage7_avx2(x1);
  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  iadst16_stage9_avx2(output, x1);
}

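// iadst16 when only the first 8 coefficient rows are nonzero; stage 2 uses
// the single-input butterflies, then the remaining stages are shared with
// the full transform.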
static void iadst16_low8_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1
  __m256i x1[16];
  x1[1] = input[0];
  x1[3] = input[2];
  x1[5] = input[4];
  x1[7] = input[6];
  x1[8] = input[7];
  x1[10] = input[5];
  x1[12] = input[3];
  x1[14] = input[1];

  // stage 2
  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
  btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
  btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
  btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
  btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
  btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
  btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
  btf_16_w16_0_avx2(cospi[58], cospi[6], x1[14], x1[14], x1[15]);

  iadst16_stage3_avx2(x1);
  iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
  iadst16_stage5_avx2(x1);
  iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
  iadst16_stage7_avx2(x1);
  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  iadst16_stage9_avx2(output, x1);
}

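// iadst16 with a single nonzero (DC) coefficient row: only the butterflies
// fed by nonzero values are evaluated, and the rest of x1[] is filled by
// copies before the last two shared stages.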
static void iadst16_low1_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);

  // stage 1
  __m256i x1[16];
  x1[1] = input[0];

  // stage 2
  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);

  // stage 3
  x1[8] = x1[0];
  x1[9] = x1[1];

  // stage 4
  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r,
                  INV_COS_BIT);

  // stage 5
  x1[4] = x1[0];
  x1[5] = x1[1];

  x1[12] = x1[8];
  x1[13] = x1[9];

  // stage 6
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r,
                  INV_COS_BIT);

  // stage 7
  x1[2] = x1[0];
  x1[3] = x1[1];
  x1[6] = x1[4];
  x1[7] = x1[5];
  x1[10] = x1[8];
  x1[11] = x1[9];
  x1[14] = x1[12];
  x1[15] = x1[13];

  iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  iadst16_stage9_avx2(output, x1);
}

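// Stage helpers for the idct32 variants. A "highN" suffix means the helper
// updates only the top N of the 32 intermediate values; the differing low
// elements are handled inline by each variant.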
static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
  btf_16_adds_subs_avx2(&x[16], &x[17]);
  btf_16_adds_subs_avx2(&x[19], &x[18]);
  btf_16_adds_subs_avx2(&x[20], &x[21]);
  btf_16_adds_subs_avx2(&x[23], &x[22]);
  btf_16_adds_subs_avx2(&x[24], &x[25]);
  btf_16_adds_subs_avx2(&x[27], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[29]);
  btf_16_adds_subs_avx2(&x[31], &x[30]);
}

static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
}

static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[16], &x[19]);
  btf_16_adds_subs_avx2(&x[17], &x[18]);
  btf_16_adds_subs_avx2(&x[23], &x[20]);
  btf_16_adds_subs_avx2(&x[22], &x[21]);
  btf_16_adds_subs_avx2(&x[24], &x[27]);
  btf_16_adds_subs_avx2(&x[25], &x[26]);
  btf_16_adds_subs_avx2(&x[31], &x[28]);
  btf_16_adds_subs_avx2(&x[30], &x[29]);
}

static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
}

static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[16], &x[23]);
  btf_16_adds_subs_avx2(&x[17], &x[22]);
  btf_16_adds_subs_avx2(&x[18], &x[21]);
  btf_16_adds_subs_avx2(&x[19], &x[20]);
  btf_16_adds_subs_avx2(&x[31], &x[24]);
  btf_16_adds_subs_avx2(&x[30], &x[25]);
  btf_16_adds_subs_avx2(&x[29], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[27]);
}

static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[15]);
  btf_16_adds_subs_avx2(&x[1], &x[14]);
  btf_16_adds_subs_avx2(&x[2], &x[13]);
  btf_16_adds_subs_avx2(&x[3], &x[12]);
  btf_16_adds_subs_avx2(&x[4], &x[11]);
  btf_16_adds_subs_avx2(&x[5], &x[10]);
  btf_16_adds_subs_avx2(&x[6], &x[9]);
  btf_16_adds_subs_avx2(&x[7], &x[8]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
}

static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
  btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
  btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
  btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
  btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
  btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
  btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
  btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
  btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
  btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
  btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
  btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
  btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
  btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
  btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
  btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
  btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
}

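// DC-only idct32: a single stage-5 rotation, with the two results fanned out
// to all 32 outputs in a repeating 0, 1, 1, 0 pattern.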
static void idct32_low1_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m256i x[2];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 6
  // stage 7
  // stage 8
  // stage 9
  output[0] = x[0];
  output[31] = x[0];
  output[1] = x[1];
  output[30] = x[1];
  output[2] = x[1];
  output[29] = x[1];
  output[3] = x[0];
  output[28] = x[0];
  output[4] = x[0];
  output[27] = x[0];
  output[5] = x[1];
  output[26] = x[1];
  output[6] = x[1];
  output[25] = x[1];
  output[7] = x[0];
  output[24] = x[0];
  output[8] = x[0];
  output[23] = x[0];
  output[9] = x[1];
  output[22] = x[1];
  output[10] = x[1];
  output[21] = x[1];
  output[11] = x[0];
  output[20] = x[0];
  output[12] = x[0];
  output[19] = x[0];
  output[13] = x[1];
  output[18] = x[1];
  output[14] = x[1];
  output[17] = x[1];
  output[15] = x[0];
  output[16] = x[0];
}

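// idct32 when only the 8 lowest-frequency coefficient rows are nonzero.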
static void idct32_low8_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1
  __m256i x[32];
  x[0] = input[0];
  x[4] = input[4];
  x[8] = input[2];
  x[12] = input[6];
  x[16] = input[1];
  x[20] = input[5];
  x[24] = input[3];
  x[28] = input[7];

  // stage 2
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];

  // stage 4
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 6
  x[3] = x[0];
  x[2] = x[1];
  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);

  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
  idct32_stage9_avx2(output, x);
}

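// idct32 when only the 16 lowest-frequency coefficient rows are nonzero.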
static void idct32_low16_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1
  __m256i x[32];
  x[0] = input[0];
  x[2] = input[8];
  x[4] = input[4];
  x[6] = input[12];
  x[8] = input[2];
  x[10] = input[10];
  x[12] = input[6];
  x[14] = input[14];
  x[16] = input[1];
  x[18] = input[9];
  x[20] = input[5];
  x[22] = input[13];
  x[24] = input[3];
  x[26] = input[11];
  x[28] = input[7];
  x[30] = input[15];

  // stage 2
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  idct32_high16_stage3_avx2(x);

  // stage 4
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_avx2(&x[8], &x[9]);
  btf_16_adds_subs_avx2(&x[11], &x[10]);
  btf_16_adds_subs_avx2(&x[12], &x[13]);
  btf_16_adds_subs_avx2(&x[15], &x[14]);
  idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_avx2(&x[4], &x[5]);
  btf_16_adds_subs_avx2(&x[7], &x[6]);
  idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 6
  btf_16_adds_subs_avx2(&x[0], &x[3]);
  btf_16_adds_subs_avx2(&x[1], &x[2]);
  idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);

  idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
  idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
  idct32_stage9_avx2(output, x);
}

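// Full 32-point inverse DCT; same structure as idct16_avx2, expanded to nine
// stages.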
static void idct32_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);

  // stage 1
  __m256i x1[32];
  x1[0] = input[0];
  x1[1] = input[16];
  x1[2] = input[8];
  x1[3] = input[24];
  x1[4] = input[4];
  x1[5] = input[20];
  x1[6] = input[12];
  x1[7] = input[28];
  x1[8] = input[2];
  x1[9] = input[18];
  x1[10] = input[10];
  x1[11] = input[26];
  x1[12] = input[6];
  x1[13] = input[22];
  x1[14] = input[14];
  x1[15] = input[30];
  x1[16] = input[1];
  x1[17] = input[17];
  x1[18] = input[9];
  x1[19] = input[25];
  x1[20] = input[5];
  x1[21] = input[21];
  x1[22] = input[13];
  x1[23] = input[29];
  x1[24] = input[3];
  x1[25] = input[19];
  x1[26] = input[11];
  x1[27] = input[27];
  x1[28] = input[7];
  x1[29] = input[23];
  x1[30] = input[15];
  x1[31] = input[31];

  // stage 2
  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r,
                  INV_COS_BIT);

  // stage 3
  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
                  INV_COS_BIT);
  idct32_high16_stage3_avx2(x1);

  // stage 4
  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
                  INV_COS_BIT);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  idct32_high16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);

  // stage 5
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
                  INV_COS_BIT);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  idct32_high24_stage5_avx2(x1, cospi, _r, INV_COS_BIT);

  // stage 6
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  idct32_high28_stage6_avx2(x1, cospi, _r, INV_COS_BIT);

  idct32_stage7_avx2(x1, cospi, _r, INV_COS_BIT);
  idct32_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
  idct32_stage9_avx2(output, x1);
}

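// Stage helpers for the idct64 variants below; as with idct32, a "highN"
// suffix means only the top N of the 64 intermediate values are updated.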
static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
}

static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[35]);
  btf_16_adds_subs_avx2(&x[33], &x[34]);
  btf_16_adds_subs_avx2(&x[39], &x[36]);
  btf_16_adds_subs_avx2(&x[38], &x[37]);
  btf_16_adds_subs_avx2(&x[40], &x[43]);
  btf_16_adds_subs_avx2(&x[41], &x[42]);
  btf_16_adds_subs_avx2(&x[47], &x[44]);
  btf_16_adds_subs_avx2(&x[46], &x[45]);
  btf_16_adds_subs_avx2(&x[48], &x[51]);
  btf_16_adds_subs_avx2(&x[49], &x[50]);
  btf_16_adds_subs_avx2(&x[55], &x[52]);
  btf_16_adds_subs_avx2(&x[54], &x[53]);
  btf_16_adds_subs_avx2(&x[56], &x[59]);
  btf_16_adds_subs_avx2(&x[57], &x[58]);
  btf_16_adds_subs_avx2(&x[63], &x[60]);
  btf_16_adds_subs_avx2(&x[62], &x[61]);
}

static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
}

static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  btf_16_adds_subs_avx2(&x[16], &x[19]);
  btf_16_adds_subs_avx2(&x[17], &x[18]);
  btf_16_adds_subs_avx2(&x[23], &x[20]);
  btf_16_adds_subs_avx2(&x[22], &x[21]);
  btf_16_adds_subs_avx2(&x[24], &x[27]);
  btf_16_adds_subs_avx2(&x[25], &x[26]);
  btf_16_adds_subs_avx2(&x[31], &x[28]);
  btf_16_adds_subs_avx2(&x[30], &x[29]);
  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
}

static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[39]);
  btf_16_adds_subs_avx2(&x[33], &x[38]);
  btf_16_adds_subs_avx2(&x[34], &x[37]);
  btf_16_adds_subs_avx2(&x[35], &x[36]);
  btf_16_adds_subs_avx2(&x[47], &x[40]);
  btf_16_adds_subs_avx2(&x[46], &x[41]);
  btf_16_adds_subs_avx2(&x[45], &x[42]);
  btf_16_adds_subs_avx2(&x[44], &x[43]);
  btf_16_adds_subs_avx2(&x[48], &x[55]);
  btf_16_adds_subs_avx2(&x[49], &x[54]);
  btf_16_adds_subs_avx2(&x[50], &x[53]);
  btf_16_adds_subs_avx2(&x[51], &x[52]);
  btf_16_adds_subs_avx2(&x[63], &x[56]);
  btf_16_adds_subs_avx2(&x[62], &x[57]);
  btf_16_adds_subs_avx2(&x[61], &x[58]);
  btf_16_adds_subs_avx2(&x[60], &x[59]);
}

static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_adds_subs_avx2(&x[16], &x[23]);
  btf_16_adds_subs_avx2(&x[17], &x[22]);
  btf_16_adds_subs_avx2(&x[18], &x[21]);
  btf_16_adds_subs_avx2(&x[19], &x[20]);
  btf_16_adds_subs_avx2(&x[31], &x[24]);
  btf_16_adds_subs_avx2(&x[30], &x[25]);
  btf_16_adds_subs_avx2(&x[29], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[27]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
}

static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[15]);
  btf_16_adds_subs_avx2(&x[1], &x[14]);
  btf_16_adds_subs_avx2(&x[2], &x[13]);
  btf_16_adds_subs_avx2(&x[3], &x[12]);
  btf_16_adds_subs_avx2(&x[4], &x[11]);
  btf_16_adds_subs_avx2(&x[5], &x[10]);
  btf_16_adds_subs_avx2(&x[6], &x[9]);
  btf_16_adds_subs_avx2(&x[7], &x[8]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[47]);
  btf_16_adds_subs_avx2(&x[33], &x[46]);
  btf_16_adds_subs_avx2(&x[34], &x[45]);
  btf_16_adds_subs_avx2(&x[35], &x[44]);
  btf_16_adds_subs_avx2(&x[36], &x[43]);
  btf_16_adds_subs_avx2(&x[37], &x[42]);
  btf_16_adds_subs_avx2(&x[38], &x[41]);
  btf_16_adds_subs_avx2(&x[39], &x[40]);
  btf_16_adds_subs_avx2(&x[63], &x[48]);
  btf_16_adds_subs_avx2(&x[62], &x[49]);
  btf_16_adds_subs_avx2(&x[61], &x[50]);
  btf_16_adds_subs_avx2(&x[60], &x[51]);
  btf_16_adds_subs_avx2(&x[59], &x[52]);
  btf_16_adds_subs_avx2(&x[58], &x[53]);
  btf_16_adds_subs_avx2(&x[57], &x[54]);
  btf_16_adds_subs_avx2(&x[56], &x[55]);
}

static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[31]);
  btf_16_adds_subs_avx2(&x[1], &x[30]);
  btf_16_adds_subs_avx2(&x[2], &x[29]);
  btf_16_adds_subs_avx2(&x[3], &x[28]);
  btf_16_adds_subs_avx2(&x[4], &x[27]);
  btf_16_adds_subs_avx2(&x[5], &x[26]);
  btf_16_adds_subs_avx2(&x[6], &x[25]);
  btf_16_adds_subs_avx2(&x[7], &x[24]);
  btf_16_adds_subs_avx2(&x[8], &x[23]);
  btf_16_adds_subs_avx2(&x[9], &x[22]);
  btf_16_adds_subs_avx2(&x[10], &x[21]);
  btf_16_adds_subs_avx2(&x[11], &x[20]);
  btf_16_adds_subs_avx2(&x[12], &x[19]);
  btf_16_adds_subs_avx2(&x[13], &x[18]);
  btf_16_adds_subs_avx2(&x[14], &x[17]);
  btf_16_adds_subs_avx2(&x[15], &x[16]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
}

static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
  btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
  btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
  btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
  btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
  btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
  btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
  btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
  btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
  btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
  btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
  btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
  btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
  btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
  btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
  btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
  btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
  btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
  btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
  btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
  btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
  btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
  btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
  btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
  btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
  btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
  btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
  btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
  btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
  btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
  btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
  btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
  btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
}

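// DC-only idct64: one stage-6 rotation, then x[0] and x[1] are fanned out to
// all 64 outputs in the same 0, 1, 1, 0 pattern as idct32_low1_avx2.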
static void idct64_low1_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m256i x[32];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 7
  // stage 8
  // stage 9
  // stage 10
  // stage 11
  output[0] = x[0];
  output[63] = x[0];
  output[1] = x[1];
  output[62] = x[1];
  output[2] = x[1];
  output[61] = x[1];
  output[3] = x[0];
  output[60] = x[0];
  output[4] = x[0];
  output[59] = x[0];
  output[5] = x[1];
  output[58] = x[1];
  output[6] = x[1];
  output[57] = x[1];
  output[7] = x[0];
  output[56] = x[0];
  output[8] = x[0];
  output[55] = x[0];
  output[9] = x[1];
  output[54] = x[1];
  output[10] = x[1];
  output[53] = x[1];
  output[11] = x[0];
  output[52] = x[0];
  output[12] = x[0];
  output[51] = x[0];
  output[13] = x[1];
  output[50] = x[1];
  output[14] = x[1];
  output[49] = x[1];
  output[15] = x[0];
  output[48] = x[0];
  output[16] = x[0];
  output[47] = x[0];
  output[17] = x[1];
  output[46] = x[1];
  output[18] = x[1];
  output[45] = x[1];
  output[19] = x[0];
  output[44] = x[0];
  output[20] = x[0];
  output[43] = x[0];
  output[21] = x[1];
  output[42] = x[1];
  output[22] = x[1];
  output[41] = x[1];
  output[23] = x[0];
  output[40] = x[0];
  output[24] = x[0];
  output[39] = x[0];
  output[25] = x[1];
  output[38] = x[1];
  output[26] = x[1];
  output[37] = x[1];
  output[27] = x[0];
  output[36] = x[0];
  output[28] = x[0];
  output[35] = x[0];
  output[29] = x[1];
  output[34] = x[1];
  output[30] = x[1];
  output[33] = x[1];
  output[31] = x[0];
  output[32] = x[0];
}

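// idct64 when only the 8 lowest-frequency coefficient rows are nonzero.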
idct64_low8_avx2(const __m256i * input,__m256i * output)1211 static void idct64_low8_avx2(const __m256i *input, __m256i *output) {
1212   const int32_t *cospi = cospi_arr(INV_COS_BIT);
1213   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
1214   const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
1215   const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
1216   const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
1217   const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
1218   const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
1219   const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
1220   const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
1221   const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
1222   const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
1223   const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
1224   const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
1225   const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
1226   const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1227   const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
1228   const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
1229   const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
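  // Each pair_set_w16_epi16(a, b) interleaves two 16-bit weights so that
  // _mm256_madd_epi16 can evaluate a 2-tap butterfly per 32-bit lane. A
  // scalar sketch of what btf_16_w16_avx2(w0, w1, &in0, &in1, _r, bit) is
  // expected to compute (assuming the usual AV1 butterfly definition):
  //   t0 = in0 * w0.lo + in1 * w0.hi;
  //   t1 = in0 * w1.lo + in1 * w1.hi;
  //   in0 = (t0 + (1 << (bit - 1))) >> bit;  // _r carries the rounding term
  //   in1 = (t1 + (1 << (bit - 1))) >> bit;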

  // stage 1
  __m256i x[64];
  x[0] = input[0];
  x[8] = input[4];
  x[16] = input[2];
  x[24] = input[6];
  x[32] = input[1];
  x[40] = input[5];
  x[48] = input[3];
  x[56] = input[7];
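  // Only the first 8 input rows can be nonzero in the low8 path; they are
  // scattered to bit-reversal-style positions (input k goes to x[8 * rev3(k)])
  // and every other x[] lane is treated as zero by the stages below.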

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[38] = x[39];
  x[41] = x[40];
  x[46] = x[47];
  x[49] = x[48];
  x[54] = x[55];
  x[57] = x[56];
  x[62] = x[63];

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  x[17] = x[16];
  x[22] = x[23];
  x[25] = x[24];
  x[30] = x[31];
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r,
                  INV_COS_BIT);

  // stage 5
  x[9] = x[8];
  x[14] = x[15];
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r,
                  INV_COS_BIT);
  x[35] = x[32];
  x[34] = x[33];
  x[36] = x[39];
  x[37] = x[38];
  x[43] = x[40];
  x[42] = x[41];
  x[44] = x[47];
  x[45] = x[46];
  x[51] = x[48];
  x[50] = x[49];
  x[52] = x[55];
  x[53] = x[54];
  x[59] = x[56];
  x[58] = x[57];
  x[60] = x[63];
  x[61] = x[62];

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  x[19] = x[16];
  x[18] = x[17];
  x[20] = x[23];
  x[21] = x[22];
  x[27] = x[24];
  x[26] = x[25];
  x[28] = x[31];
  x[29] = x[30];
  idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  x[11] = x[8];
  x[10] = x[9];
  x[12] = x[15];
  x[13] = x[14];
  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 8
  x[7] = x[0];
  x[6] = x[1];
  x[5] = x[2];
  x[4] = x[3];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
                  INV_COS_BIT);
  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);

  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage11_avx2(output, x);
}

static void idct64_low16_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m256i x[64];
  x[0] = input[0];
  x[4] = input[8];
  x[8] = input[4];
  x[12] = input[12];
  x[16] = input[2];
  x[20] = input[10];
  x[24] = input[6];
  x[28] = input[14];
  x[32] = input[1];
  x[36] = input[9];
  x[40] = input[5];
  x[44] = input[13];
  x[48] = input[3];
  x[52] = input[11];
  x[56] = input[7];
  x[60] = input[15];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[34] = x[35];
  x[37] = x[36];
  x[38] = x[39];
  x[41] = x[40];
  x[42] = x[43];
  x[45] = x[44];
  x[46] = x[47];
  x[49] = x[48];
  x[50] = x[51];
  x[53] = x[52];
  x[54] = x[55];
  x[57] = x[56];
  x[58] = x[59];
  x[61] = x[60];
  x[62] = x[63];

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];
  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 5
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
                  INV_COS_BIT);
  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 8
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
                  INV_COS_BIT);
  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);

  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage11_avx2(output, x);
}

static void idct64_low32_avx2(const __m256i *input, __m256i *output) {
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m256i x[64];
  x[0] = input[0];
  x[2] = input[16];
  x[4] = input[8];
  x[6] = input[24];
  x[8] = input[4];
  x[10] = input[20];
  x[12] = input[12];
  x[14] = input[28];
  x[16] = input[2];
  x[18] = input[18];
  x[20] = input[10];
  x[22] = input[26];
  x[24] = input[6];
  x[26] = input[22];
  x[28] = input[14];
  x[30] = input[30];
  x[32] = input[1];
  x[34] = input[17];
  x[36] = input[9];
  x[38] = input[25];
  x[40] = input[5];
  x[42] = input[21];
  x[44] = input[13];
  x[46] = input[29];
  x[48] = input[3];
  x[50] = input[19];
  x[52] = input[11];
  x[54] = input[27];
  x[56] = input[7];
  x[58] = input[23];
  x[60] = input[15];
  x[62] = input[31];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
  btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
  btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
  btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
  btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  btf_16_adds_subs_avx2(&x[32], &x[33]);
  btf_16_adds_subs_avx2(&x[35], &x[34]);
  btf_16_adds_subs_avx2(&x[36], &x[37]);
  btf_16_adds_subs_avx2(&x[39], &x[38]);
  btf_16_adds_subs_avx2(&x[40], &x[41]);
  btf_16_adds_subs_avx2(&x[43], &x[42]);
  btf_16_adds_subs_avx2(&x[44], &x[45]);
  btf_16_adds_subs_avx2(&x[47], &x[46]);
  btf_16_adds_subs_avx2(&x[48], &x[49]);
  btf_16_adds_subs_avx2(&x[51], &x[50]);
  btf_16_adds_subs_avx2(&x[52], &x[53]);
  btf_16_adds_subs_avx2(&x[55], &x[54]);
  btf_16_adds_subs_avx2(&x[56], &x[57]);
  btf_16_adds_subs_avx2(&x[59], &x[58]);
  btf_16_adds_subs_avx2(&x[60], &x[61]);
  btf_16_adds_subs_avx2(&x[63], &x[62]);

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  btf_16_adds_subs_avx2(&x[16], &x[17]);
  btf_16_adds_subs_avx2(&x[19], &x[18]);
  btf_16_adds_subs_avx2(&x[20], &x[21]);
  btf_16_adds_subs_avx2(&x[23], &x[22]);
  btf_16_adds_subs_avx2(&x[24], &x[25]);
  btf_16_adds_subs_avx2(&x[27], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[29]);
  btf_16_adds_subs_avx2(&x[31], &x[30]);
  idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 5
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_avx2(&x[8], &x[9]);
  btf_16_adds_subs_avx2(&x[11], &x[10]);
  btf_16_adds_subs_avx2(&x[12], &x[13]);
  btf_16_adds_subs_avx2(&x[15], &x[14]);
  idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_avx2(&x[4], &x[5]);
  btf_16_adds_subs_avx2(&x[7], &x[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
                  INV_COS_BIT);
  idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 7
  btf_16_adds_subs_avx2(&x[0], &x[3]);
  btf_16_adds_subs_avx2(&x[1], &x[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 8
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
                  INV_COS_BIT);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
                  INV_COS_BIT);
  idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);

  // stage 9~11
  idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
  idct64_stage11_avx2(output, x);
}

typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output);

// 1D transform functions that process 16 pixels at a time.
static const transform_1d_avx2
    lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      {
          { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
          { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2,
          idct64_low32_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
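
// The table above is indexed as [1D size][1D tx type][nonzero-range idx]: the
// first axis is the 1D transform length (4/8/16/32/64; the 4- and 8-point
// rows are NULL because those widths fall back to the SSSE3 path), the second
// selects DCT/ADST/identity, and the last picks a reduced-input variant via
// the eob-derived lowbd_txfm_all_1d_zeros_idx[] lookup. For example, an eob
// small enough to map to index 1 selects idct64_low8_avx2 for a 64-point pass.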

// Only processes blocks with w >= 16 and h >= 16.
static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m256i buf1[64 * 16];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div16 = txfm_size_col >> 4;
  const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4;
  const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
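  // (eobx + 16) >> 4 is ceil((eobx + 1) / 16): the number of 16-wide (or
  // 16-tall) units containing at least one nonzero coefficient, so the loops
  // below can skip all-zero 16x16 tiles entirely.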
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_avx2 row_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_avx2 col_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0]));
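  // shift[0] is negative, so 1 << (15 + shift[0]) is a Q15 fixed-point
  // constant and _mm256_mulhrs_epi16, which computes (x * c + 2^14) >> 15,
  // implements the rounding right-shift by -shift[0] in one instruction.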
  for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
    __m256i buf0[64];
    const int32_t *input_row = input + (i << 4) * input_stride;
    for (int j = 0; j < buf_size_nonzero_w_div16; ++j) {
      __m256i *buf0_cur = buf0 + j * 16;
      const int32_t *input_cur = input_row + j * 16;
      load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur,
                                          16);
      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_avx2(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0);
    for (int j = 0; j < txfm_size_col; ++j) {
      buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
    }

    __m256i *buf1_cur = buf1 + (i << 4);
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        __m256i temp[16];
        flip_buf_avx2(buf0 + 16 * j, temp, 16);
        int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
        transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
      }
    } else {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
      }
    }
  }
  const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
  for (int i = 0; i < buf_size_w_div16; i++) {
    __m256i *buf1_cur = buf1 + i * txfm_size_row;
    col_txfm(buf1_cur, buf1_cur);
    for (int j = 0; j < txfm_size_row; ++j) {
      buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
    }
  }
  for (int i = 0; i < buf_size_w_div16; i++) {
    lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
                                 stride, ud_flip, txfm_size_row);
  }
}

static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
                                           int stride, int shift, int height,
                                           int txw_idx, int rect_type) {
  const int32_t *input_row = input;
  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
  const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
                                       (1 << (NewSqrt2Bits - shift - 1)));
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
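  // Interleaving (scale, _r) against (src, 1) lets a single _mm256_madd_epi16
  // produce src * scale + _r per 32-bit lane, fusing the multiply with the
  // rounding-bias add; the srai below then completes the round-shift by
  // (NewSqrt2Bits - shift).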
  if (rect_type != 1 && rect_type != -1) {
    for (int i = 0; i < height; ++i) {
      const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
      input_row += stride;
      __m256i lo = _mm256_unpacklo_epi16(src, one);
      __m256i hi = _mm256_unpackhi_epi16(src, one);
      lo = _mm256_madd_epi16(lo, scale__r);
      hi = _mm256_madd_epi16(hi, scale__r);
      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm256_packs_epi32(lo, hi);
    }
  } else {
    const __m256i rect_scale =
        _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
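    // Rectangular blocks fold an extra 1/sqrt(2) into the row pass:
    // NewInvSqrt2 << (15 - NewSqrt2Bits) re-expresses that constant in Q15
    // so _mm256_mulhrs_epi16 can apply it with rounding.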
    for (int i = 0; i < height; ++i) {
      __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
      src = _mm256_mulhrs_epi16(src, rect_scale);
      input_row += stride;
      __m256i lo = _mm256_unpacklo_epi16(src, one);
      __m256i hi = _mm256_unpackhi_epi16(src, one);
      lo = _mm256_madd_epi16(lo, scale__r);
      hi = _mm256_madd_epi16(hi, scale__r);
      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm256_packs_epi32(lo, hi);
    }
  }
}

static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
                                           __m256i *buf, int shift, int height,
                                           int txh_idx) {
  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
  const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
  const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
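  // The column pass rounds in two steps: madd applies scale together with the
  // (1 << (NewSqrt2Bits - 1)) bias and srai shifts by NewSqrt2Bits, then
  // shift__r plus srai by -shift perform the final rounding right-shift
  // (shift is negative on this path).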
  for (int h = 0; h < height; ++h) {
    __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
    __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
    lo = _mm256_madd_epi16(lo, scale_coeff);
    hi = _mm256_madd_epi16(hi, scale_coeff);
    lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
    hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
    lo = _mm256_add_epi32(lo, shift__r);
    hi = _mm256_add_epi32(hi, shift__r);
    lo = _mm256_srai_epi32(lo, -shift);
    hi = _mm256_srai_epi32(hi, -shift);
    const __m256i x = _mm256_packs_epi32(lo, hi);
    write_recon_w16_avx2(x, output);
    output += stride;
  }
}

static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
                                                  uint8_t *output, int stride,
                                                  TX_SIZE tx_size,
                                                  int32_t eob) {
  (void)eob;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int row_max = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
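  // Both dimensions are capped at 32 because coefficients outside the first
  // 32 rows/columns of a 64-point transform are always zeroed out, and an
  // identity transform never spreads energy to other positions, so nothing
  // beyond that region needs to be read or added.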
  __m256i buf[32];
  for (int i = 0; i < input_stride; i += 16) {
    iidentity_row_16xn_avx2(buf, input + i, input_stride, shift[0], row_max,
                            txw_idx, rect_type);
    iidentity_col_16xn_avx2(output + i, stride, buf, shift[1], row_max,
                            txh_idx);
  }
}

static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  int eobx, eoby;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col);
  const int input_stride = txfm_size_col_notzero;
  const int buf_size_w_div16 = (eobx + 16) >> 4;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_avx2 col_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_w_div16; i++) {
    __m256i buf0[64];
    iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0],
                            eoby + 1, txw_idx, rect_type);
    col_txfm(buf0, buf0);
    __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
    int k = ud_flip ? (txfm_size_row - 1) : 0;
    const int step = ud_flip ? -1 : 1;
    for (int j = 0; j < txfm_size_row; ++j, k += step) {
      __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
      write_recon_w16_avx2(res, output + (i << 4) + j * stride);
    }
  }
}

static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m256i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div16 = txfm_size_col >> 4;
  const int buf_size_h_div16 = (eoby + 16) >> 4;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_avx2 row_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];

  assert(row_txfm != NULL);

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_h_div16; i++) {
    __m256i buf0[64];
    const int32_t *input_row = input + i * input_stride * 16;
    for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) {
      __m256i *buf0_cur = buf0 + j * 16;
      load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride,
                                          buf0_cur, 16);
      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_avx2(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0);
    round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
    __m256i *_buf1 = buf1;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        __m256i temp[16];
        flip_buf_avx2(buf0 + 16 * j, temp, 16);
        transpose_16bit_16x16_avx2(temp,
                                   _buf1 + 16 * (buf_size_w_div16 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
      }
    }
    for (int j = 0; j < buf_size_w_div16; ++j) {
      iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
                              buf1 + j * 16, shift[1], 16, txh_idx);
    }
  }
}

// for 16x16,32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64
static INLINE void lowbd_inv_txfm2d_add_universe_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:   // ADST in vertical, DCT in horizontal
    case DCT_ADST:   // DCT  in vertical, ADST in horizontal
    case ADST_ADST:  // ADST in both directions
    case FLIPADST_DCT:
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
                                            tx_size, eob);
      break;
    case IDTX:
      lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
      break;
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
                                           tx_size, eob);
      break;
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
      lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
                                           tx_size, eob);
      break;
    default:
      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
                                     eob);
      break;
  }
}

void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
                                   int eob) {
  switch (tx_size) {
    case TX_4X4:
    case TX_8X8:
    case TX_4X8:
    case TX_8X4:
    case TX_8X16:
    case TX_16X8:
    case TX_4X16:
    case TX_16X4:
    case TX_8X32:
    case TX_32X8:
      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
                                     eob);
      break;
    case TX_16X16:
    case TX_32X32:
    case TX_64X64:
    case TX_16X32:
    case TX_32X16:
    case TX_32X64:
    case TX_64X32:
    case TX_16X64:
    case TX_64X16:
    default:
      lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
                                         tx_size, eob);
      break;
  }
}

void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                           const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
  if (!txfm_param->lossless) {
    av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
                                  txfm_param->tx_size, txfm_param->eob);
  } else {
    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
  }
}