/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/aom_config.h"

#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/common/x86/av1_inv_txfm_avx2.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"

// TODO(venkatsanampudi@ittiam.com): move this to header file

// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
                                          4 * 5793 };
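
// Note: 5793 ~= round(2^12 * sqrt(2)), so entry i above is approximately
// round(2^12 * sqrt(2)^(i+1)): sqrt(2), 2, 2*sqrt(2), 4, 4*sqrt(2) in
// units of 1/4096. These fixed-point sqrt(2) powers are used when rescaling
// rectangular transform sizes.

// Rough semantics of the butterfly helpers used throughout this file (a
// sketch; the actual definitions live in av1_txfm_sse2.h and
// av1_inv_txfm_avx2.h), operating on 16 packed 16-bit lanes per __m256i:
//   btf_16_adds_subs_avx2(&a, &b):  a' = sat16(a + b), b' = sat16(a - b)
//   btf_16_w16_avx2(pair(c0, c1), pair(c2, c3), &a, &b, _r, bit):
//     a' = (c0 * a + c1 * b + r) >> bit,  b' = (c2 * a + c3 * b + r) >> bit
// i.e. a saturating add/sub pair and a rounded fixed-point rotation, with
// the rounding term r = 1 << (bit - 1) supplied broadcast in _r.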

static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);

  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
}

static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
}

static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
  btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
  btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
  btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
  btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
  btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
  btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
  btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
  btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
}

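// Full 16-point inverse DCT over 16 lanes of 16-bit coefficients. Stage 1 is
// the bit-reversed input permutation; cos_bit is unused because the cospi
// table is generated at the fixed INV_COS_BIT precision.
//
// Hypothetical usage sketch (assumes the caller has already loaded and
// transposed one 16x16 block into 16 rows of 16 int16 coefficients):
//   __m256i buf[16];
//   idct16_new_avx2(buf, buf, INV_COS_BIT);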
static void idct16_new_avx2(const __m256i *input, __m256i *output,
                            int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);

  // stage 1
  __m256i x1[16];
  x1[0] = input[0];
  x1[1] = input[8];
  x1[2] = input[4];
  x1[3] = input[12];
  x1[4] = input[2];
  x1[5] = input[10];
  x1[6] = input[6];
  x1[7] = input[14];
  x1[8] = input[1];
  x1[9] = input[9];
  x1[10] = input[5];
  x1[11] = input[13];
  x1[12] = input[3];
  x1[13] = input[11];
  x1[14] = input[7];
  x1[15] = input[15];

  // stage 2
  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);

  // stage 3
  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);

  // stage 4
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);

  idct16_stage5_avx2(x1, cospi, _r, cos_bit);
  idct16_stage6_avx2(x1, cospi, _r, cos_bit);
  idct16_stage7_avx2(output, x1);
}

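// Fast path for blocks where only the first 8 input rows can be nonzero.
// btf_16_w16_0_avx2(w0, w1, in, out0, out1) is the single-input butterfly:
// with the second operand known to be zero it reduces to roughly
// out0 = round(w0 * in / 2^12) and out1 = round(w1 * in / 2^12), so stages
// 2-4 here avoid full two-input rotations.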
static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
                                 int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);

  // stage 1
  __m256i x1[16];
  x1[0] = input[0];
  x1[2] = input[4];
  x1[4] = input[2];
  x1[6] = input[6];
  x1[8] = input[1];
  x1[10] = input[5];
  x1[12] = input[3];
  x1[14] = input[7];

  // stage 2
  btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);

  // stage 3
  btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);

  // stage 4
  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);

  idct16_stage5_avx2(x1, cospi, _r, cos_bit);
  idct16_stage6_avx2(x1, cospi, _r, cos_bit);
  idct16_stage7_avx2(output, x1);
}

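// DC-only path: stages 2-4 collapse to a single cospi[32] scaling, and the
// remaining add/sub ladders reduce to the repeating x1[0]/x1[1] pattern
// written out explicitly below.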
static void idct16_low1_new_avx2(const __m256i *input, __m256i *output,
                                 int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m256i x1[2];
  x1[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);

  // stage 5
  // stage 6
  output[0] = x1[0];
  output[1] = x1[1];
  output[2] = x1[1];
  output[3] = x1[0];
  output[4] = x1[0];
  output[5] = x1[1];
  output[6] = x1[1];
  output[7] = x1[0];
  output[8] = x1[0];
  output[9] = x1[1];
  output[10] = x1[1];
  output[11] = x1[0];
  output[12] = x1[0];
  output[13] = x1[1];
  output[14] = x1[1];
  output[15] = x1[0];
}

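// iadst16 is decomposed into nine stages: the odd stages are add/sub
// ladders, the even stages are cospi rotations. The helpers below implement
// the shared middle stages so the full, low8 and low1 variants can reuse
// them.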
static INLINE void iadst16_stage3_avx2(__m256i *x) {
  btf_16_adds_subs_avx2(&x[0], &x[8]);
  btf_16_adds_subs_avx2(&x[1], &x[9]);
  btf_16_adds_subs_avx2(&x[2], &x[10]);
  btf_16_adds_subs_avx2(&x[3], &x[11]);
  btf_16_adds_subs_avx2(&x[4], &x[12]);
  btf_16_adds_subs_avx2(&x[5], &x[13]);
  btf_16_adds_subs_avx2(&x[6], &x[14]);
  btf_16_adds_subs_avx2(&x[7], &x[15]);
}

static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
  const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
}

static INLINE void iadst16_stage5_avx2(__m256i *x) {
  btf_16_adds_subs_avx2(&x[0], &x[4]);
  btf_16_adds_subs_avx2(&x[1], &x[5]);
  btf_16_adds_subs_avx2(&x[2], &x[6]);
  btf_16_adds_subs_avx2(&x[3], &x[7]);
  btf_16_adds_subs_avx2(&x[8], &x[12]);
  btf_16_adds_subs_avx2(&x[9], &x[13]);
  btf_16_adds_subs_avx2(&x[10], &x[14]);
  btf_16_adds_subs_avx2(&x[11], &x[15]);
}

static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
}

static INLINE void iadst16_stage7_avx2(__m256i *x) {
  btf_16_adds_subs_avx2(&x[0], &x[2]);
  btf_16_adds_subs_avx2(&x[1], &x[3]);
  btf_16_adds_subs_avx2(&x[4], &x[6]);
  btf_16_adds_subs_avx2(&x[5], &x[7]);
  btf_16_adds_subs_avx2(&x[8], &x[10]);
  btf_16_adds_subs_avx2(&x[9], &x[11]);
  btf_16_adds_subs_avx2(&x[12], &x[14]);
  btf_16_adds_subs_avx2(&x[13], &x[15]);
}

static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
}

static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
  const __m256i __zero = _mm256_setzero_si256();
  output[0] = x1[0];
  output[1] = _mm256_subs_epi16(__zero, x1[8]);
  output[2] = x1[12];
  output[3] = _mm256_subs_epi16(__zero, x1[4]);
  output[4] = x1[6];
  output[5] = _mm256_subs_epi16(__zero, x1[14]);
  output[6] = x1[10];
  output[7] = _mm256_subs_epi16(__zero, x1[2]);
  output[8] = x1[3];
  output[9] = _mm256_subs_epi16(__zero, x1[11]);
  output[10] = x1[15];
  output[11] = _mm256_subs_epi16(__zero, x1[7]);
  output[12] = x1[5];
  output[13] = _mm256_subs_epi16(__zero, x1[13]);
  output[14] = x1[9];
  output[15] = _mm256_subs_epi16(__zero, x1[1]);
}

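// Full 16-point inverse ADST. Stage 1 places the odd-indexed coefficients in
// reverse order into the even x1 slots and the even-indexed coefficients in
// order into the odd slots; stage 9 restores output order with alternating
// negation.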
static void iadst16_new_avx2(const __m256i *input, __m256i *output,
                             int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);

  // stage 1
  __m256i x1[16];
  x1[0] = input[15];
  x1[1] = input[0];
  x1[2] = input[13];
  x1[3] = input[2];
  x1[4] = input[11];
  x1[5] = input[4];
  x1[6] = input[9];
  x1[7] = input[6];
  x1[8] = input[7];
  x1[9] = input[8];
  x1[10] = input[5];
  x1[11] = input[10];
  x1[12] = input[3];
  x1[13] = input[12];
  x1[14] = input[1];
  x1[15] = input[14];

  // stage 2
  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);

  iadst16_stage3_avx2(x1);
  iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage5_avx2(x1);
  iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage7_avx2(x1);
  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage9_avx2(output, x1);
}

static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
                                  int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1
  __m256i x1[16];
  x1[1] = input[0];
  x1[3] = input[2];
  x1[5] = input[4];
  x1[7] = input[6];
  x1[8] = input[7];
  x1[10] = input[5];
  x1[12] = input[3];
  x1[14] = input[1];

  // stage 2
  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
  btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
  btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
  btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
  btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
  btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
  btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
  btf_16_w16_0_avx2(cospi[58], cospi[6], x1[14], x1[14], x1[15]);

  iadst16_stage3_avx2(x1);
  iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage5_avx2(x1);
  iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage7_avx2(x1);
  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage9_avx2(output, x1);
}

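// Single-coefficient iadst16 path: only x1[1] = input[0] is live, so each
// rotation stage just fans the current pair of live values out to the lanes
// the next stage needs; everything else is a copy.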
static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
                                  int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);

  // stage 1
  __m256i x1[16];
  x1[1] = input[0];

  // stage 2
  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);

  // stage 3
  x1[8] = x1[0];
  x1[9] = x1[1];

  // stage 4
  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);

  // stage 5
  x1[4] = x1[0];
  x1[5] = x1[1];

  x1[12] = x1[8];
  x1[13] = x1[9];

  // stage 6
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);

  // stage 7
  x1[2] = x1[0];
  x1[3] = x1[1];
  x1[6] = x1[4];
  x1[7] = x1[5];
  x1[10] = x1[8];
  x1[11] = x1[9];
  x1[14] = x1[12];
  x1[15] = x1[13];

  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage9_avx2(output, x1);
}

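// idct32 stage helpers: the "highN" suffix marks that a helper only touches
// the top N of the 32 intermediate values, which lets the low1/low8/low16
// fast paths skip butterflies whose inputs are known to be zero.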
static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
  btf_16_adds_subs_avx2(&x[16], &x[17]);
  btf_16_adds_subs_avx2(&x[19], &x[18]);
  btf_16_adds_subs_avx2(&x[20], &x[21]);
  btf_16_adds_subs_avx2(&x[23], &x[22]);
  btf_16_adds_subs_avx2(&x[24], &x[25]);
  btf_16_adds_subs_avx2(&x[27], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[29]);
  btf_16_adds_subs_avx2(&x[31], &x[30]);
}

static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
}

static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[16], &x[19]);
  btf_16_adds_subs_avx2(&x[17], &x[18]);
  btf_16_adds_subs_avx2(&x[23], &x[20]);
  btf_16_adds_subs_avx2(&x[22], &x[21]);
  btf_16_adds_subs_avx2(&x[24], &x[27]);
  btf_16_adds_subs_avx2(&x[25], &x[26]);
  btf_16_adds_subs_avx2(&x[31], &x[28]);
  btf_16_adds_subs_avx2(&x[30], &x[29]);
}

static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
}

static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[16], &x[23]);
  btf_16_adds_subs_avx2(&x[17], &x[22]);
  btf_16_adds_subs_avx2(&x[18], &x[21]);
  btf_16_adds_subs_avx2(&x[19], &x[20]);
  btf_16_adds_subs_avx2(&x[31], &x[24]);
  btf_16_adds_subs_avx2(&x[30], &x[25]);
  btf_16_adds_subs_avx2(&x[29], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[27]);
}

static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[15]);
  btf_16_adds_subs_avx2(&x[1], &x[14]);
  btf_16_adds_subs_avx2(&x[2], &x[13]);
  btf_16_adds_subs_avx2(&x[3], &x[12]);
  btf_16_adds_subs_avx2(&x[4], &x[11]);
  btf_16_adds_subs_avx2(&x[5], &x[10]);
  btf_16_adds_subs_avx2(&x[6], &x[9]);
  btf_16_adds_subs_avx2(&x[7], &x[8]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
}

static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
  btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
  btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
  btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
  btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
  btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
  btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
  btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
  btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
  btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
  btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
  btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
  btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
  btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
  btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
  btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
  btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
}

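// DC-only idct32: one cospi[32] scaling, then the cascaded add/sub stages
// leave only x[0] and x[1], replicated across the 32 outputs in the mirrored
// pattern below.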
static void idct32_low1_new_avx2(const __m256i *input, __m256i *output,
                                 int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m256i x[2];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 6
  // stage 7
  // stage 8
  // stage 9
  output[0] = x[0];
  output[31] = x[0];
  output[1] = x[1];
  output[30] = x[1];
  output[2] = x[1];
  output[29] = x[1];
  output[3] = x[0];
  output[28] = x[0];
  output[4] = x[0];
  output[27] = x[0];
  output[5] = x[1];
  output[26] = x[1];
  output[6] = x[1];
  output[25] = x[1];
  output[7] = x[0];
  output[24] = x[0];
  output[8] = x[0];
  output[23] = x[0];
  output[9] = x[1];
  output[22] = x[1];
  output[10] = x[1];
  output[21] = x[1];
  output[11] = x[0];
  output[20] = x[0];
  output[12] = x[0];
  output[19] = x[0];
  output[13] = x[1];
  output[18] = x[1];
  output[14] = x[1];
  output[17] = x[1];
  output[15] = x[0];
  output[16] = x[0];
}

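// idct32 with at most the first 8 coefficient rows nonzero: stages 2-5 use
// single-input butterflies and plain copies where an operand is known to be
// zero, then fall into the shared high-stage helpers.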
static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
                                 int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1
  __m256i x[32];
  x[0] = input[0];
  x[4] = input[4];
  x[8] = input[2];
  x[12] = input[6];
  x[16] = input[1];
  x[20] = input[5];
  x[24] = input[3];
  x[28] = input[7];

  // stage 2
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];

  // stage 4
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);

  // stage 6
  x[3] = x[0];
  x[2] = x[1];
  idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);

  idct32_stage7_avx2(x, cospi, _r, cos_bit);
  idct32_stage8_avx2(x, cospi, _r, cos_bit);
  idct32_stage9_avx2(output, x);
}

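// idct32 with at most the first 16 coefficient rows nonzero: twice as many
// stage-1 values are live as in the low8 path, so stages 3-5 need real
// add/sub butterflies where low8 could get away with copies.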
static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1
  __m256i x[32];
  x[0] = input[0];
  x[2] = input[8];
  x[4] = input[4];
  x[6] = input[12];
  x[8] = input[2];
  x[10] = input[10];
  x[12] = input[6];
  x[14] = input[14];
  x[16] = input[1];
  x[18] = input[9];
  x[20] = input[5];
  x[22] = input[13];
  x[24] = input[3];
  x[26] = input[11];
  x[28] = input[7];
  x[30] = input[15];

  // stage 2
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  idct32_high16_stage3_avx2(x);

  // stage 4
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_avx2(&x[8], &x[9]);
  btf_16_adds_subs_avx2(&x[11], &x[10]);
  btf_16_adds_subs_avx2(&x[12], &x[13]);
  btf_16_adds_subs_avx2(&x[15], &x[14]);
  idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_avx2(&x[4], &x[5]);
  btf_16_adds_subs_avx2(&x[7], &x[6]);
  idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);

  // stage 6
  btf_16_adds_subs_avx2(&x[0], &x[3]);
  btf_16_adds_subs_avx2(&x[1], &x[2]);
  idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);

  idct32_stage7_avx2(x, cospi, _r, cos_bit);
  idct32_stage8_avx2(x, cospi, _r, cos_bit);
  idct32_stage9_avx2(output, x);
}

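// Full 32-point inverse DCT: stage 1 is the 5-bit bit-reversed input
// permutation, after which stages 2-9 mirror the idct16 structure one level
// deeper.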
static void idct32_new_avx2(const __m256i *input, __m256i *output,
                            int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);

  // stage 1
  __m256i x1[32];
  x1[0] = input[0];
  x1[1] = input[16];
  x1[2] = input[8];
  x1[3] = input[24];
  x1[4] = input[4];
  x1[5] = input[20];
  x1[6] = input[12];
  x1[7] = input[28];
  x1[8] = input[2];
  x1[9] = input[18];
  x1[10] = input[10];
  x1[11] = input[26];
  x1[12] = input[6];
  x1[13] = input[22];
  x1[14] = input[14];
  x1[15] = input[30];
  x1[16] = input[1];
  x1[17] = input[17];
  x1[18] = input[9];
  x1[19] = input[25];
  x1[20] = input[5];
  x1[21] = input[21];
  x1[22] = input[13];
  x1[23] = input[29];
  x1[24] = input[3];
  x1[25] = input[19];
  x1[26] = input[11];
  x1[27] = input[27];
  x1[28] = input[7];
  x1[29] = input[23];
  x1[30] = input[15];
  x1[31] = input[31];

  // stage 2
  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit);
  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit);

  // stage 3
  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
  idct32_high16_stage3_avx2(x1);

  // stage 4
  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit);

  // stage 6
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit);

  idct32_stage7_avx2(x1, cospi, _r, cos_bit);
  idct32_stage8_avx2(x1, cospi, _r, cos_bit);
  idct32_stage9_avx2(output, x1);
}

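// idct64 stage helpers, shared by the low1/low8/low16 paths below. As with
// idct32, the "highM" suffix marks that only the top M of the 64
// intermediate values are touched.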
static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
}

static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[35]);
  btf_16_adds_subs_avx2(&x[33], &x[34]);
  btf_16_adds_subs_avx2(&x[39], &x[36]);
  btf_16_adds_subs_avx2(&x[38], &x[37]);
  btf_16_adds_subs_avx2(&x[40], &x[43]);
  btf_16_adds_subs_avx2(&x[41], &x[42]);
  btf_16_adds_subs_avx2(&x[47], &x[44]);
  btf_16_adds_subs_avx2(&x[46], &x[45]);
  btf_16_adds_subs_avx2(&x[48], &x[51]);
  btf_16_adds_subs_avx2(&x[49], &x[50]);
  btf_16_adds_subs_avx2(&x[55], &x[52]);
  btf_16_adds_subs_avx2(&x[54], &x[53]);
  btf_16_adds_subs_avx2(&x[56], &x[59]);
  btf_16_adds_subs_avx2(&x[57], &x[58]);
  btf_16_adds_subs_avx2(&x[63], &x[60]);
  btf_16_adds_subs_avx2(&x[62], &x[61]);
}

static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
}

static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  btf_16_adds_subs_avx2(&x[16], &x[19]);
  btf_16_adds_subs_avx2(&x[17], &x[18]);
  btf_16_adds_subs_avx2(&x[23], &x[20]);
  btf_16_adds_subs_avx2(&x[22], &x[21]);
  btf_16_adds_subs_avx2(&x[24], &x[27]);
  btf_16_adds_subs_avx2(&x[25], &x[26]);
  btf_16_adds_subs_avx2(&x[31], &x[28]);
  btf_16_adds_subs_avx2(&x[30], &x[29]);
  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
}

static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[39]);
  btf_16_adds_subs_avx2(&x[33], &x[38]);
  btf_16_adds_subs_avx2(&x[34], &x[37]);
  btf_16_adds_subs_avx2(&x[35], &x[36]);
  btf_16_adds_subs_avx2(&x[47], &x[40]);
  btf_16_adds_subs_avx2(&x[46], &x[41]);
  btf_16_adds_subs_avx2(&x[45], &x[42]);
  btf_16_adds_subs_avx2(&x[44], &x[43]);
  btf_16_adds_subs_avx2(&x[48], &x[55]);
  btf_16_adds_subs_avx2(&x[49], &x[54]);
  btf_16_adds_subs_avx2(&x[50], &x[53]);
  btf_16_adds_subs_avx2(&x[51], &x[52]);
  btf_16_adds_subs_avx2(&x[63], &x[56]);
  btf_16_adds_subs_avx2(&x[62], &x[57]);
  btf_16_adds_subs_avx2(&x[61], &x[58]);
  btf_16_adds_subs_avx2(&x[60], &x[59]);
}

static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_adds_subs_avx2(&x[16], &x[23]);
  btf_16_adds_subs_avx2(&x[17], &x[22]);
  btf_16_adds_subs_avx2(&x[18], &x[21]);
  btf_16_adds_subs_avx2(&x[19], &x[20]);
  btf_16_adds_subs_avx2(&x[31], &x[24]);
  btf_16_adds_subs_avx2(&x[30], &x[25]);
  btf_16_adds_subs_avx2(&x[29], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[27]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
}

static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[15]);
  btf_16_adds_subs_avx2(&x[1], &x[14]);
  btf_16_adds_subs_avx2(&x[2], &x[13]);
  btf_16_adds_subs_avx2(&x[3], &x[12]);
  btf_16_adds_subs_avx2(&x[4], &x[11]);
  btf_16_adds_subs_avx2(&x[5], &x[10]);
  btf_16_adds_subs_avx2(&x[6], &x[9]);
  btf_16_adds_subs_avx2(&x[7], &x[8]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[47]);
  btf_16_adds_subs_avx2(&x[33], &x[46]);
  btf_16_adds_subs_avx2(&x[34], &x[45]);
  btf_16_adds_subs_avx2(&x[35], &x[44]);
  btf_16_adds_subs_avx2(&x[36], &x[43]);
  btf_16_adds_subs_avx2(&x[37], &x[42]);
  btf_16_adds_subs_avx2(&x[38], &x[41]);
  btf_16_adds_subs_avx2(&x[39], &x[40]);
  btf_16_adds_subs_avx2(&x[63], &x[48]);
  btf_16_adds_subs_avx2(&x[62], &x[49]);
  btf_16_adds_subs_avx2(&x[61], &x[50]);
  btf_16_adds_subs_avx2(&x[60], &x[51]);
  btf_16_adds_subs_avx2(&x[59], &x[52]);
  btf_16_adds_subs_avx2(&x[58], &x[53]);
  btf_16_adds_subs_avx2(&x[57], &x[54]);
  btf_16_adds_subs_avx2(&x[56], &x[55]);
}

static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[31]);
  btf_16_adds_subs_avx2(&x[1], &x[30]);
  btf_16_adds_subs_avx2(&x[2], &x[29]);
  btf_16_adds_subs_avx2(&x[3], &x[28]);
  btf_16_adds_subs_avx2(&x[4], &x[27]);
  btf_16_adds_subs_avx2(&x[5], &x[26]);
  btf_16_adds_subs_avx2(&x[6], &x[25]);
  btf_16_adds_subs_avx2(&x[7], &x[24]);
  btf_16_adds_subs_avx2(&x[8], &x[23]);
  btf_16_adds_subs_avx2(&x[9], &x[22]);
  btf_16_adds_subs_avx2(&x[10], &x[21]);
  btf_16_adds_subs_avx2(&x[11], &x[20]);
  btf_16_adds_subs_avx2(&x[12], &x[19]);
  btf_16_adds_subs_avx2(&x[13], &x[18]);
  btf_16_adds_subs_avx2(&x[14], &x[17]);
  btf_16_adds_subs_avx2(&x[15], &x[16]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
}

static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
  btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
  btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
  btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
  btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
  btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
  btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
  btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
  btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
  btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
  btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
  btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
  btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
  btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
  btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
  btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
  btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
  btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
  btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
  btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
  btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
  btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
  btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
  btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
  btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
  btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
  btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
  btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
  btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
  btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
  btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
  btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
  btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
}

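// DC-only idct64: a single cospi[32] scaling followed by the same
// alternating x[0]/x[1] output pattern as the smaller DC-only paths,
// extended to 64 mirrored outputs.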
static void idct64_low1_new_avx2(const __m256i *input, __m256i *output,
                                 int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  __m256i x[32];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 7
  // stage 8
  // stage 9
  // stage 10
  // stage 11
  output[0] = x[0];
  output[63] = x[0];
  output[1] = x[1];
  output[62] = x[1];
  output[2] = x[1];
  output[61] = x[1];
  output[3] = x[0];
  output[60] = x[0];
  output[4] = x[0];
  output[59] = x[0];
  output[5] = x[1];
  output[58] = x[1];
  output[6] = x[1];
  output[57] = x[1];
  output[7] = x[0];
  output[56] = x[0];
  output[8] = x[0];
  output[55] = x[0];
  output[9] = x[1];
  output[54] = x[1];
  output[10] = x[1];
  output[53] = x[1];
  output[11] = x[0];
  output[52] = x[0];
  output[12] = x[0];
  output[51] = x[0];
  output[13] = x[1];
  output[50] = x[1];
  output[14] = x[1];
  output[49] = x[1];
  output[15] = x[0];
  output[48] = x[0];
  output[16] = x[0];
  output[47] = x[0];
  output[17] = x[1];
  output[46] = x[1];
  output[18] = x[1];
  output[45] = x[1];
  output[19] = x[0];
  output[44] = x[0];
  output[20] = x[0];
  output[43] = x[0];
  output[21] = x[1];
  output[42] = x[1];
  output[22] = x[1];
  output[41] = x[1];
  output[23] = x[0];
  output[40] = x[0];
  output[24] = x[0];
  output[39] = x[0];
  output[25] = x[1];
  output[38] = x[1];
  output[26] = x[1];
  output[37] = x[1];
  output[27] = x[0];
  output[36] = x[0];
  output[28] = x[0];
  output[35] = x[0];
  output[29] = x[1];
  output[34] = x[1];
  output[30] = x[1];
  output[33] = x[1];
  output[31] = x[0];
  output[32] = x[0];
}

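// idct64 with at most the first 8 coefficient rows nonzero: stages 2-7 mix
// single-input butterflies with copy propagation of the few live values
// before handing off to the shared high-stage helpers.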
static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
                                 int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m256i x[64];
  x[0] = input[0];
  x[8] = input[4];
  x[16] = input[2];
  x[24] = input[6];
  x[32] = input[1];
  x[40] = input[5];
  x[48] = input[3];
  x[56] = input[7];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[38] = x[39];
  x[41] = x[40];
  x[46] = x[47];
  x[49] = x[48];
  x[54] = x[55];
  x[57] = x[56];
  x[62] = x[63];

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  x[17] = x[16];
  x[22] = x[23];
  x[25] = x[24];
  x[30] = x[31];
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);

  // stage 5
  x[9] = x[8];
  x[14] = x[15];
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
  x[35] = x[32];
  x[34] = x[33];
  x[36] = x[39];
  x[37] = x[38];
  x[43] = x[40];
  x[42] = x[41];
  x[44] = x[47];
  x[45] = x[46];
  x[51] = x[48];
  x[50] = x[49];
  x[52] = x[55];
  x[53] = x[54];
  x[59] = x[56];
  x[58] = x[57];
  x[60] = x[63];
  x[61] = x[62];

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  x[19] = x[16];
  x[18] = x[17];
  x[20] = x[23];
  x[21] = x[22];
  x[27] = x[24];
  x[26] = x[25];
  x[28] = x[31];
  x[29] = x[30];
  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  x[11] = x[8];
  x[10] = x[9];
  x[12] = x[15];
  x[13] = x[14];
  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);

  // stage 8
  x[7] = x[0];
  x[6] = x[1];
  x[5] = x[2];
  x[4] = x[3];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);

  idct64_stage9_avx2(x, cospi, _r, cos_bit);
  idct64_stage10_avx2(x, cospi, _r, cos_bit);
  idct64_stage11_avx2(output, x);
}

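// idct64_low16: same idea as idct64_low8, but for inputs where the first 16
// coefficients along this dimension can be nonzero, so one more level of
// butterflies is computed before the shared stage helpers take over.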
static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m256i x[64];
  x[0] = input[0];
  x[4] = input[8];
  x[8] = input[4];
  x[12] = input[12];
  x[16] = input[2];
  x[20] = input[10];
  x[24] = input[6];
  x[28] = input[14];
  x[32] = input[1];
  x[36] = input[9];
  x[40] = input[5];
  x[44] = input[13];
  x[48] = input[3];
  x[52] = input[11];
  x[56] = input[7];
  x[60] = input[15];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[34] = x[35];
  x[37] = x[36];
  x[38] = x[39];
  x[41] = x[40];
  x[42] = x[43];
  x[45] = x[44];
  x[46] = x[47];
  x[49] = x[48];
  x[50] = x[51];
  x[53] = x[52];
  x[54] = x[55];
  x[57] = x[56];
  x[58] = x[59];
  x[61] = x[60];
  x[62] = x[63];

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];
  idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
  idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);

  // stage 8
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);

  idct64_stage9_avx2(x, cospi, _r, cos_bit);
  idct64_stage10_avx2(x, cospi, _r, cos_bit);
  idct64_stage11_avx2(output, x);
}

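// idct64_low32: variant for inputs with up to 32 nonzero coefficients along
// this dimension, i.e. the densest case the 64-point path ever sees (the
// codec zeroes all coefficients beyond the top-left 32x32 of a 64-point
// transform).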
static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1
  __m256i x[64];
  x[0] = input[0];
  x[2] = input[16];
  x[4] = input[8];
  x[6] = input[24];
  x[8] = input[4];
  x[10] = input[20];
  x[12] = input[12];
  x[14] = input[28];
  x[16] = input[2];
  x[18] = input[18];
  x[20] = input[10];
  x[22] = input[26];
  x[24] = input[6];
  x[26] = input[22];
  x[28] = input[14];
  x[30] = input[30];
  x[32] = input[1];
  x[34] = input[17];
  x[36] = input[9];
  x[38] = input[25];
  x[40] = input[5];
  x[42] = input[21];
  x[44] = input[13];
  x[46] = input[29];
  x[48] = input[3];
  x[50] = input[19];
  x[52] = input[11];
  x[54] = input[27];
  x[56] = input[7];
  x[58] = input[23];
  x[60] = input[15];
  x[62] = input[31];

  // stage 2
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
  btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
  btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
  btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
  btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  btf_16_adds_subs_avx2(&x[32], &x[33]);
  btf_16_adds_subs_avx2(&x[35], &x[34]);
  btf_16_adds_subs_avx2(&x[36], &x[37]);
  btf_16_adds_subs_avx2(&x[39], &x[38]);
  btf_16_adds_subs_avx2(&x[40], &x[41]);
  btf_16_adds_subs_avx2(&x[43], &x[42]);
  btf_16_adds_subs_avx2(&x[44], &x[45]);
  btf_16_adds_subs_avx2(&x[47], &x[46]);
  btf_16_adds_subs_avx2(&x[48], &x[49]);
  btf_16_adds_subs_avx2(&x[51], &x[50]);
  btf_16_adds_subs_avx2(&x[52], &x[53]);
  btf_16_adds_subs_avx2(&x[55], &x[54]);
  btf_16_adds_subs_avx2(&x[56], &x[57]);
  btf_16_adds_subs_avx2(&x[59], &x[58]);
  btf_16_adds_subs_avx2(&x[60], &x[61]);
  btf_16_adds_subs_avx2(&x[63], &x[62]);

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  btf_16_adds_subs_avx2(&x[16], &x[17]);
  btf_16_adds_subs_avx2(&x[19], &x[18]);
  btf_16_adds_subs_avx2(&x[20], &x[21]);
  btf_16_adds_subs_avx2(&x[23], &x[22]);
  btf_16_adds_subs_avx2(&x[24], &x[25]);
  btf_16_adds_subs_avx2(&x[27], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[29]);
  btf_16_adds_subs_avx2(&x[31], &x[30]);
  idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_avx2(&x[8], &x[9]);
  btf_16_adds_subs_avx2(&x[11], &x[10]);
  btf_16_adds_subs_avx2(&x[12], &x[13]);
  btf_16_adds_subs_avx2(&x[15], &x[14]);
  idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_avx2(&x[4], &x[5]);
  btf_16_adds_subs_avx2(&x[7], &x[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
  idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);

  // stage 7
  btf_16_adds_subs_avx2(&x[0], &x[3]);
  btf_16_adds_subs_avx2(&x[1], &x[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);

  // stage 8
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);

  // stage 9~11
  idct64_stage9_avx2(x, cospi, _r, cos_bit);
  idct64_stage10_avx2(x, cospi, _r, cos_bit);
  idct64_stage11_avx2(output, x);
}

typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
                                  int8_t cos_bit);

// 1D functions process 16 pixels at one time.
static const transform_1d_avx2
    lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      {
          { idct16_low1_new_avx2, idct16_low8_new_avx2, idct16_new_avx2,
            NULL },
          { iadst16_low1_new_avx2, iadst16_low8_new_avx2, iadst16_new_avx2,
            NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_new_avx2, idct32_low8_new_avx2, idct32_low16_new_avx2,
          idct32_new_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64_low1_new_avx2, idct64_low8_new_avx2, idct64_low16_new_avx2,
          idct64_low32_new_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
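
// The last index is derived from the eob via lowbd_txfm_all_1d_zeros_idx[]
// and selects the cheapest variant (low1/low8/low16/full) that still covers
// every possibly-nonzero coefficient in that dimension.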

// Only processes transforms with w >= 16 and h >= 16; smaller sizes take the
// SSSE3 path (see av1_lowbd_inv_txfm2d_add_avx2 below).
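// Row/column pipeline: each 16-row band of nonzero coefficients is loaded
// from 32-bit to 16-bit lanes, transposed in 16x16 tiles, run through the
// row transform and round-shifted; then each 16-wide strip goes through the
// column transform and is added to the reconstruction buffer.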
static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m256i buf1[64 * 16];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div16 = txfm_size_col >> 4;
  const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4;
  const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_avx2 row_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_avx2 col_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
    __m256i buf0[64];
    const int32_t *input_row = input + (i << 4) * input_stride;
    for (int j = 0; j < buf_size_nonzero_w_div16; ++j) {
      __m256i *buf0_cur = buf0 + j * 16;
      const int32_t *input_cur = input_row + j * 16;
      load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur,
                                          16);
      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_avx2(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0, cos_bit_row);
    round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);

    __m256i *buf1_cur = buf1 + (i << 4);
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        __m256i temp[16];
        flip_buf_avx2(buf0 + 16 * j, temp, 16);
        int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
        transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
      }
    } else {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
      }
    }
  }
  for (int i = 0; i < buf_size_w_div16; i++) {
    __m256i *buf1_cur = buf1 + i * txfm_size_row;
    col_txfm(buf1_cur, buf1_cur, cos_bit_col);
    round_shift_16bit_w16_avx2(buf1_cur, txfm_size_row, shift[1]);
  }
  for (int i = 0; i < buf_size_w_div16; i++) {
    lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
                                 stride, ud_flip, txfm_size_row);
  }
}

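// Identity "transform" for rows: every coefficient is multiplied by
// NewSqrt2list[txw_idx] (a power of sqrt(2) in Q-NewSqrt2Bits format) and
// round-shifted by the stage shift. Interleaving (src, 1) against
// (scale, _r) lets a single _mm256_madd_epi16 produce src * scale + _r,
// i.e. the scale and rounding bias in one multiply:
//   out = (src * scale + _r) >> (NewSqrt2Bits - shift)
// For 1:2 / 2:1 rectangular sizes the input is additionally pre-scaled by
// 1/sqrt(2) (NewInvSqrt2) using _mm256_mulhrs_epi16.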
static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
                                           int stride, int shift, int height,
                                           int txw_idx, int rect_type) {
  const int32_t *input_row = input;
  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
  const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
                                       (1 << (NewSqrt2Bits - shift - 1)));
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
  if (rect_type != 1 && rect_type != -1) {
    for (int i = 0; i < height; ++i) {
      const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
      input_row += stride;
      __m256i lo = _mm256_unpacklo_epi16(src, one);
      __m256i hi = _mm256_unpackhi_epi16(src, one);
      lo = _mm256_madd_epi16(lo, scale__r);
      hi = _mm256_madd_epi16(hi, scale__r);
      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm256_packs_epi32(lo, hi);
    }
  } else {
    const __m256i rect_scale =
        _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
    for (int i = 0; i < height; ++i) {
      __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
      src = _mm256_mulhrs_epi16(src, rect_scale);
      input_row += stride;
      __m256i lo = _mm256_unpacklo_epi16(src, one);
      __m256i hi = _mm256_unpackhi_epi16(src, one);
      lo = _mm256_madd_epi16(lo, scale__r);
      hi = _mm256_madd_epi16(hi, scale__r);
      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm256_packs_epi32(lo, hi);
    }
  }
}

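// Identity "transform" for columns: scales by NewSqrt2list[txh_idx], applies
// the rounded (negative) shift, saturates back to 16 bits with packs, and
// accumulates the residual into the 8-bit destination rows via
// write_recon_w16_avx2.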
static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
                                           __m256i *buf, int shift, int height,
                                           int txh_idx) {
  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
  const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
  const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
  for (int h = 0; h < height; ++h) {
    __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
    __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
    lo = _mm256_madd_epi16(lo, scale_coeff);
    hi = _mm256_madd_epi16(hi, scale_coeff);
    lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
    hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
    lo = _mm256_add_epi32(lo, shift__r);
    hi = _mm256_add_epi32(hi, shift__r);
    lo = _mm256_srai_epi32(lo, -shift);
    hi = _mm256_srai_epi32(hi, -shift);
    const __m256i x = _mm256_packs_epi32(lo, hi);
    write_recon_w16_avx2(x, output);
    output += stride;
  }
}

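// IDTX (identity in both directions): only the top-left 32x32 region of the
// dequantized input can hold nonzero coefficients, so at most 32 rows and 32
// columns are scaled and written, 16 columns per loop iteration.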
static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
                                                  uint8_t *output, int stride,
                                                  TX_SIZE tx_size,
                                                  int32_t eob) {
  (void)eob;
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int row_max = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  __m256i buf[32];
  for (int i = 0; i < input_stride; i += 16) {
    iidentity_row_16xn_avx2(buf, input + i, input_stride, shift[0], row_max,
                            txw_idx, rect_type);
    iidentity_col_16xn_avx2(output + i, stride, buf, shift[1], row_max,
                            txh_idx);
  }
}

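// Used for V_DCT / V_ADST / V_FLIPADST: the horizontal pass is an identity
// scale, so no row transform or transpose is needed; each 16-wide strip gets
// the identity row scaling, the real 1-D column transform, and a fused
// round-shift via _mm256_mulhrs_epi16 on the way out.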
static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  int eobx, eoby;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col);
  const int input_stride = txfm_size_col_notzero;
  const int buf_size_w_div16 = (eobx + 16) >> 4;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_avx2 col_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_w_div16; i++) {
    __m256i buf0[64];
    iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0],
                            eoby + 1, txw_idx, rect_type);
    col_txfm(buf0, buf0, cos_bit_col);
    __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
    int k = ud_flip ? (txfm_size_row - 1) : 0;
    const int step = ud_flip ? -1 : 1;
    for (int j = 0; j < txfm_size_row; ++j, k += step) {
      __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
      write_recon_w16_avx2(res, output + (i << 4) + j * stride);
    }
  }
}

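// Used for H_DCT / H_ADST / H_FLIPADST: the vertical pass is an identity
// scale, so rows go through the real 1-D transform (with transposes and an
// optional left-right flip) and columns through iidentity_col_16xn_avx2.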
static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m256i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div16 = txfm_size_col >> 4;
  const int buf_size_h_div16 = (eoby + 16) >> 4;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_avx2 row_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];

  assert(row_txfm != NULL);

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_h_div16; i++) {
    __m256i buf0[64];
    const int32_t *input_row = input + i * input_stride * 16;
    for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) {
      __m256i *buf0_cur = buf0 + j * 16;
      load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride,
                                          buf0_cur, 16);
      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_avx2(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0, cos_bit_row);
    round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
    __m256i *_buf1 = buf1;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        __m256i temp[16];
        flip_buf_avx2(buf0 + 16 * j, temp, 16);
        transpose_16bit_16x16_avx2(temp,
                                   _buf1 + 16 * (buf_size_w_div16 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
      }
    }
    for (int j = 0; j < buf_size_w_div16; ++j) {
      iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
                              buf1 + j * 16, shift[1], 16, txh_idx);
    }
  }
}

// Handles 16x16, 16x32, 32x16, 32x32, 16x64, 64x16, 32x64, 64x32 and 64x64.
static INLINE void lowbd_inv_txfm2d_add_universe_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:   // ADST in vertical, DCT in horizontal
    case DCT_ADST:   // DCT in vertical, ADST in horizontal
    case ADST_ADST:  // ADST in both directions
    case FLIPADST_DCT:
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
                                            tx_size, eob);
      break;
    case IDTX:
      lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
      break;
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
                                           tx_size, eob);
      break;
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
      lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
                                           tx_size, eob);
      break;
    default:
      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
                                     eob);
      break;
  }
}

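// Size dispatch: the AVX2 kernels above require both dimensions to be at
// least 16, so every transform with a 4- or 8-point side is forwarded to the
// SSSE3 implementation.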
void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
                                   int eob) {
  switch (tx_size) {
    case TX_4X4:
    case TX_8X8:
    case TX_4X8:
    case TX_8X4:
    case TX_8X16:
    case TX_16X8:
    case TX_4X16:
    case TX_16X4:
    case TX_8X32:
    case TX_32X8:
      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
                                     eob);
      break;
    case TX_16X16:
    case TX_32X32:
    case TX_64X64:
    case TX_16X32:
    case TX_32X16:
    case TX_32X64:
    case TX_64X32:
    case TX_16X64:
    case TX_64X16:
    default:
      lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
                                         tx_size, eob);
      break;
  }
}

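// Module entry point: lossless blocks are reconstructed with the C fallback,
// everything else with the low-bitdepth AVX2 path above.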
void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                           const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
  if (!txfm_param->lossless) {
    av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
                                  txfm_param->tx_size, txfm_param->eob);
  } else {
    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
  }
}
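
/* Illustrative call site (a sketch, not code from this library; the local
 * variable names are assumptions):
 *
 *   TxfmParam param;
 *   param.tx_type = DCT_DCT;
 *   param.tx_size = TX_16X16;
 *   param.eob = eob;  // index one past the last nonzero coefficient
 *   param.lossless = 0;
 *   av1_inv_txfm_add_avx2(dqcoeff, dst, dst_stride, &param);
 */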