/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/av1_rtcd.h"

#include "av1/common/enums.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/x86/av1_fwd_txfm_avx2.h"
#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "av1/encoder/x86/av1_txfm1d_sse4.h"
#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
#include "aom_dsp/x86/txfm_common_avx2.h"
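// Forward 16-point DCT on 16-bit lanes. Each __m256i carries sixteen int16
// coefficients, so the 16 input registers form a 16x16 block and all 16
// columns are transformed at once; the final stage writes the results in
// bit-reversed index order.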
static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output,
                                      int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));

  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);

  // stage 1
  __m256i x1[16];
  btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]);
  btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]);
  btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]);
  btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]);
  btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]);
  btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]);
  btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]);
  btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]);

  // stage 2
  btf_16_adds_subs_avx2(&x1[0], &x1[7]);
  btf_16_adds_subs_avx2(&x1[1], &x1[6]);
  btf_16_adds_subs_avx2(&x1[2], &x1[5]);
  btf_16_adds_subs_avx2(&x1[3], &x1[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);

  // stage 3
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
  btf_16_adds_subs_avx2(&x1[14], &x1[13]);

  // stage 4
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);

  // stage 5
  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);

  // stage 6
  btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
  btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);

  // stage 7
  output[0] = x1[0];
  output[1] = x1[8];
  output[2] = x1[4];
  output[3] = x1[12];
  output[4] = x1[2];
  output[5] = x1[10];
  output[6] = x1[6];
  output[7] = x1[14];
  output[8] = x1[1];
  output[9] = x1[9];
  output[10] = x1[5];
  output[11] = x1[13];
  output[12] = x1[3];
  output[13] = x1[11];
  output[14] = x1[7];
  output[15] = x1[15];
}

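// Forward 32-point DCT on 16-bit lanes (32 input registers of sixteen int16
// coefficients each); outputs are written in bit-reversed index order.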
static INLINE void fdct16x32_new_avx2(const __m256i *input, __m256i *output,
                                      int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));

  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
  __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
  __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
  __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
  __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
  __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
  __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
  __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
  __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
  __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
  __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
  __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
  __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
  __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
  __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
  __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);

  // stage 1
  __m256i x1[32];
  btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]);
  btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]);
  btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]);
  btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]);
  btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]);
  btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]);
  btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]);
  btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]);
  btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]);
  btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]);
  btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]);
  btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]);
  btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]);
  btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]);
  btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]);
  btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]);

  // stage 2
  btf_16_adds_subs_avx2(&x1[0], &x1[15]);
  btf_16_adds_subs_avx2(&x1[1], &x1[14]);
  btf_16_adds_subs_avx2(&x1[2], &x1[13]);
  btf_16_adds_subs_avx2(&x1[3], &x1[12]);
  btf_16_adds_subs_avx2(&x1[4], &x1[11]);
  btf_16_adds_subs_avx2(&x1[5], &x1[10]);
  btf_16_adds_subs_avx2(&x1[6], &x1[9]);
  btf_16_adds_subs_avx2(&x1[7], &x1[8]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);

  // stage 3
  btf_16_adds_subs_avx2(&x1[0], &x1[7]);
  btf_16_adds_subs_avx2(&x1[1], &x1[6]);
  btf_16_adds_subs_avx2(&x1[2], &x1[5]);
  btf_16_adds_subs_avx2(&x1[3], &x1[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[16], &x1[23]);
  btf_16_adds_subs_avx2(&x1[17], &x1[22]);
  btf_16_adds_subs_avx2(&x1[18], &x1[21]);
  btf_16_adds_subs_avx2(&x1[19], &x1[20]);
  btf_16_adds_subs_avx2(&x1[31], &x1[24]);
  btf_16_adds_subs_avx2(&x1[30], &x1[25]);
  btf_16_adds_subs_avx2(&x1[29], &x1[26]);
  btf_16_adds_subs_avx2(&x1[28], &x1[27]);

  // stage 4
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);

  // stage 5
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[16], &x1[19]);
  btf_16_adds_subs_avx2(&x1[17], &x1[18]);
  btf_16_adds_subs_avx2(&x1[23], &x1[20]);
  btf_16_adds_subs_avx2(&x1[22], &x1[21]);
  btf_16_adds_subs_avx2(&x1[24], &x1[27]);
  btf_16_adds_subs_avx2(&x1[25], &x1[26]);
  btf_16_adds_subs_avx2(&x1[31], &x1[28]);
  btf_16_adds_subs_avx2(&x1[30], &x1[29]);

  // stage 6
  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);

  // stage 7
  btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
  btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[16], &x1[17]);
  btf_16_adds_subs_avx2(&x1[19], &x1[18]);
  btf_16_adds_subs_avx2(&x1[20], &x1[21]);
  btf_16_adds_subs_avx2(&x1[23], &x1[22]);
  btf_16_adds_subs_avx2(&x1[24], &x1[25]);
  btf_16_adds_subs_avx2(&x1[27], &x1[26]);
  btf_16_adds_subs_avx2(&x1[28], &x1[29]);
  btf_16_adds_subs_avx2(&x1[31], &x1[30]);

  // stage 8
  btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
  btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);

  // stage 9
  output[0] = x1[0];
  output[1] = x1[16];
  output[2] = x1[8];
  output[3] = x1[24];
  output[4] = x1[4];
  output[5] = x1[20];
  output[6] = x1[12];
  output[7] = x1[28];
  output[8] = x1[2];
  output[9] = x1[18];
  output[10] = x1[10];
  output[11] = x1[26];
  output[12] = x1[6];
  output[13] = x1[22];
  output[14] = x1[14];
  output[15] = x1[30];
  output[16] = x1[1];
  output[17] = x1[17];
  output[18] = x1[9];
  output[19] = x1[25];
  output[20] = x1[5];
  output[21] = x1[21];
  output[22] = x1[13];
  output[23] = x1[29];
  output[24] = x1[3];
  output[25] = x1[19];
  output[26] = x1[11];
  output[27] = x1[27];
  output[28] = x1[7];
  output[29] = x1[23];
  output[30] = x1[15];
  output[31] = x1[31];
}

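// Forward 64-point DCT on 16-bit lanes (64 input registers); outputs are
// written in bit-reversed index order.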
static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output,
                                      int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));

  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
  __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
  __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
  __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
  __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
  __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
  __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
  __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
  __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
  __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
  __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
  __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
  __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
  __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
  __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
  __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
  __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
  __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
  __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]);
  __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]);
  __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]);
  __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]);
  __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]);
  __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]);
  __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]);
  __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]);
  __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]);
  __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]);
  __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]);
  __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]);
  __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]);
  __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]);
  __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]);
  __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]);
  __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]);
  __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]);
  __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]);
  __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]);
  __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]);
  __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]);
  __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]);
  __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]);
  __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]);
  __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]);
  __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]);
  __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]);
  __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]);
  __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]);
  __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]);
  __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]);

  // stage 1
  __m256i x1[64];
  btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]);
  btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]);
  btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]);
  btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]);
  btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]);
  btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]);
  btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]);
  btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]);
  btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]);
  btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]);
  btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]);
  btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]);
  btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]);
  btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]);
  btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]);
  btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]);
  btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]);
  btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]);
  btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]);
  btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]);
  btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]);
  btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]);
  btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]);
  btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]);
  btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]);
  btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]);
  btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]);
  btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]);
  btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]);
  btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]);
  btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]);
  btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]);

  // stage 2
  btf_16_adds_subs_avx2(&x1[0], &x1[31]);
  btf_16_adds_subs_avx2(&x1[1], &x1[30]);
  btf_16_adds_subs_avx2(&x1[2], &x1[29]);
  btf_16_adds_subs_avx2(&x1[3], &x1[28]);
  btf_16_adds_subs_avx2(&x1[4], &x1[27]);
  btf_16_adds_subs_avx2(&x1[5], &x1[26]);
  btf_16_adds_subs_avx2(&x1[6], &x1[25]);
  btf_16_adds_subs_avx2(&x1[7], &x1[24]);
  btf_16_adds_subs_avx2(&x1[8], &x1[23]);
  btf_16_adds_subs_avx2(&x1[9], &x1[22]);
  btf_16_adds_subs_avx2(&x1[10], &x1[21]);
  btf_16_adds_subs_avx2(&x1[11], &x1[20]);
  btf_16_adds_subs_avx2(&x1[12], &x1[19]);
  btf_16_adds_subs_avx2(&x1[13], &x1[18]);
  btf_16_adds_subs_avx2(&x1[14], &x1[17]);
  btf_16_adds_subs_avx2(&x1[15], &x1[16]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit);

  // stage 3
  btf_16_adds_subs_avx2(&x1[0], &x1[15]);
  btf_16_adds_subs_avx2(&x1[1], &x1[14]);
  btf_16_adds_subs_avx2(&x1[2], &x1[13]);
  btf_16_adds_subs_avx2(&x1[3], &x1[12]);
  btf_16_adds_subs_avx2(&x1[4], &x1[11]);
  btf_16_adds_subs_avx2(&x1[5], &x1[10]);
  btf_16_adds_subs_avx2(&x1[6], &x1[9]);
  btf_16_adds_subs_avx2(&x1[7], &x1[8]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[32], &x1[47]);
  btf_16_adds_subs_avx2(&x1[33], &x1[46]);
  btf_16_adds_subs_avx2(&x1[34], &x1[45]);
  btf_16_adds_subs_avx2(&x1[35], &x1[44]);
  btf_16_adds_subs_avx2(&x1[36], &x1[43]);
  btf_16_adds_subs_avx2(&x1[37], &x1[42]);
  btf_16_adds_subs_avx2(&x1[38], &x1[41]);
  btf_16_adds_subs_avx2(&x1[39], &x1[40]);
  btf_16_adds_subs_avx2(&x1[63], &x1[48]);
  btf_16_adds_subs_avx2(&x1[62], &x1[49]);
  btf_16_adds_subs_avx2(&x1[61], &x1[50]);
  btf_16_adds_subs_avx2(&x1[60], &x1[51]);
  btf_16_adds_subs_avx2(&x1[59], &x1[52]);
  btf_16_adds_subs_avx2(&x1[58], &x1[53]);
  btf_16_adds_subs_avx2(&x1[57], &x1[54]);
  btf_16_adds_subs_avx2(&x1[56], &x1[55]);

  // stage 4
  btf_16_adds_subs_avx2(&x1[0], &x1[7]);
  btf_16_adds_subs_avx2(&x1[1], &x1[6]);
  btf_16_adds_subs_avx2(&x1[2], &x1[5]);
  btf_16_adds_subs_avx2(&x1[3], &x1[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[16], &x1[23]);
  btf_16_adds_subs_avx2(&x1[17], &x1[22]);
  btf_16_adds_subs_avx2(&x1[18], &x1[21]);
  btf_16_adds_subs_avx2(&x1[19], &x1[20]);
  btf_16_adds_subs_avx2(&x1[31], &x1[24]);
  btf_16_adds_subs_avx2(&x1[30], &x1[25]);
  btf_16_adds_subs_avx2(&x1[29], &x1[26]);
  btf_16_adds_subs_avx2(&x1[28], &x1[27]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit);

  // stage 5
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[32], &x1[39]);
  btf_16_adds_subs_avx2(&x1[33], &x1[38]);
  btf_16_adds_subs_avx2(&x1[34], &x1[37]);
  btf_16_adds_subs_avx2(&x1[35], &x1[36]);
  btf_16_adds_subs_avx2(&x1[47], &x1[40]);
  btf_16_adds_subs_avx2(&x1[46], &x1[41]);
  btf_16_adds_subs_avx2(&x1[45], &x1[42]);
  btf_16_adds_subs_avx2(&x1[44], &x1[43]);
  btf_16_adds_subs_avx2(&x1[48], &x1[55]);
  btf_16_adds_subs_avx2(&x1[49], &x1[54]);
  btf_16_adds_subs_avx2(&x1[50], &x1[53]);
  btf_16_adds_subs_avx2(&x1[51], &x1[52]);
  btf_16_adds_subs_avx2(&x1[63], &x1[56]);
  btf_16_adds_subs_avx2(&x1[62], &x1[57]);
  btf_16_adds_subs_avx2(&x1[61], &x1[58]);
  btf_16_adds_subs_avx2(&x1[60], &x1[59]);

  // stage 6
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[16], &x1[19]);
  btf_16_adds_subs_avx2(&x1[17], &x1[18]);
  btf_16_adds_subs_avx2(&x1[23], &x1[20]);
  btf_16_adds_subs_avx2(&x1[22], &x1[21]);
  btf_16_adds_subs_avx2(&x1[24], &x1[27]);
  btf_16_adds_subs_avx2(&x1[25], &x1[26]);
  btf_16_adds_subs_avx2(&x1[31], &x1[28]);
  btf_16_adds_subs_avx2(&x1[30], &x1[29]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit);

  // stage 7
  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[32], &x1[35]);
  btf_16_adds_subs_avx2(&x1[33], &x1[34]);
  btf_16_adds_subs_avx2(&x1[39], &x1[36]);
  btf_16_adds_subs_avx2(&x1[38], &x1[37]);
  btf_16_adds_subs_avx2(&x1[40], &x1[43]);
  btf_16_adds_subs_avx2(&x1[41], &x1[42]);
  btf_16_adds_subs_avx2(&x1[47], &x1[44]);
  btf_16_adds_subs_avx2(&x1[46], &x1[45]);
  btf_16_adds_subs_avx2(&x1[48], &x1[51]);
  btf_16_adds_subs_avx2(&x1[49], &x1[50]);
  btf_16_adds_subs_avx2(&x1[55], &x1[52]);
  btf_16_adds_subs_avx2(&x1[54], &x1[53]);
  btf_16_adds_subs_avx2(&x1[56], &x1[59]);
  btf_16_adds_subs_avx2(&x1[57], &x1[58]);
  btf_16_adds_subs_avx2(&x1[63], &x1[60]);
  btf_16_adds_subs_avx2(&x1[62], &x1[61]);

  // stage 8
  btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
  btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[16], &x1[17]);
  btf_16_adds_subs_avx2(&x1[19], &x1[18]);
  btf_16_adds_subs_avx2(&x1[20], &x1[21]);
  btf_16_adds_subs_avx2(&x1[23], &x1[22]);
  btf_16_adds_subs_avx2(&x1[24], &x1[25]);
  btf_16_adds_subs_avx2(&x1[27], &x1[26]);
  btf_16_adds_subs_avx2(&x1[28], &x1[29]);
  btf_16_adds_subs_avx2(&x1[31], &x1[30]);
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit);
  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit);

  // stage 9
  btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
  btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[32], &x1[33]);
  btf_16_adds_subs_avx2(&x1[35], &x1[34]);
  btf_16_adds_subs_avx2(&x1[36], &x1[37]);
  btf_16_adds_subs_avx2(&x1[39], &x1[38]);
  btf_16_adds_subs_avx2(&x1[40], &x1[41]);
  btf_16_adds_subs_avx2(&x1[43], &x1[42]);
  btf_16_adds_subs_avx2(&x1[44], &x1[45]);
  btf_16_adds_subs_avx2(&x1[47], &x1[46]);
  btf_16_adds_subs_avx2(&x1[48], &x1[49]);
  btf_16_adds_subs_avx2(&x1[51], &x1[50]);
  btf_16_adds_subs_avx2(&x1[52], &x1[53]);
  btf_16_adds_subs_avx2(&x1[55], &x1[54]);
  btf_16_adds_subs_avx2(&x1[56], &x1[57]);
  btf_16_adds_subs_avx2(&x1[59], &x1[58]);
  btf_16_adds_subs_avx2(&x1[60], &x1[61]);
  btf_16_adds_subs_avx2(&x1[63], &x1[62]);

  // stage 10
  btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit);
  btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit);
  btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit);
  btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit);
  btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit);
  btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit);

  // stage 11
  output[0] = x1[0];
  output[1] = x1[32];
  output[2] = x1[16];
  output[3] = x1[48];
  output[4] = x1[8];
  output[5] = x1[40];
  output[6] = x1[24];
  output[7] = x1[56];
  output[8] = x1[4];
  output[9] = x1[36];
  output[10] = x1[20];
  output[11] = x1[52];
  output[12] = x1[12];
  output[13] = x1[44];
  output[14] = x1[28];
  output[15] = x1[60];
  output[16] = x1[2];
  output[17] = x1[34];
  output[18] = x1[18];
  output[19] = x1[50];
  output[20] = x1[10];
  output[21] = x1[42];
  output[22] = x1[26];
  output[23] = x1[58];
  output[24] = x1[6];
  output[25] = x1[38];
  output[26] = x1[22];
  output[27] = x1[54];
  output[28] = x1[14];
  output[29] = x1[46];
  output[30] = x1[30];
  output[31] = x1[62];
  output[32] = x1[1];
  output[33] = x1[33];
  output[34] = x1[17];
  output[35] = x1[49];
  output[36] = x1[9];
  output[37] = x1[41];
  output[38] = x1[25];
  output[39] = x1[57];
  output[40] = x1[5];
  output[41] = x1[37];
  output[42] = x1[21];
  output[43] = x1[53];
  output[44] = x1[13];
  output[45] = x1[45];
  output[46] = x1[29];
  output[47] = x1[61];
  output[48] = x1[3];
  output[49] = x1[35];
  output[50] = x1[19];
  output[51] = x1[51];
  output[52] = x1[11];
  output[53] = x1[43];
  output[54] = x1[27];
  output[55] = x1[59];
  output[56] = x1[7];
  output[57] = x1[39];
  output[58] = x1[23];
  output[59] = x1[55];
  output[60] = x1[15];
  output[61] = x1[47];
  output[62] = x1[31];
  output[63] = x1[63];
}

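// Forward 32-point DCT on 32-bit lanes. Each __m256i holds eight int32
// coefficients, so eight columns are transformed per call; outputs are
// written in bit-reversed index order.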
static INLINE void av1_fdct32_new_avx2(const __m256i *input, __m256i *output,
                                       int8_t cos_bit) {
  __m256i x1[32];
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
  // stage 0
  // stage 1
  btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]);
  btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]);
  btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]);
  btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]);
  btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]);
  btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]);
  btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]);
  btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]);
  btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]);
  btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]);
  btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]);
  btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]);
  btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]);
  btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]);
  btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]);
  btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]);

  // stage 2
  btf_32_add_sub_avx2(&x1[0], &x1[15]);
  btf_32_add_sub_avx2(&x1[1], &x1[14]);
  btf_32_add_sub_avx2(&x1[2], &x1[13]);
  btf_32_add_sub_avx2(&x1[3], &x1[12]);
  btf_32_add_sub_avx2(&x1[4], &x1[11]);
  btf_32_add_sub_avx2(&x1[5], &x1[10]);
  btf_32_add_sub_avx2(&x1[6], &x1[9]);
  btf_32_add_sub_avx2(&x1[7], &x1[8]);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit);

  // stage 3
  btf_32_add_sub_avx2(&x1[0], &x1[7]);
  btf_32_add_sub_avx2(&x1[1], &x1[6]);
  btf_32_add_sub_avx2(&x1[2], &x1[5]);
  btf_32_add_sub_avx2(&x1[3], &x1[4]);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[23]);
  btf_32_add_sub_avx2(&x1[17], &x1[22]);
  btf_32_add_sub_avx2(&x1[18], &x1[21]);
  btf_32_add_sub_avx2(&x1[19], &x1[20]);
  btf_32_add_sub_avx2(&x1[31], &x1[24]);
  btf_32_add_sub_avx2(&x1[30], &x1[25]);
  btf_32_add_sub_avx2(&x1[29], &x1[26]);
  btf_32_add_sub_avx2(&x1[28], &x1[27]);

  // stage 4
  btf_32_add_sub_avx2(&x1[0], &x1[3]);
  btf_32_add_sub_avx2(&x1[1], &x1[2]);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[8], &x1[11]);
  btf_32_add_sub_avx2(&x1[9], &x1[10]);
  btf_32_add_sub_avx2(&x1[15], &x1[12]);
  btf_32_add_sub_avx2(&x1[14], &x1[13]);
  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit);

  // stage 5
  btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit);
  btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[4], &x1[5]);
  btf_32_add_sub_avx2(&x1[7], &x1[6]);
  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[19]);
  btf_32_add_sub_avx2(&x1[17], &x1[18]);
  btf_32_add_sub_avx2(&x1[23], &x1[20]);
  btf_32_add_sub_avx2(&x1[22], &x1[21]);
  btf_32_add_sub_avx2(&x1[24], &x1[27]);
  btf_32_add_sub_avx2(&x1[25], &x1[26]);
  btf_32_add_sub_avx2(&x1[31], &x1[28]);
  btf_32_add_sub_avx2(&x1[30], &x1[29]);

  // stage 6
  btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit);
  btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[8], &x1[9]);
  btf_32_add_sub_avx2(&x1[11], &x1[10]);
  btf_32_add_sub_avx2(&x1[12], &x1[13]);
  btf_32_add_sub_avx2(&x1[15], &x1[14]);
  btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit);
  btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit);

  // stage 7
  btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit);
  btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit);
  btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit);
  btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[17]);
  btf_32_add_sub_avx2(&x1[19], &x1[18]);
  btf_32_add_sub_avx2(&x1[20], &x1[21]);
  btf_32_add_sub_avx2(&x1[23], &x1[22]);
  btf_32_add_sub_avx2(&x1[24], &x1[25]);
  btf_32_add_sub_avx2(&x1[27], &x1[26]);
  btf_32_add_sub_avx2(&x1[28], &x1[29]);
  btf_32_add_sub_avx2(&x1[31], &x1[30]);

  // stage 8
  btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit);
  btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit);
  btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit);
  btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit);
  btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit);

  // stage 9
  output[0] = x1[0];
  output[1] = x1[16];
  output[2] = x1[8];
  output[3] = x1[24];
  output[4] = x1[4];
  output[5] = x1[20];
  output[6] = x1[12];
  output[7] = x1[28];
  output[8] = x1[2];
  output[9] = x1[18];
  output[10] = x1[10];
  output[11] = x1[26];
  output[12] = x1[6];
  output[13] = x1[22];
  output[14] = x1[14];
  output[15] = x1[30];
  output[16] = x1[1];
  output[17] = x1[17];
  output[18] = x1[9];
  output[19] = x1[25];
  output[20] = x1[5];
  output[21] = x1[21];
  output[22] = x1[13];
  output[23] = x1[29];
  output[24] = x1[3];
  output[25] = x1[19];
  output[26] = x1[11];
  output[27] = x1[27];
  output[28] = x1[7];
  output[29] = x1[23];
  output[30] = x1[15];
  output[31] = x1[31];
}

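// Forward 64-point DCT on 32-bit lanes. Unlike av1_fdct32_new_avx2 above,
// the butterfly helpers here take the cosine constants as pre-broadcast
// __m256i vectors rather than scalars.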
static INLINE void av1_fdct64_new_avx2(const __m256i *input, __m256i *output,
                                       int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));

  __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
  __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
  __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
  __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
  __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
  __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
  __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
  __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
  __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
  __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
  __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
  __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
  __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
  __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
  __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
  __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
  __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
  __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
  __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
  __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
  __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
  __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
  __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
  __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
  __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
  __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
  __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
  __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
  __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
  __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
  __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
  __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
  __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
  __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
  __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
  __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
  __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
  __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
  __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
  __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
  __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
  __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
  __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
  __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
  __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
  __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
  __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
  __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
  __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
  __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
  __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
  __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
  __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
  __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
  __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
  __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
  __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
  __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
  __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
  __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
  __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
  __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
  __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
  __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
  __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
  __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
  __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
  __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
  __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
  __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
  __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
  __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
  __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
  __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
  __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
  __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
  __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
  __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);

  // stage 1
  __m256i x1[64];
  btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]);
  btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]);
  btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]);
  btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]);
  btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]);
  btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]);
  btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]);
  btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]);
  btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]);
  btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]);
  btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]);
  btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]);
  btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]);
  btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]);
  btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]);
  btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]);
  btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]);
  btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]);
  btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]);
  btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]);
  btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]);
  btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]);
  btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]);
  btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]);
  btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]);
  btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]);
  btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]);
  btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]);
  btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]);
  btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]);
  btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]);
  btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]);

  // stage 2
  btf_32_add_sub_avx2(&x1[0], &x1[31]);
  btf_32_add_sub_avx2(&x1[1], &x1[30]);
  btf_32_add_sub_avx2(&x1[2], &x1[29]);
  btf_32_add_sub_avx2(&x1[3], &x1[28]);
  btf_32_add_sub_avx2(&x1[4], &x1[27]);
  btf_32_add_sub_avx2(&x1[5], &x1[26]);
  btf_32_add_sub_avx2(&x1[6], &x1[25]);
  btf_32_add_sub_avx2(&x1[7], &x1[24]);
  btf_32_add_sub_avx2(&x1[8], &x1[23]);
  btf_32_add_sub_avx2(&x1[9], &x1[22]);
  btf_32_add_sub_avx2(&x1[10], &x1[21]);
  btf_32_add_sub_avx2(&x1[11], &x1[20]);
  btf_32_add_sub_avx2(&x1[12], &x1[19]);
  btf_32_add_sub_avx2(&x1[13], &x1[18]);
  btf_32_add_sub_avx2(&x1[14], &x1[17]);
  btf_32_add_sub_avx2(&x1[15], &x1[16]);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit);

  // stage 3
  btf_32_add_sub_avx2(&x1[0], &x1[15]);
  btf_32_add_sub_avx2(&x1[1], &x1[14]);
  btf_32_add_sub_avx2(&x1[2], &x1[13]);
  btf_32_add_sub_avx2(&x1[3], &x1[12]);
  btf_32_add_sub_avx2(&x1[4], &x1[11]);
  btf_32_add_sub_avx2(&x1[5], &x1[10]);
  btf_32_add_sub_avx2(&x1[6], &x1[9]);
  btf_32_add_sub_avx2(&x1[7], &x1[8]);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[32], &x1[47]);
  btf_32_add_sub_avx2(&x1[33], &x1[46]);
  btf_32_add_sub_avx2(&x1[34], &x1[45]);
  btf_32_add_sub_avx2(&x1[35], &x1[44]);
  btf_32_add_sub_avx2(&x1[36], &x1[43]);
  btf_32_add_sub_avx2(&x1[37], &x1[42]);
  btf_32_add_sub_avx2(&x1[38], &x1[41]);
  btf_32_add_sub_avx2(&x1[39], &x1[40]);
  btf_32_add_sub_avx2(&x1[63], &x1[48]);
  btf_32_add_sub_avx2(&x1[62], &x1[49]);
  btf_32_add_sub_avx2(&x1[61], &x1[50]);
  btf_32_add_sub_avx2(&x1[60], &x1[51]);
  btf_32_add_sub_avx2(&x1[59], &x1[52]);
  btf_32_add_sub_avx2(&x1[58], &x1[53]);
  btf_32_add_sub_avx2(&x1[57], &x1[54]);
  btf_32_add_sub_avx2(&x1[56], &x1[55]);

  // stage 4
  btf_32_add_sub_avx2(&x1[0], &x1[7]);
  btf_32_add_sub_avx2(&x1[1], &x1[6]);
  btf_32_add_sub_avx2(&x1[2], &x1[5]);
  btf_32_add_sub_avx2(&x1[3], &x1[4]);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[23]);
  btf_32_add_sub_avx2(&x1[17], &x1[22]);
  btf_32_add_sub_avx2(&x1[18], &x1[21]);
  btf_32_add_sub_avx2(&x1[19], &x1[20]);
  btf_32_add_sub_avx2(&x1[31], &x1[24]);
  btf_32_add_sub_avx2(&x1[30], &x1[25]);
  btf_32_add_sub_avx2(&x1[29], &x1[26]);
  btf_32_add_sub_avx2(&x1[28], &x1[27]);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit);

  // stage 5
  btf_32_add_sub_avx2(&x1[0], &x1[3]);
  btf_32_add_sub_avx2(&x1[1], &x1[2]);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[8], &x1[11]);
  btf_32_add_sub_avx2(&x1[9], &x1[10]);
  btf_32_add_sub_avx2(&x1[15], &x1[12]);
  btf_32_add_sub_avx2(&x1[14], &x1[13]);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[32], &x1[39]);
  btf_32_add_sub_avx2(&x1[33], &x1[38]);
  btf_32_add_sub_avx2(&x1[34], &x1[37]);
  btf_32_add_sub_avx2(&x1[35], &x1[36]);
  btf_32_add_sub_avx2(&x1[47], &x1[40]);
  btf_32_add_sub_avx2(&x1[46], &x1[41]);
  btf_32_add_sub_avx2(&x1[45], &x1[42]);
  btf_32_add_sub_avx2(&x1[44], &x1[43]);
  btf_32_add_sub_avx2(&x1[48], &x1[55]);
  btf_32_add_sub_avx2(&x1[49], &x1[54]);
  btf_32_add_sub_avx2(&x1[50], &x1[53]);
  btf_32_add_sub_avx2(&x1[51], &x1[52]);
  btf_32_add_sub_avx2(&x1[63], &x1[56]);
  btf_32_add_sub_avx2(&x1[62], &x1[57]);
  btf_32_add_sub_avx2(&x1[61], &x1[58]);
  btf_32_add_sub_avx2(&x1[60], &x1[59]);

  // stage 6
  btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[4], &x1[5]);
  btf_32_add_sub_avx2(&x1[7], &x1[6]);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[19]);
  btf_32_add_sub_avx2(&x1[17], &x1[18]);
  btf_32_add_sub_avx2(&x1[23], &x1[20]);
  btf_32_add_sub_avx2(&x1[22], &x1[21]);
  btf_32_add_sub_avx2(&x1[24], &x1[27]);
  btf_32_add_sub_avx2(&x1[25], &x1[26]);
  btf_32_add_sub_avx2(&x1[31], &x1[28]);
  btf_32_add_sub_avx2(&x1[30], &x1[29]);
  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit);

  // stage 7
  btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[8], &x1[9]);
  btf_32_add_sub_avx2(&x1[11], &x1[10]);
  btf_32_add_sub_avx2(&x1[12], &x1[13]);
  btf_32_add_sub_avx2(&x1[15], &x1[14]);
  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[32], &x1[35]);
  btf_32_add_sub_avx2(&x1[33], &x1[34]);
  btf_32_add_sub_avx2(&x1[39], &x1[36]);
  btf_32_add_sub_avx2(&x1[38], &x1[37]);
  btf_32_add_sub_avx2(&x1[40], &x1[43]);
  btf_32_add_sub_avx2(&x1[41], &x1[42]);
  btf_32_add_sub_avx2(&x1[47], &x1[44]);
  btf_32_add_sub_avx2(&x1[46], &x1[45]);
  btf_32_add_sub_avx2(&x1[48], &x1[51]);
  btf_32_add_sub_avx2(&x1[49], &x1[50]);
  btf_32_add_sub_avx2(&x1[55], &x1[52]);
  btf_32_add_sub_avx2(&x1[54], &x1[53]);
  btf_32_add_sub_avx2(&x1[56], &x1[59]);
  btf_32_add_sub_avx2(&x1[57], &x1[58]);
  btf_32_add_sub_avx2(&x1[63], &x1[60]);
  btf_32_add_sub_avx2(&x1[62], &x1[61]);

  // stage 8
  btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[17]);
  btf_32_add_sub_avx2(&x1[19], &x1[18]);
  btf_32_add_sub_avx2(&x1[20], &x1[21]);
  btf_32_add_sub_avx2(&x1[23], &x1[22]);
  btf_32_add_sub_avx2(&x1[24], &x1[25]);
  btf_32_add_sub_avx2(&x1[27], &x1[26]);
  btf_32_add_sub_avx2(&x1[28], &x1[29]);
  btf_32_add_sub_avx2(&x1[31], &x1[30]);
  btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit);

  // stage 9
  btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit);
1176 btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit);
1177 btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit);
1178 btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit);
1179 btf_32_add_sub_avx2(&x1[32], &x1[33]);
1180 btf_32_add_sub_avx2(&x1[35], &x1[34]);
1181 btf_32_add_sub_avx2(&x1[36], &x1[37]);
1182 btf_32_add_sub_avx2(&x1[39], &x1[38]);
1183 btf_32_add_sub_avx2(&x1[40], &x1[41]);
1184 btf_32_add_sub_avx2(&x1[43], &x1[42]);
1185 btf_32_add_sub_avx2(&x1[44], &x1[45]);
1186 btf_32_add_sub_avx2(&x1[47], &x1[46]);
1187 btf_32_add_sub_avx2(&x1[48], &x1[49]);
1188 btf_32_add_sub_avx2(&x1[51], &x1[50]);
1189 btf_32_add_sub_avx2(&x1[52], &x1[53]);
1190 btf_32_add_sub_avx2(&x1[55], &x1[54]);
1191 btf_32_add_sub_avx2(&x1[56], &x1[57]);
1192 btf_32_add_sub_avx2(&x1[59], &x1[58]);
1193 btf_32_add_sub_avx2(&x1[60], &x1[61]);
1194 btf_32_add_sub_avx2(&x1[63], &x1[62]);
1195
1196 // stage 10
1197 btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit);
1198 btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit);
1199 btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit);
1200 btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit);
1201 btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit);
1202 btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit);
1203 btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit);
1204 btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit);
1205 btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit);
1206 btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit);
1207 btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit);
1208 btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit);
1209 btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit);
1210 btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit);
1211 btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit);
1212 btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit);
1213
1214 // stage 11
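  // The writes below apply the DCT's bit-reversed output permutation:
  // output[k] = x1[bitrev(k)] over 6-bit indices, e.g. output[1] = x1[32],
  // output[2] = x1[16], output[3] = x1[48].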
1215 output[0] = x1[0];
1216 output[1] = x1[32];
1217 output[2] = x1[16];
1218 output[3] = x1[48];
1219 output[4] = x1[8];
1220 output[5] = x1[40];
1221 output[6] = x1[24];
1222 output[7] = x1[56];
1223 output[8] = x1[4];
1224 output[9] = x1[36];
1225 output[10] = x1[20];
1226 output[11] = x1[52];
1227 output[12] = x1[12];
1228 output[13] = x1[44];
1229 output[14] = x1[28];
1230 output[15] = x1[60];
1231 output[16] = x1[2];
1232 output[17] = x1[34];
1233 output[18] = x1[18];
1234 output[19] = x1[50];
1235 output[20] = x1[10];
1236 output[21] = x1[42];
1237 output[22] = x1[26];
1238 output[23] = x1[58];
1239 output[24] = x1[6];
1240 output[25] = x1[38];
1241 output[26] = x1[22];
1242 output[27] = x1[54];
1243 output[28] = x1[14];
1244 output[29] = x1[46];
1245 output[30] = x1[30];
1246 output[31] = x1[62];
1247 output[32] = x1[1];
1248 output[33] = x1[33];
1249 output[34] = x1[17];
1250 output[35] = x1[49];
1251 output[36] = x1[9];
1252 output[37] = x1[41];
1253 output[38] = x1[25];
1254 output[39] = x1[57];
1255 output[40] = x1[5];
1256 output[41] = x1[37];
1257 output[42] = x1[21];
1258 output[43] = x1[53];
1259 output[44] = x1[13];
1260 output[45] = x1[45];
1261 output[46] = x1[29];
1262 output[47] = x1[61];
1263 output[48] = x1[3];
1264 output[49] = x1[35];
1265 output[50] = x1[19];
1266 output[51] = x1[51];
1267 output[52] = x1[11];
1268 output[53] = x1[43];
1269 output[54] = x1[27];
1270 output[55] = x1[59];
1271 output[56] = x1[7];
1272 output[57] = x1[39];
1273 output[58] = x1[23];
1274 output[59] = x1[55];
1275 output[60] = x1[15];
1276 output[61] = x1[47];
1277 output[62] = x1[31];
1278 output[63] = x1[63];
1279 }
1280
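/*
 * 16-point forward ADST on 16 lanes of 16-bit samples. Stage 1 negates and
 * reorders the inputs so the ADST factors into the same btf_16_w16_avx2
 * butterflies used by the DCT path; stage 9 then writes the results back in
 * the ADST's interleaved output order.
 */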
1281 static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
1282 int8_t cos_bit) {
1283 const int32_t *cospi = cospi_arr(cos_bit);
1284 const __m256i __zero = _mm256_setzero_si256();
1285 const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
1286
1287 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1288 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
1289 __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
1290 __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
1291 __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
1292 __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
1293 __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
1294 __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
1295 __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
1296 __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
1297 __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
1298 __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
1299 __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
1300 __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
1301 __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
1302 __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
1303 __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
1304 __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
1305 __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
1306 __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
1307 __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
1308 __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
1309 __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
1310 __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
1311 __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
1312 __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
1313 __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
1314
1315 // stage 1
1316 __m256i x1[16];
1317 x1[0] = input[0];
1318 x1[1] = _mm256_subs_epi16(__zero, input[15]);
1319 x1[2] = _mm256_subs_epi16(__zero, input[7]);
1320 x1[3] = input[8];
1321 x1[4] = _mm256_subs_epi16(__zero, input[3]);
1322 x1[5] = input[12];
1323 x1[6] = input[4];
1324 x1[7] = _mm256_subs_epi16(__zero, input[11]);
1325 x1[8] = _mm256_subs_epi16(__zero, input[1]);
1326 x1[9] = input[14];
1327 x1[10] = input[6];
1328 x1[11] = _mm256_subs_epi16(__zero, input[9]);
1329 x1[12] = input[2];
1330 x1[13] = _mm256_subs_epi16(__zero, input[13]);
1331 x1[14] = _mm256_subs_epi16(__zero, input[5]);
1332 x1[15] = input[10];
1333
1334 // stage 2
1335 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
1336 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
1337 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
1338 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
1339
1340 // stage 3
1341 btf_16_adds_subs_avx2(&x1[0], &x1[2]);
1342 btf_16_adds_subs_avx2(&x1[1], &x1[3]);
1343 btf_16_adds_subs_avx2(&x1[4], &x1[6]);
1344 btf_16_adds_subs_avx2(&x1[5], &x1[7]);
1345 btf_16_adds_subs_avx2(&x1[8], &x1[10]);
1346 btf_16_adds_subs_avx2(&x1[9], &x1[11]);
1347 btf_16_adds_subs_avx2(&x1[12], &x1[14]);
1348 btf_16_adds_subs_avx2(&x1[13], &x1[15]);
1349
1350 // stage 4
1351 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
1352 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit);
1353 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
1354 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit);
1355
1356 // stage 5
1357 btf_16_adds_subs_avx2(&x1[0], &x1[4]);
1358 btf_16_adds_subs_avx2(&x1[1], &x1[5]);
1359 btf_16_adds_subs_avx2(&x1[2], &x1[6]);
1360 btf_16_adds_subs_avx2(&x1[3], &x1[7]);
1361 btf_16_adds_subs_avx2(&x1[8], &x1[12]);
1362 btf_16_adds_subs_avx2(&x1[9], &x1[13]);
1363 btf_16_adds_subs_avx2(&x1[10], &x1[14]);
1364 btf_16_adds_subs_avx2(&x1[11], &x1[15]);
1365
1366 // stage 6
1367 btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
1368 btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit);
1369 btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit);
1370 btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit);
1371
1372 // stage 7
1373 btf_16_adds_subs_avx2(&x1[0], &x1[8]);
1374 btf_16_adds_subs_avx2(&x1[1], &x1[9]);
1375 btf_16_adds_subs_avx2(&x1[2], &x1[10]);
1376 btf_16_adds_subs_avx2(&x1[3], &x1[11]);
1377 btf_16_adds_subs_avx2(&x1[4], &x1[12]);
1378 btf_16_adds_subs_avx2(&x1[5], &x1[13]);
1379 btf_16_adds_subs_avx2(&x1[6], &x1[14]);
1380 btf_16_adds_subs_avx2(&x1[7], &x1[15]);
1381
1382 // stage 8
1383 btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
1384 btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
1385 btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
1386 btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
1387 btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
1388 btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
1389 btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
1390 btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
1391
1392 // stage 9
1393 output[0] = x1[1];
1394 output[1] = x1[14];
1395 output[2] = x1[3];
1396 output[3] = x1[12];
1397 output[4] = x1[5];
1398 output[5] = x1[10];
1399 output[6] = x1[7];
1400 output[7] = x1[8];
1401 output[8] = x1[9];
1402 output[9] = x1[6];
1403 output[10] = x1[11];
1404 output[11] = x1[4];
1405 output[12] = x1[13];
1406 output[13] = x1[2];
1407 output[14] = x1[15];
1408 output[15] = x1[0];
1409 }
1410
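/*
 * 16x16 identity transform. The nominal per-dimension scale is 2 * sqrt(2),
 * applied here as the fixed-point constant 2 * NewSqrt2 (Q12). Interleaving
 * each sample with 1 lets scale_round_avx2 fold the multiply and the rounding
 * offset into one _mm256_madd_epi16; a sketch of the presumed arithmetic:
 *   out = (in * 2 * NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits
 */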
1411 static INLINE void fidentity16x16_new_avx2(const __m256i *input,
1412 __m256i *output, int8_t cos_bit) {
1413 (void)cos_bit;
1414 const __m256i one = _mm256_set1_epi16(1);
1415
1416 for (int i = 0; i < 16; ++i) {
1417 const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one);
1418 const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one);
1419 const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
1420 const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
1421 output[i] = _mm256_packs_epi32(b_lo, b_hi);
1422 }
1423 }
1424
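/* 32-point identity transform: the required scale of 4 is an exact power of
 * two, so a plain left shift replaces the rounded multiply used above. */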
1425 static INLINE void fidentity16x32_new_avx2(const __m256i *input,
1426 __m256i *output, int8_t cos_bit) {
1427 (void)cos_bit;
1428 for (int i = 0; i < 32; ++i) {
1429 output[i] = _mm256_slli_epi16(input[i], 2);
1430 }
1431 }
1432
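/*
 * Transpose of an 8x8 tile of 32-bit coefficients. The two unpack passes
 * transpose within each 128-bit lane; the final permute2x128 step swaps lane
 * halves to complete the transpose. `stride` is measured in __m256i rows of
 * the output.
 */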
1433 static INLINE void transpose_32_8x8_avx2(int stride, const __m256i *inputA,
1434 __m256i *output) {
1435 __m256i temp0 = _mm256_unpacklo_epi32(inputA[0], inputA[2]);
1436 __m256i temp1 = _mm256_unpackhi_epi32(inputA[0], inputA[2]);
1437 __m256i temp2 = _mm256_unpacklo_epi32(inputA[1], inputA[3]);
1438 __m256i temp3 = _mm256_unpackhi_epi32(inputA[1], inputA[3]);
1439 __m256i temp4 = _mm256_unpacklo_epi32(inputA[4], inputA[6]);
1440 __m256i temp5 = _mm256_unpackhi_epi32(inputA[4], inputA[6]);
1441 __m256i temp6 = _mm256_unpacklo_epi32(inputA[5], inputA[7]);
1442 __m256i temp7 = _mm256_unpackhi_epi32(inputA[5], inputA[7]);
1443
1444 __m256i t0 = _mm256_unpacklo_epi32(temp0, temp2);
1445 __m256i t1 = _mm256_unpackhi_epi32(temp0, temp2);
1446 __m256i t2 = _mm256_unpacklo_epi32(temp1, temp3);
1447 __m256i t3 = _mm256_unpackhi_epi32(temp1, temp3);
1448 __m256i t4 = _mm256_unpacklo_epi32(temp4, temp6);
1449 __m256i t5 = _mm256_unpackhi_epi32(temp4, temp6);
1450 __m256i t6 = _mm256_unpacklo_epi32(temp5, temp7);
1451 __m256i t7 = _mm256_unpackhi_epi32(temp5, temp7);
1452
1453 output[0 * stride] = _mm256_permute2x128_si256(t0, t4, 0x20);
1454 output[1 * stride] = _mm256_permute2x128_si256(t1, t5, 0x20);
1455 output[2 * stride] = _mm256_permute2x128_si256(t2, t6, 0x20);
1456 output[3 * stride] = _mm256_permute2x128_si256(t3, t7, 0x20);
1457 output[4 * stride] = _mm256_permute2x128_si256(t0, t4, 0x31);
1458 output[5 * stride] = _mm256_permute2x128_si256(t1, t5, 0x31);
1459 output[6 * stride] = _mm256_permute2x128_si256(t2, t6, 0x31);
1460 output[7 * stride] = _mm256_permute2x128_si256(t3, t7, 0x31);
1461 }
1462
1463 // Store rows of 16 16-bit values, sign extending each value to 32 bits.
1464 static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
1465 int32_t *out,
1466 const int stride,
1467 const int out_size) {
1468 for (int i = 0; i < out_size; ++i) {
1469 _mm256_store_si256((__m256i *)(out),
1470 _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i])));
1471 _mm256_store_si256(
1472 (__m256i *)(out + 8),
1473 _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1)));
1474 out += stride;
1475 }
1476 }
1477
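/*
 * Store one row of 16 coefficients for a rectangular transform: widen to
 * 32 bits and apply the NewSqrt2 (~sqrt(2) in Q12) rectangular scale with
 * rounding. The permute4x64(0xd8) pre-shuffle makes the unpack results come
 * out in linear order across the two 128-bit lanes.
 */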
1478 static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a,
1479 int32_t *const b) {
1480 const __m256i one = _mm256_set1_epi16(1);
1481 const __m256i a_reorder = _mm256_permute4x64_epi64(a, 0xd8);
1482 const __m256i a_lo = _mm256_unpacklo_epi16(a_reorder, one);
1483 const __m256i a_hi = _mm256_unpackhi_epi16(a_reorder, one);
1484 const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
1485 const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
1486 _mm256_store_si256((__m256i *)b, b_lo);
1487 _mm256_store_si256((__m256i *)(b + 8), b_hi);
1488 }
1489
1490 static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2(
1491 const __m256i *const in, int32_t *const out, const int stride,
1492 const int out_size) {
1493 for (int i = 0; i < out_size; ++i) {
1494 store_rect_16bit_to_32bit_avx2(in[i], out + i * stride);
1495 }
1496 }
1497
1498 typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
1499 int8_t cos_bit);
1500
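/*
 * Per-TX_TYPE 1-D kernel tables. The 32-point tables populate only the DCT
 * and identity entries: AV1 defines no 32-point ADST, so the spec never
 * selects the NULL combinations and they are never dereferenced. A usage
 * sketch (hypothetical caller, mirroring the wrappers further below):
 *
 *   const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
 *   col_txfm(buf0, buf0, cos_bit_col);  // valid only for supported tx_type
 */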
1501 static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
1502 fdct16x32_new_avx2, // DCT_DCT
1503 NULL, // ADST_DCT
1504 NULL, // DCT_ADST
1505 NULL, // ADST_ADST
1506 NULL, // FLIPADST_DCT
1507 NULL, // DCT_FLIPADST
1508 NULL, // FLIPADST_FLIPADST
1509 NULL, // ADST_FLIPADST
1510 NULL, // FLIPADST_ADST
1511 fidentity16x32_new_avx2, // IDTX
1512 fdct16x32_new_avx2, // V_DCT
1513 fidentity16x32_new_avx2, // H_DCT
1514 NULL, // V_ADST
1515 NULL, // H_ADST
1516 NULL, // V_FLIPADST
1517 NULL // H_FLIPADST
1518 };
1519
1520 static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
1521 fdct16x32_new_avx2, // DCT_DCT
1522 NULL, // ADST_DCT
1523 NULL, // DCT_ADST
1524 NULL, // ADST_ADST
1525 NULL, // FLIPADST_DCT
1526 NULL, // DCT_FLIPADST
1527 NULL, // FLIPADST_FLIPADST
1528 NULL, // ADST_FLIPADST
1529 NULL, // FLIPADST_ADST
1530 fidentity16x32_new_avx2, // IDTX
1531 fidentity16x32_new_avx2, // V_DCT
1532 fdct16x32_new_avx2, // H_DCT
1533 NULL, // V_ADST
1534 NULL, // H_ADST
1535 NULL, // V_FLIPADST
1536 NULL // H_FLIPADST
1537 };
1538
1539 static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
1540 fdct16x16_new_avx2, // DCT_DCT
1541 fadst16x16_new_avx2, // ADST_DCT
1542 fdct16x16_new_avx2, // DCT_ADST
1543 fadst16x16_new_avx2, // ADST_ADST
1544 fadst16x16_new_avx2, // FLIPADST_DCT
1545 fdct16x16_new_avx2, // DCT_FLIPADST
1546 fadst16x16_new_avx2, // FLIPADST_FLIPADST
1547 fadst16x16_new_avx2, // ADST_FLIPADST
1548 fadst16x16_new_avx2, // FLIPADST_ADST
1549 fidentity16x16_new_avx2, // IDTX
1550 fdct16x16_new_avx2, // V_DCT
1551 fidentity16x16_new_avx2, // H_DCT
1552 fadst16x16_new_avx2, // V_ADST
1553 fidentity16x16_new_avx2, // H_ADST
1554 fadst16x16_new_avx2, // V_FLIPADST
1555 fidentity16x16_new_avx2 // H_FLIPADST
1556 };
1557
1558 static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
1559 fdct16x16_new_avx2, // DCT_DCT
1560 fdct16x16_new_avx2, // ADST_DCT
1561 fadst16x16_new_avx2, // DCT_ADST
1562 fadst16x16_new_avx2, // ADST_ADST
1563 fdct16x16_new_avx2, // FLIPADST_DCT
1564 fadst16x16_new_avx2, // DCT_FLIPADST
1565 fadst16x16_new_avx2, // FLIPADST_FLIPADST
1566 fadst16x16_new_avx2, // ADST_FLIPADST
1567 fadst16x16_new_avx2, // FLIPADST_ADST
1568 fidentity16x16_new_avx2, // IDTX
1569 fidentity16x16_new_avx2, // V_DCT
1570 fdct16x16_new_avx2, // H_DCT
1571 fidentity16x16_new_avx2, // V_ADST
1572 fadst16x16_new_avx2, // H_ADST
1573 fidentity16x16_new_avx2, // V_FLIPADST
1574 fadst16x16_new_avx2 // H_FLIPADST
1575 };
1576
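/*
 * 2-D 16x16 lowbd forward transform. Pipeline: load 16-bit residuals
 * (optionally flipped for FLIPADST variants), round-shift by shift[0],
 * column transform, round-shift by shift[1], transpose, row transform,
 * round-shift by shift[2], transpose back, and store sign-extended 32-bit
 * coefficients.
 */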
1577 static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
1578 int stride, TX_TYPE tx_type, int bd) {
1579 (void)bd;
1580 const TX_SIZE tx_size = TX_16X16;
1581 __m256i buf0[16], buf1[16];
1582 const int8_t *shift = fwd_txfm_shift_ls[tx_size];
1583 const int txw_idx = get_txw_idx(tx_size);
1584 const int txh_idx = get_txh_idx(tx_size);
1585 const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
1586 const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
1587 const int width = tx_size_wide[tx_size];
1588 const int height = tx_size_high[tx_size];
1589 const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
1590 const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
1591 int ud_flip, lr_flip;
1592
1593 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1594 const int32_t i = 0;
1595 if (ud_flip) {
1596 load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height);
1597 } else {
1598 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1599 }
1600 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1601 col_txfm(buf0, buf0, cos_bit_col);
1602 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1603 transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
1604
1605 __m256i *buf;
1606 if (lr_flip) {
1607 buf = buf0;
1608 flip_buf_avx2(buf1 + width * i, buf, width);
1609 } else {
1610 buf = buf1 + width * i;
1611 }
1612 row_txfm(buf, buf, cos_bit_row);
1613 round_shift_16bit_w16_avx2(buf, width, shift[2]);
1614 transpose_16bit_16x16_avx2(buf, buf);
1615 store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width, 16);
1616 }
1617
1618 static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
1619 int stride, TX_TYPE tx_type, int bd) {
1620 (void)bd;
1621 const TX_SIZE tx_size = TX_32X32;
1622 __m256i buf0[32], buf1[128];
1623 const int8_t *shift = fwd_txfm_shift_ls[tx_size];
1624 const int txw_idx = get_txw_idx(tx_size);
1625 const int txh_idx = get_txh_idx(tx_size);
1626 const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
1627 const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
1628 const int width = tx_size_wide[tx_size];
1629 const int height = tx_size_high[tx_size];
1630 const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
1631 const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
1632
1633 int ud_flip, lr_flip;
1634 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1635
1636 for (int i = 0; i < 2; i++) {
1637 if (ud_flip) {
1638 load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
1639 height);
1640 } else {
1641 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1642 }
1643 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1644 col_txfm(buf0, buf0, cos_bit_col);
1645 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1646 transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i);
1647 transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i);
1648 }
1649
1650 for (int i = 0; i < 2; i++) {
1651 __m256i *buf;
1652 if (lr_flip) {
1653 buf = buf0;
1654 flip_buf_avx2(buf1 + width * i, buf, width);
1655 } else {
1656 buf = buf1 + width * i;
1657 }
1658 row_txfm(buf, buf, cos_bit_row);
1659 round_shift_16bit_w16_avx2(buf, width, shift[2]);
1660 transpose_16bit_16x16_avx2(buf, buf);
1661 store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width,
1662 16);
1663 transpose_16bit_16x16_avx2(buf + 16, buf + 16);
1664 store_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16 * width * i + 16,
1665 width, 16);
1666 }
1667 }
1668
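/*
 * 64x64 forward DCT. AV1 keeps only the lowest 32 coefficients of a 64-point
 * transform in each dimension, so just the top-left 32x32 of the output is
 * computed: the AOMMIN(2, ...) bounds cap the transposed row chunks, and the
 * row pass widens to 32 bits for av1_fdct64_new_avx2 before storing 32
 * coefficients per row.
 */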
1669 static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
1670 int stride, TX_TYPE tx_type, int bd) {
1671 (void)bd;
1672 (void)tx_type;
1673 assert(tx_type == DCT_DCT);
1674 const TX_SIZE tx_size = TX_64X64;
1675 __m256i buf0[64], buf1[256];
1676 const int8_t *shift = fwd_txfm_shift_ls[tx_size];
1677 const int txw_idx = get_txw_idx(tx_size);
1678 const int txh_idx = get_txh_idx(tx_size);
1679 const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
1680 const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
1681 const int width = tx_size_wide[tx_size];
1682 const int height = tx_size_high[tx_size];
1683 const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
1684 const int width_div16 = (width >> 4);
1685 const int height_div16 = (height >> 4);
1686
1687 for (int i = 0; i < width_div16; i++) {
1688 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1689 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1690 col_txfm(buf0, buf0, cos_bit_col);
1691 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1692 for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
1693 transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
1694 }
1695 }
1696
1697 for (int i = 0; i < AOMMIN(2, height_div16); i++) {
1698 __m256i bufA[64];
1699 __m256i bufB[64];
1700 __m128i *buf = (__m128i *)(buf1 + width * i);
1701 for (int j = 0; j < width; ++j) {
1702 bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
1703 bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
1704 }
1705 av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
1706 av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
1707 av1_round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
1708 av1_round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
1709
1710 int32_t *output8 = output + 16 * 32 * i;
1711 for (int j = 0; j < 4; ++j) {
1712 __m256i *out = (__m256i *)(output8 + 8 * j);
1713 transpose_32_8x8_avx2(4, bufA + 8 * j, out);
1714 transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);
1715 }
1716 }
1717 }
1718
1719 static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output,
1720 int stride, TX_TYPE tx_type, int bd) {
1721 (void)bd;
1722 const TX_SIZE tx_size = TX_16X32;
1723 __m256i buf0[32], buf1[32];
1724 const int8_t *shift = fwd_txfm_shift_ls[tx_size];
1725 const int txw_idx = get_txw_idx(tx_size);
1726 const int txh_idx = get_txh_idx(tx_size);
1727 const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
1728 const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
1729 const int width = tx_size_wide[tx_size];
1730 const int height = tx_size_high[tx_size];
1731 const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
1732 const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
1733
1734 int ud_flip, lr_flip;
1735 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1736
1737 if (ud_flip) {
1738 load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height);
1739 } else {
1740 load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height);
1741 }
1742 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1743 col_txfm(buf0, buf0, cos_bit_col);
1744 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1745 transpose_16bit_16x16_avx2(buf0, buf1);
1746 transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16);
1747
1748 for (int i = 0; i < 2; i++) {
1749 __m256i *buf;
1750 if (lr_flip) {
1751 buf = buf0;
1752 flip_buf_avx2(buf1 + width * i, buf, width);
1753 } else {
1754 buf = buf1 + width * i;
1755 }
1756 row_txfm(buf, buf, cos_bit_row);
1757 round_shift_16bit_w16_avx2(buf, width, shift[2]);
1758 transpose_16bit_16x16_avx2(buf, buf);
1759 store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i,
1760 width, 16);
1761 }
1762 }
1763
1764 static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output,
1765 int stride, TX_TYPE tx_type, int bd) {
1766 (void)bd;
1767 __m256i buf0[32], buf1[64];
1768 const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
1769 const int txw_idx = get_txw_idx(TX_32X16);
1770 const int txh_idx = get_txh_idx(TX_32X16);
1771 const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
1772 const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
1773 const int width = 32;
1774 const int height = 16;
1775 const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
1776 const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
1777
1778 int ud_flip, lr_flip;
1779 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1780
1781 for (int i = 0; i < 2; i++) {
1782 if (ud_flip) {
1783 load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
1784 height);
1785 } else {
1786 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1787 }
1788 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1789 col_txfm(buf0, buf0, cos_bit_col);
1790 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1791 transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
1792 }
1793
1794 __m256i *buf;
1795 if (lr_flip) {
1796 buf = buf0;
1797 flip_buf_avx2(buf1, buf, width);
1798 } else {
1799 buf = buf1;
1800 }
1801 row_txfm(buf, buf, cos_bit_row);
1802 round_shift_16bit_w16_avx2(buf, width, shift[2]);
1803 transpose_16bit_16x16_avx2(buf, buf);
1804 store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, width, 16);
1805
1806 transpose_16bit_16x16_avx2(buf + 16, buf + 16);
1807 store_rect_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16, width, 16);
1808 }
1809
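/*
 * 64x32 forward transform. As in the 64x64 case, only 32 of the 64 row
 * coefficients survive, and because the block is rectangular (2:1) the final
 * round-shift also applies the NewSqrt2 scale via
 * av1_round_shift_rect_array_32_avx2.
 */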
1810 static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
1811 int stride, TX_TYPE tx_type, int bd) {
1812 (void)bd;
1813 const TX_SIZE tx_size = TX_64X32;
1814 __m256i buf0[64], buf1[256];
1815 const int8_t *shift = fwd_txfm_shift_ls[tx_size];
1816 const int txw_idx = get_txw_idx(tx_size);
1817 const int txh_idx = get_txh_idx(tx_size);
1818 const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
1819 const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
1820 const int width = tx_size_wide[tx_size];
1821 const int height = tx_size_high[tx_size];
1822 const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
1823 const int width_div16 = (width >> 4);
1824 const int height_div16 = (height >> 4);
1825
1826 for (int i = 0; i < width_div16; i++) {
1827 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1828 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1829 col_txfm(buf0, buf0, cos_bit_col);
1830 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1831 for (int j = 0; j < AOMMIN(4, height_div16); ++j) {
1832 transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
1833 }
1834 }
1835 assert(tx_type == DCT_DCT);
1836 for (int i = 0; i < AOMMIN(2, height_div16); i++) {
1837 __m256i bufA[64];
1838 __m256i bufB[64];
1839 __m128i *buf = (__m128i *)(buf1 + width * i);
1840 for (int j = 0; j < width; ++j) {
1841 bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
1842 bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
1843 }
1844 av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
1845 av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
1846 av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
1847 av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
1848
1849 int32_t *output8 = output + 16 * 32 * i;
1850 for (int j = 0; j < 4; ++j) {
1851 __m256i *out = (__m256i *)(output8 + 8 * j);
1852 transpose_32_8x8_avx2(4, bufA + 8 * j, out);
1853 transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);
1854 }
1855 }
1856 }
1857
1858 static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
1859 int stride, TX_TYPE tx_type, int bd) {
1860 (void)bd;
1861 (void)tx_type;
1862 assert(tx_type == DCT_DCT);
1863 const TX_SIZE tx_size = TX_32X64;
1864 __m256i buf0[64], buf1[256];
1865 const int8_t *shift = fwd_txfm_shift_ls[tx_size];
1866 const int txw_idx = get_txw_idx(tx_size);
1867 const int txh_idx = get_txh_idx(tx_size);
1868 const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
1869 const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
1870 const int width = tx_size_wide[tx_size];
1871 const int height = tx_size_high[tx_size];
1872 const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
1873 const int width_div16 = (width >> 4);
1874 const int height_div16 = (height >> 4);
1875
1876 for (int i = 0; i < width_div16; i++) {
1877 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1878 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1879 col_txfm(buf0, buf0, cos_bit_col);
1880 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1881 for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
1882 transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
1883 }
1884 }
1885
1886 for (int i = 0; i < AOMMIN(2, height_div16); i++) {
1887 __m256i bufA[32];
1888 __m256i bufB[32];
1889 __m128i *buf = (__m128i *)(buf1 + width * i);
1890 for (int j = 0; j < width; ++j) {
1891 bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
1892 bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
1893 }
1894 av1_fdct32_new_avx2(bufA, bufA, cos_bit_row);
1895 av1_fdct32_new_avx2(bufB, bufB, cos_bit_row);
1896 av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
1897 av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
1898
1899 int32_t *output8 = output + 16 * 32 * i;
1900 for (int j = 0; j < 4; ++j) {
1901 __m256i *out = (__m256i *)(output8 + 8 * j);
1902 transpose_32_8x8_avx2(4, bufA + 8 * j, out);
1903 transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);
1904 }
1905 }
1906 }
1907
1908 static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output,
1909 int stride, TX_TYPE tx_type, int bd) {
1910 (void)bd;
1911 (void)tx_type;
1912 assert(tx_type == DCT_DCT);
1913 const TX_SIZE tx_size = TX_16X64;
1914 __m256i buf0[64], buf1[64];
1915 const int8_t *shift = fwd_txfm_shift_ls[tx_size];
1916 const int txw_idx = get_txw_idx(tx_size);
1917 const int txh_idx = get_txh_idx(tx_size);
1918 const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
1919 const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
1920 const int width = tx_size_wide[tx_size];
1921 const int height = tx_size_high[tx_size];
1922 const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
1923 const transform_1d_avx2 row_txfm = fdct16x16_new_avx2;
1924 const int width_div16 = (width >> 4);
1925 const int height_div16 = (height >> 4);
1926
1927 for (int i = 0; i < width_div16; i++) {
1928 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1929 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1930 col_txfm(buf0, buf0, cos_bit_col);
1931 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1932 for (int j = 0; j < height_div16; ++j) {
1933 transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
1934 }
1935 }
1936
1937 for (int i = 0; i < AOMMIN(4, height_div16); i++) {
1938 __m256i *buf = buf1 + width * i;
1939 row_txfm(buf, buf, cos_bit_row);
1940 round_shift_16bit_w16_avx2(buf, width, shift[2]);
1941 int32_t *output16 = output + 16 * width * i;
1942 for (int j = 0; j < width_div16; ++j) {
1943 __m256i *buf16 = buf + 16 * j;
1944 transpose_16bit_16x16_avx2(buf16, buf16);
1945 store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, width, 16);
1946 }
1947 }
1948 // Zero out the bottom 16x32 area.
1949 memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
1950 }
1951
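/*
 * 64x16 forward transform: the 64-point row transform keeps only its lowest
 * 32 outputs, so coefficients are written on a 32-wide grid (the j < 2 store
 * loop below).
 */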
1952 static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
1953 int stride, TX_TYPE tx_type, int bd) {
1954 (void)bd;
1955 (void)tx_type;
1956 assert(tx_type == DCT_DCT);
1957 const TX_SIZE tx_size = TX_64X16;
1958 __m256i buf0[64], buf1[64];
1959 const int8_t *shift = fwd_txfm_shift_ls[tx_size];
1960 const int txw_idx = get_txw_idx(tx_size);
1961 const int txh_idx = get_txh_idx(tx_size);
1962 const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
1963 const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
1964 const int width = tx_size_wide[tx_size];
1965 const int height = tx_size_high[tx_size];
1966 const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
1967 const transform_1d_avx2 row_txfm = fdct16x64_new_avx2;
1968 const int width_div16 = (width >> 4);
1969 const int height_div16 = (height >> 4);
1970
1971 for (int i = 0; i < width_div16; i++) {
1972 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1973 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1974 col_txfm(buf0, buf0, cos_bit_col);
1975 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1976 for (int j = 0; j < height_div16; ++j) {
1977 transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
1978 }
1979 }
1980
1981 for (int i = 0; i < height_div16; i++) {
1982 __m256i *buf = buf1 + width * i;
1983 row_txfm(buf, buf, cos_bit_row);
1984 round_shift_16bit_w16_avx2(buf, width, shift[2]);
1985 int32_t *output16 = output + 16 * 32 * i;
1986 for (int j = 0; j < 2; ++j) {
1987 __m256i *buf16 = buf + 16 * j;
1988 transpose_16bit_16x16_avx2(buf16, buf16);
1989 store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, 32, 16);
1990 }
1991 }
1992 }
1993
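/*
 * Butterfly helper over 16 lanes. For each 16-bit lane pair it computes
 *   out_a = (w0.lo * in0 + w0.hi * in1 + round) >> cos_bit
 *   out_b = (w1.lo * in0 + w1.hi * in1 + round) >> cos_bit
 * using _mm256_madd_epi16 on interleaved inputs, and returns the results
 * split into four 128-bit halves so callers can regroup 8-wide rows without
 * extra cross-lane shuffles.
 */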
1994 static INLINE void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0,
1995 __m256i *in1, __m128i *out0, __m128i *out1,
1996 __m128i *out2, __m128i *out3,
1997 const __m256i *__rounding, int8_t *cos_bit) {
1998 __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
1999 __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
2000 __m256i u0 = _mm256_madd_epi16(t0, *w0);
2001 __m256i u1 = _mm256_madd_epi16(t1, *w0);
2002 __m256i v0 = _mm256_madd_epi16(t0, *w1);
2003 __m256i v1 = _mm256_madd_epi16(t1, *w1);
2004
2005 __m256i a0 = _mm256_add_epi32(u0, *__rounding);
2006 __m256i a1 = _mm256_add_epi32(u1, *__rounding);
2007 __m256i b0 = _mm256_add_epi32(v0, *__rounding);
2008 __m256i b1 = _mm256_add_epi32(v1, *__rounding);
2009
2010 __m256i c0 = _mm256_srai_epi32(a0, *cos_bit);
2011 __m256i c1 = _mm256_srai_epi32(a1, *cos_bit);
2012 __m256i d0 = _mm256_srai_epi32(b0, *cos_bit);
2013 __m256i d1 = _mm256_srai_epi32(b1, *cos_bit);
2014
2015 __m256i temp0 = _mm256_packs_epi32(c0, c1);
2016 __m256i temp1 = _mm256_packs_epi32(d0, d1);
2017
2018 *out0 = _mm256_castsi256_si128(temp0);
2019 *out1 = _mm256_castsi256_si128(temp1);
2020 *out2 = _mm256_extracti128_si256(temp0, 0x01);
2021 *out3 = _mm256_extracti128_si256(temp1, 0x01);
2022 }
2023
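/*
 * 8-point forward DCT over 16 lanes of 16-bit samples: a four-stage
 * butterfly network followed by a stage-5 writeback in bit-reversed order
 * (0, 4, 2, 6, 1, 5, 3, 7).
 */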
2024 static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output,
2025 int8_t cos_bit) {
2026 const int32_t *cospi = cospi_arr(cos_bit);
2027 const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
2028
2029 __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
2030 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
2031 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
2032 __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
2033 __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
2034 __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
2035 __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
2036 __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
2037 __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
2038
2039 // stage 1
2040 __m256i x1[8];
2041 x1[0] = _mm256_adds_epi16(input[0], input[7]);
2042 x1[7] = _mm256_subs_epi16(input[0], input[7]);
2043 x1[1] = _mm256_adds_epi16(input[1], input[6]);
2044 x1[6] = _mm256_subs_epi16(input[1], input[6]);
2045 x1[2] = _mm256_adds_epi16(input[2], input[5]);
2046 x1[5] = _mm256_subs_epi16(input[2], input[5]);
2047 x1[3] = _mm256_adds_epi16(input[3], input[4]);
2048 x1[4] = _mm256_subs_epi16(input[3], input[4]);
2049
2050 // stage 2
2051 __m256i x2[8];
2052 x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
2053 x2[3] = _mm256_subs_epi16(x1[0], x1[3]);
2054 x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
2055 x2[2] = _mm256_subs_epi16(x1[1], x1[2]);
2056 x2[4] = x1[4];
2057 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding,
2058 cos_bit);
2059 x2[5] = x1[5];
2060 x2[6] = x1[6];
2061 x2[7] = x1[7];
2062
2063 // stage 3
2064 __m256i x3[8];
2065 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding,
2066 cos_bit);
2067 x3[0] = x2[0];
2068 x3[1] = x2[1];
2069 btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding,
2070 cos_bit);
2071 x3[2] = x2[2];
2072 x3[3] = x2[3];
2073 x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
2074 x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
2075 x3[6] = _mm256_subs_epi16(x2[7], x2[6]);
2076 x3[7] = _mm256_adds_epi16(x2[7], x2[6]);
2077
2078 // stage 4
2079 __m256i x4[8];
2080 x4[0] = x3[0];
2081 x4[1] = x3[1];
2082 x4[2] = x3[2];
2083 x4[3] = x3[3];
2084 btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding,
2085 cos_bit);
2086 x4[4] = x3[4];
2087 x4[7] = x3[7];
2088 btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding,
2089 cos_bit);
2090 x4[5] = x3[5];
2091 x4[6] = x3[6];

2092 // stage 5
2093 output[0] = x4[0];
2094 output[1] = x4[4];
2095 output[2] = x4[2];
2096 output[3] = x4[6];
2097 output[4] = x4[1];
2098 output[5] = x4[5];
2099 output[6] = x4[3];
2100 output[7] = x4[7];
2101 }
2102
2103 static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
2104 int8_t cos_bit) {
2105 const int32_t *cospi = cospi_arr(cos_bit);
2106 const __m256i __zero = _mm256_setzero_si256();
2107 const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
2108
2109 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
2110 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
2111 __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
2112 __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
2113 __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
2114 __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
2115 __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
2116 __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
2117 __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
2118 __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
2119 __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
2120 __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
2121 __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
2122
2123 // stage 1
2124 __m256i x1[8];
2125 x1[0] = input[0];
2126 x1[1] = _mm256_subs_epi16(__zero, input[7]);
2127 x1[2] = _mm256_subs_epi16(__zero, input[3]);
2128 x1[3] = input[4];
2129 x1[4] = _mm256_subs_epi16(__zero, input[1]);
2130 x1[5] = input[6];
2131 x1[6] = input[2];
2132 x1[7] = _mm256_subs_epi16(__zero, input[5]);
2133
2134 // stage 2
2135 __m256i x2[8];
2136 x2[0] = x1[0];
2137 x2[1] = x1[1];
2138 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding,
2139 cos_bit);
2140 x2[2] = x1[2];
2141 x2[3] = x1[3];
2142 x2[4] = x1[4];
2143 x2[5] = x1[5];
2144 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding,
2145 cos_bit);
2146 x2[6] = x1[6];
2147 x2[7] = x1[7];
2148
2149 // stage 3
2150 __m256i x3[8];
2151 x3[0] = _mm256_adds_epi16(x2[0], x2[2]);
2152 x3[2] = _mm256_subs_epi16(x2[0], x2[2]);
2153 x3[1] = _mm256_adds_epi16(x2[1], x2[3]);
2154 x3[3] = _mm256_subs_epi16(x2[1], x2[3]);
2155 x3[4] = _mm256_adds_epi16(x2[4], x2[6]);
2156 x3[6] = _mm256_subs_epi16(x2[4], x2[6]);
2157 x3[5] = _mm256_adds_epi16(x2[5], x2[7]);
2158 x3[7] = _mm256_subs_epi16(x2[5], x2[7]);
2159
2160 // stage 4
2161 __m256i x4[8];
2162 x4[0] = x3[0];
2163 x4[1] = x3[1];
2164 x4[2] = x3[2];
2165 x4[3] = x3[3];
2166 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding,
2167 cos_bit);
2168 x4[4] = x3[4];
2169 x4[5] = x3[5];
2170 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding,
2171 cos_bit);
2172 x4[6] = x3[6];
2173 x4[7] = x3[7];
2174
2175 // stage 5
2176 __m256i x5[8];
2177 x5[0] = _mm256_adds_epi16(x4[0], x4[4]);
2178 x5[4] = _mm256_subs_epi16(x4[0], x4[4]);
2179 x5[1] = _mm256_adds_epi16(x4[1], x4[5]);
2180 x5[5] = _mm256_subs_epi16(x4[1], x4[5]);
2181 x5[2] = _mm256_adds_epi16(x4[2], x4[6]);
2182 x5[6] = _mm256_subs_epi16(x4[2], x4[6]);
2183 x5[3] = _mm256_adds_epi16(x4[3], x4[7]);
2184 x5[7] = _mm256_subs_epi16(x4[3], x4[7]);
2185
2186 // stage 6
2187 __m256i x6[8];
2188 btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding,
2189 cos_bit);
2190 x6[0] = x5[0];
2191 x6[1] = x5[1];
2192 btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding,
2193 cos_bit);
2194 x6[2] = x5[2];
2195 x6[3] = x5[3];
2196 btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding,
2197 cos_bit);
2198 x6[4] = x5[4];
2199 x6[5] = x5[5];
2200 btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding,
2201 cos_bit);
2202 x6[6] = x5[6];
2203 x6[7] = x5[7];
2204
2205 // stage 7
2206 output[0] = x6[1];
2207 output[1] = x6[6];
2208 output[2] = x6[3];
2209 output[3] = x6[4];
2210 output[4] = x6[5];
2211 output[5] = x6[2];
2212 output[6] = x6[7];
2213 output[7] = x6[0];
2214 }
2215
2216 static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output,
2217 int8_t cos_bit) {
2218 (void)cos_bit;
2219
2220 output[0] = _mm256_adds_epi16(input[0], input[0]);
2221 output[1] = _mm256_adds_epi16(input[1], input[1]);
2222 output[2] = _mm256_adds_epi16(input[2], input[2]);
2223 output[3] = _mm256_adds_epi16(input[3], input[3]);
2224 output[4] = _mm256_adds_epi16(input[4], input[4]);
2225 output[5] = _mm256_adds_epi16(input[5], input[5]);
2226 output[6] = _mm256_adds_epi16(input[6], input[6]);
2227 output[7] = _mm256_adds_epi16(input[7], input[7]);
2228 }
2229
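/*
 * 16-point forward DCT on 8 lanes of 16-bit samples. Pairs of 128-bit input
 * rows are packed into one 256-bit register so two butterflies run per
 * instruction; each cospi_arr[] entry holds a different twiddle pair in each
 * 128-bit half to match that packing.
 */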
2230 static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output,
2231 int8_t cos_bit) {
2232 const int32_t *cospi = cospi_arr(cos_bit);
2233 const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
2234 const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
2235 __m128i temp0, temp1, temp2, temp3;
2236 __m256i in0, in1;
2237 __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
2238 __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
2239 __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
2240 __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
2241 __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
2242 __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
2243 __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
2244 __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
2245 __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
2246 __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
2247 __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
2248 __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
2249 __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
2250 __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
2251 __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
2252 __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
2253 __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
2254 __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
2255
2256 __m256i cospi_arr[12];
2257
2258 cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32),
2259 cospi_m32_p32, 0x1);
2260 cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
2261 cospi_p32_p32, 0x1);
2262 cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
2263 cospi_p48_p16, 0x1);
2264 cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
2265 cospi_m16_p48, 0x1);
2266 cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48),
2267 cospi_m48_m16, 0x1);
2268 cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16),
2269 cospi_m16_p48, 0x1);
2270 cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08),
2271 cospi_p24_p40, 0x1);
2272 cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56),
2273 cospi_m40_p24, 0x1);
2274 cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04),
2275 cospi_p28_p36, 0x1);
2276 cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60),
2277 cospi_m36_p28, 0x1);
2278 cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20),
2279 cospi_p12_p52, 0x1);
2280 cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44),
2281 cospi_m52_p12, 0x1);
2282
2283 __m256i x[8];
2284 x[0] =
2285 _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1);
2286 x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14],
2287 0x1);
2288 x[2] =
2289 _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1);
2290 x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12],
2291 0x1);
2292 x[4] =
2293 _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1);
2294 x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11],
2295 0x1);
2296 x[6] =
2297 _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1);
2298 x[7] =
2299 _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1);
2300
2301 // stage 1
2302 __m256i x1[8];
2303 x1[0] = _mm256_adds_epi16(x[0], x[1]);
2304 x1[7] = _mm256_subs_epi16(x[0], x[1]);
2305 x1[1] = _mm256_adds_epi16(x[2], x[3]);
2306 x1[6] = _mm256_subs_epi16(x[2], x[3]);
2307 x1[2] = _mm256_adds_epi16(x[4], x[5]);
2308 x1[5] = _mm256_subs_epi16(x[4], x[5]);
2309 x1[3] = _mm256_adds_epi16(x[6], x[7]);
2310 x1[4] = _mm256_subs_epi16(x[6], x[7]);
2311
2312 // stage 2
2313 __m256i x2[8];
2314 x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
2315 x2[7] = _mm256_subs_epi16(x1[0], x1[3]);
2316 x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
2317 x2[6] = _mm256_subs_epi16(x1[1], x1[2]);
2318 x2[2] = x1[4];
2319 x2[3] = x1[7];
2320 btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1,
2321 &temp2, &temp3, &__rounding_256, &cos_bit);
2322 x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1);
2323 x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
2324
2325 // stage 3
2326 __m256i x3[8];
2327 x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e);
2328 x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
2329 x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
2330 x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0);
2331 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]),
2332 _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1);
2333 x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1);
2334 x3[3] = _mm256_adds_epi16(x2[2], x2[4]);
2335 x3[4] = _mm256_subs_epi16(x2[2], x2[4]);
2336 x3[5] = _mm256_adds_epi16(x2[3], x2[5]);
2337 x3[6] = _mm256_subs_epi16(x2[3], x2[5]);
2338
2339 // stage 4
2340 __m256i x4[8];
2341 x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0);
2342 x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21);
2343 btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0],
2344 &output[8], &output[4], &output[12], &__rounding_256, &cos_bit);
2345 x4[2] = _mm256_adds_epi16(x3[2], x3[7]);
2346 x4[3] = _mm256_subs_epi16(x3[2], x3[7]);
2347 x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20);
2348 x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20);
2349 in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31);
2350 in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31);
2351 btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
2352 &temp3, &__rounding_256, &cos_bit);
2353
2354 x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1);
2355 x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
2356
2357 // stage 5
2358 __m256i x5[4];
2359 in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31);
2360 in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20);
2361 btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14],
2362 &output[10], &output[6], &__rounding_256, &cos_bit);
2363 x5[0] = _mm256_adds_epi16(x4[4], x4[6]);
2364 x5[1] = _mm256_subs_epi16(x4[4], x4[6]);
2365 x5[2] = _mm256_adds_epi16(x4[5], x4[7]);
2366 x5[3] = _mm256_subs_epi16(x4[5], x4[7]);
2367
2368 // stage 6
2369 in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20);
2370 in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31);
2371 btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15],
2372 &output[9], &output[7], &__rounding_256, &cos_bit);
2373 in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31);
2374 in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20);
2375 btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5],
2376 &output[11], &output[13], &output[3], &__rounding_256, &cos_bit);
2377 }
2378
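/*
 * 16-point forward ADST on 8 lanes, using the same two-rows-per-register
 * packing as fdct8x16_new_avx2 above, with per-half twiddle pairs and the
 * stage-1 sign flips mapping the ADST onto butterfly form.
 */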
2379 static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
2380 int8_t cos_bit) {
2381 const int32_t *cospi = cospi_arr(cos_bit);
2382 const __m256i __zero = _mm256_setzero_si256();
2383 const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
2384 __m256i in0, in1;
2385 __m128i temp0, temp1, temp2, temp3;
2386
2387 __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
2388 __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
2389 __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
2390 __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
2391 __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
2392 __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
2393 __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
2394 __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
2395 __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
2396 __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
2397 __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
2398 __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
2399 __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
2400 __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
2401 __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
2402 __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
2403 __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
2404 __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
2405 __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
2406 __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
2407 __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
2408 __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
2409 __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
2410 __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
2411 __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
2412 __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
2413 __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
2414
2415 __m256i cospi_arr[20];
2416
2417 cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
2418 cospi_p32_p32, 0x1);
2419 cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
2420 cospi_p32_m32, 0x1);
2421 cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
2422 cospi_p32_p32, 0x1);
2423 cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
2424 cospi_p32_m32, 0x1);
2425 cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
2426 cospi_m48_p16, 0x1);
2427 cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
2428 cospi_p16_p48, 0x1);
2429 cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
2430 cospi_m48_p16, 0x1);
2431 cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
2432 cospi_p16_p48, 0x1);
2433 cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
2434 cospi_p40_p24, 0x1);
2435 cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08),
2436 cospi_p24_m40, 0x1);
2437 cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08),
2438 cospi_m24_p40, 0x1);
2439 cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
2440 cospi_p40_p24, 0x1);
2441 cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62),
2442 cospi_p10_p54, 0x1);
2443 cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02),
2444 cospi_p54_m10, 0x1);
2445 cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46),
2446 cospi_p26_p38, 0x1);
2447 cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18),
2448 cospi_p38_m26, 0x1);
2449 cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30),
2450 cospi_p42_p22, 0x1);
2451 cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34),
2452 cospi_p22_m42, 0x1);
2453 cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14),
2454 cospi_p58_p06, 0x1);
2455 cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50),
2456 cospi_p06_m58, 0x1);
2457
2458 __m256i x[8];
2459 x[0] =
2460 _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1);
2461 x[1] =
2462 _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1);
2463 x[2] =
2464 _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1);
2465 x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14],
2466 0x1);
2467 x[4] =
2468 _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1);
2469 x[5] =
2470 _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1);
2471 x[6] =
2472 _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1);
2473 x[7] =
2474 _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1);
2475
2476 // stage 1
2477 __m256i x1[8];
2478 x1[0] = x[0];
2479 x1[1] = _mm256_subs_epi16(__zero, x[7]);
2480 x1[2] = x[2];
2481 x1[3] = _mm256_subs_epi16(__zero, x[5]);
2482 x1[4] = _mm256_subs_epi16(__zero, x[4]);
2483 x1[5] = x[3];
2484 x1[6] = _mm256_subs_epi16(__zero, x[6]);
2485 x1[7] = x[1];
2486
2487 // stage 2
2488 __m256i x2[8];
2489 x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0);
2490 x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0);
2491 x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0);
2492 x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0);
2493 in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0);
2494 in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0);
2495 btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2,
2496 &temp3, &__rounding_256, &cos_bit);
2497 x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2498 x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2499 in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21);
2500 in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21);
2501 btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2,
2502 &temp3, &__rounding_256, &cos_bit);
2503 x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2504 x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2505
2506 // stage 3
2507 __m256i x3[8];
2508 x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
2509 x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
2510 x3[2] = _mm256_adds_epi16(x2[3], x2[2]);
2511 x3[3] = _mm256_subs_epi16(x2[3], x2[2]);
2512 x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
2513 x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
2514 x3[6] = _mm256_adds_epi16(x2[7], x2[6]);
2515 x3[7] = _mm256_subs_epi16(x2[7], x2[6]);
2516
2517 // stage 4
2518 __m256i x4[8];
2519 x4[0] = x3[0];
2520 x4[1] = x3[1];
2521 x4[4] = x3[4];
2522 x4[5] = x3[5];
2523 in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20);
2524 in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31);
2525 btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
2526 &temp3, &__rounding_256, &cos_bit);
2527 x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2528 x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2529 in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20);
2530 in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31);
2531 btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2,
2532 &temp3, &__rounding_256, &cos_bit);
2533 x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2534 x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2535
2536 // stage 5
2537 __m256i x5[8];
2538 x5[0] = _mm256_adds_epi16(x4[0], x4[2]);
2539 x5[1] = _mm256_subs_epi16(x4[0], x4[2]);
2540 x5[2] = _mm256_adds_epi16(x4[1], x4[3]);
2541 x5[3] = _mm256_subs_epi16(x4[1], x4[3]);
2542 x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
2543 x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
2544 x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
2545 x5[7] = _mm256_subs_epi16(x4[5], x4[7]);
2546
2547 // stage 6
2548 __m256i x6[8];
2549 x6[0] = x5[0];
2550 x6[1] = x5[2];
2551 x6[2] = x5[1];
2552 x6[3] = x5[3];
2553 in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20);
2554 in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31);
2555 btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2,
2556 &temp3, &__rounding_256, &cos_bit);
2557 x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2558 x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2559 in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20);
2560 in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31);
2561 btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1,
2562 &temp2, &temp3, &__rounding_256, &cos_bit);
2563 x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2564 x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2565
2566 // stage 7
2567 __m256i x7[8];
2568 x7[0] = _mm256_adds_epi16(x6[0], x6[4]);
2569 x7[1] = _mm256_subs_epi16(x6[0], x6[4]);
2570 x7[2] = _mm256_adds_epi16(x6[1], x6[5]);
2571 x7[3] = _mm256_subs_epi16(x6[1], x6[5]);
2572 x7[4] = _mm256_adds_epi16(x6[2], x6[6]);
2573 x7[5] = _mm256_subs_epi16(x6[2], x6[6]);
2574 x7[6] = _mm256_adds_epi16(x6[3], x6[7]);
2575 x7[7] = _mm256_subs_epi16(x6[3], x6[7]);
2576
2577 // stage 8
2578 in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20);
2579 in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31);
2580 btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15],
2581 &output[0], &output[13], &output[2], &__rounding_256, &cos_bit);
2582 in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20);
2583 in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31);
2584 btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11],
2585 &output[4], &output[9], &output[6], &__rounding_256, &cos_bit);
2586 in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20);
2587 in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31);
2588 btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7],
2589 &output[8], &output[5], &output[10], &__rounding_256, &cos_bit);
2590 in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20);
2591 in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31);
2592 btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3],
2593 &output[12], &output[1], &output[14], &__rounding_256, &cos_bit);
2594 }
2595
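// 16-point identity "transform": each coefficient is scaled by
// 2 * sqrt(2), using the Q12 fixed-point constant NewSqrt2
// (5793 ~= sqrt(2) * 2^12) with rounding done in scale_round_avx2().
// Two 128-bit rows are packed per 256-bit register.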
static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output,
                                          int8_t cos_bit) {
  (void)cos_bit;
  const __m256i one = _mm256_set1_epi16(1);
  __m256i temp;
  for (int i = 0; i < 16; i += 2) {
    temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]),
                                   input[i + 1], 0x1);
    const __m256i a_lo = _mm256_unpacklo_epi16(temp, one);
    const __m256i a_hi = _mm256_unpackhi_epi16(temp, one);
    const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
    const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
    temp = _mm256_packs_epi32(b_lo, b_hi);
    output[i] = _mm256_castsi256_si128(temp);
    output[i + 1] = _mm256_extractf128_si256(temp, 0x1);
  }
}

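// Per-TX_TYPE dispatch tables for the 8x16 and 16x8 block sizes. In a
// TX_TYPE name the first half selects the vertical (column) 1-D
// transform and the second half the horizontal (row) transform, so
// e.g. ADST_DCT pairs an ADST column pass with a DCT row pass; the V_*
// and H_* types use the identity kernel in the other direction.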
static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = {
  fdct8x8_new_avx2,       // DCT_DCT
  fdct8x8_new_avx2,       // ADST_DCT
  fadst8x8_new_avx2,      // DCT_ADST
  fadst8x8_new_avx2,      // ADST_ADST
  fdct8x8_new_avx2,       // FLIPADST_DCT
  fadst8x8_new_avx2,      // DCT_FLIPADST
  fadst8x8_new_avx2,      // FLIPADST_FLIPADST
  fadst8x8_new_avx2,      // ADST_FLIPADST
  fadst8x8_new_avx2,      // FLIPADST_ADST
  fidentity8x8_new_avx2,  // IDTX
  fidentity8x8_new_avx2,  // V_DCT
  fdct8x8_new_avx2,       // H_DCT
  fidentity8x8_new_avx2,  // V_ADST
  fadst8x8_new_avx2,      // H_ADST
  fidentity8x8_new_avx2,  // V_FLIPADST
  fadst8x8_new_avx2       // H_FLIPADST
};

static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_new_avx2,       // DCT_DCT
  fadst8x16_new_avx2,      // ADST_DCT
  fdct8x16_new_avx2,       // DCT_ADST
  fadst8x16_new_avx2,      // ADST_ADST
  fadst8x16_new_avx2,      // FLIPADST_DCT
  fdct8x16_new_avx2,       // DCT_FLIPADST
  fadst8x16_new_avx2,      // FLIPADST_FLIPADST
  fadst8x16_new_avx2,      // ADST_FLIPADST
  fadst8x16_new_avx2,      // FLIPADST_ADST
  fidentity8x16_new_avx2,  // IDTX
  fdct8x16_new_avx2,       // V_DCT
  fidentity8x16_new_avx2,  // H_DCT
  fadst8x16_new_avx2,      // V_ADST
  fidentity8x16_new_avx2,  // H_ADST
  fadst8x16_new_avx2,      // V_FLIPADST
  fidentity8x16_new_avx2   // H_FLIPADST
};

static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = {
  fdct8x8_new_avx2,       // DCT_DCT
  fadst8x8_new_avx2,      // ADST_DCT
  fdct8x8_new_avx2,       // DCT_ADST
  fadst8x8_new_avx2,      // ADST_ADST
  fadst8x8_new_avx2,      // FLIPADST_DCT
  fdct8x8_new_avx2,       // DCT_FLIPADST
  fadst8x8_new_avx2,      // FLIPADST_FLIPADST
  fadst8x8_new_avx2,      // ADST_FLIPADST
  fadst8x8_new_avx2,      // FLIPADST_ADST
  fidentity8x8_new_avx2,  // IDTX
  fdct8x8_new_avx2,       // V_DCT
  fidentity8x8_new_avx2,  // H_DCT
  fadst8x8_new_avx2,      // V_ADST
  fidentity8x8_new_avx2,  // H_ADST
  fadst8x8_new_avx2,      // V_FLIPADST
  fidentity8x8_new_avx2,  // H_FLIPADST
};

static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = {
  fdct8x16_new_avx2,       // DCT_DCT
  fdct8x16_new_avx2,       // ADST_DCT
  fadst8x16_new_avx2,      // DCT_ADST
  fadst8x16_new_avx2,      // ADST_ADST
  fdct8x16_new_avx2,       // FLIPADST_DCT
  fadst8x16_new_avx2,      // DCT_FLIPADST
  fadst8x16_new_avx2,      // FLIPADST_FLIPADST
  fadst8x16_new_avx2,      // ADST_FLIPADST
  fadst8x16_new_avx2,      // FLIPADST_ADST
  fidentity8x16_new_avx2,  // IDTX
  fidentity8x16_new_avx2,  // V_DCT
  fdct8x16_new_avx2,       // H_DCT
  fidentity8x16_new_avx2,  // V_ADST
  fadst8x16_new_avx2,      // H_ADST
  fidentity8x16_new_avx2,  // V_FLIPADST
  fadst8x16_new_avx2       // H_FLIPADST
};

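// 2-D 8x16 forward transform: load (flipped vertically for FLIPADST
// column types), pre-shift, 16-point column pass on __m128i rows,
// transpose, then the 8-point row pass on packed __m256i registers.
// The rect store helper also applies the sqrt(2) scaling required for
// rectangular blocks.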
static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  __m128i buf0[16], buf1[16];
  __m256i buf2[8];
  const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
  const int txw_idx = get_txw_idx(TX_8X16);
  const int txh_idx = get_txh_idx(TX_8X16);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 8;
  const int height = 16;
  const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
  const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  if (ud_flip) {
    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit(input, stride, buf0, height);
  }
  round_shift_16bit(buf0, height, shift[0]);
  col_txfm(buf0, buf0, cos_bit_col);
  round_shift_16bit(buf0, height, shift[1]);
  transpose_16bit_8x8(buf0, buf1);
  transpose_16bit_8x8(buf0 + 8, buf1 + 8);

  __m128i *bufl, *bufu;
  if (lr_flip) {
    bufl = buf0;
    bufu = buf0 + 8;
    flip_buf_sse2(buf1 + width * 0, bufl, width);
    flip_buf_sse2(buf1 + width * 1, bufu, width);
  } else {
    bufl = buf1 + width * 0;
    bufu = buf1 + width * 1;
  }
  pack_reg(bufl, bufu, buf2);
  row_txfm(buf2, buf2, cos_bit_row);
  round_shift_16bit_w16_avx2(buf2, width, shift[2]);
  transpose_16bit_16x8_avx2(buf2, buf2);
  store_rect_buffer_16bit_to_32bit_w8_avx2(buf2, output, width, 8);
}

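// 2-D 16x8 forward transform: the transpose of the 8x16 path above,
// running the 8-point pass vertically on packed __m256i registers and
// the 16-point pass horizontally on the unpacked __m128i halves.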
static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  __m128i buf0[16], buf1[16];
  __m256i buf2[8];
  const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
  const int txw_idx = get_txw_idx(TX_16X8);
  const int txh_idx = get_txh_idx(TX_16X8);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 16;
  const int height = 8;
  const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type];
  const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type];
  __m128i *buf;
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  if (ud_flip) {
    load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height);
    load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height);
  } else {
    load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height);
    load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height);
  }
  pack_reg(buf0, &buf0[8], buf2);
  round_shift_16bit_w16_avx2(buf2, height, shift[0]);
  col_txfm(buf2, buf2, cos_bit_col);
  round_shift_16bit_w16_avx2(buf2, height, shift[1]);
  transpose_16bit_16x8_avx2(buf2, buf2);
  extract_reg(buf2, buf1);

  if (lr_flip) {
    buf = buf0;
    flip_buf_sse2(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row);
  round_shift_16bit(buf, width, shift[2]);
  transpose_16bit_8x8(buf, buf);
  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
  transpose_16bit_8x8(buf + 8, buf + 8);
  store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
}

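// Dispatch table indexed by TX_SIZE. Block sizes without an AVX2
// kernel in this file fall back to the SSE2 implementations.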
static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
  av1_lowbd_fwd_txfm2d_4x4_sse2,   // 4x4 transform
  av1_lowbd_fwd_txfm2d_8x8_sse2,   // 8x8 transform
  lowbd_fwd_txfm2d_16x16_avx2,     // 16x16 transform
  lowbd_fwd_txfm2d_32x32_avx2,     // 32x32 transform
  lowbd_fwd_txfm2d_64x64_avx2,     // 64x64 transform
  av1_lowbd_fwd_txfm2d_4x8_sse2,   // 4x8 transform
  av1_lowbd_fwd_txfm2d_8x4_sse2,   // 8x4 transform
  lowbd_fwd_txfm2d_8x16_avx2,      // 8x16 transform
  lowbd_fwd_txfm2d_16x8_avx2,      // 16x8 transform
  lowbd_fwd_txfm2d_16x32_avx2,     // 16x32 transform
  lowbd_fwd_txfm2d_32x16_avx2,     // 32x16 transform
  lowbd_fwd_txfm2d_32x64_avx2,     // 32x64 transform
  lowbd_fwd_txfm2d_64x32_avx2,     // 64x32 transform
  av1_lowbd_fwd_txfm2d_4x16_sse2,  // 4x16 transform
  av1_lowbd_fwd_txfm2d_16x4_sse2,  // 16x4 transform
  av1_lowbd_fwd_txfm2d_8x32_sse2,  // 8x32 transform
  av1_lowbd_fwd_txfm2d_32x8_sse2,  // 32x8 transform
  lowbd_fwd_txfm2d_16x64_avx2,     // 16x64 transform
  lowbd_fwd_txfm2d_64x16_avx2,     // 64x16 transform
};

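// Entry point selected via the RTCD dispatch. Lossless 4x4 blocks must
// use the Walsh-Hadamard transform, so they (and any NULL table entry)
// are routed to av1_lowbd_fwd_txfm_c(). A minimal caller sketch, on the
// assumption that the remaining TxfmParam fields are filled in by the
// encoder as usual:
//
//   TxfmParam param = { 0 };
//   param.tx_type = DCT_DCT;
//   param.tx_size = TX_16X8;
//   param.bd = 8;
//   param.lossless = 0;
//   av1_lowbd_fwd_txfm_avx2(src_diff, coeff, diff_stride, &param);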
void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff,
                             int diff_stride, TxfmParam *txfm_param) {
  FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
  if ((fwd_txfm2d_func == NULL) ||
      (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) {
    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
  } else {
    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
                    txfm_param->bd);
  }
}