1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #include <assert.h>
12 #include <immintrin.h>
13
14 #include "config/aom_config.h"
15 #include "config/av1_rtcd.h"
16
17 #include "av1/common/av1_inv_txfm1d_cfg.h"
18 #include "av1/common/idct.h"
19 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
20 #include "av1/common/x86/highbd_txfm_utility_sse4.h"
21 #include "aom_dsp/x86/txfm_common_avx2.h"
22
23 // Note:
24 // A total of 32x4 registers are used to represent the coefficients of a 32x32 block.
25 // For high bit depth, each coefficient is 4 bytes.
26 // Each __m256i register holds 8 coefficients.
27 // So each "row" needs 4 registers, for a total of 32 rows.
28 // Register layout:
29 // v0, v1, v2, v3,
30 // v4, v5, v6, v7,
31 // ... ...
32 // v124, v125, v126, v127
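//
// Illustrative indexing (a sketch under the layout above, not used directly
// by the code): coefficient (row r, column c) of the 32x32 block lives in
// register v[4 * r + c / 8], 32-bit lane c % 8.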
33
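// Clamp packed 16-bit pixel values to the valid range [0, (1 << bd) - 1].
// The upper bound is applied with a compare/blend sequence and the lower
// bound by masking out negative lanes.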
34 static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
35 const __m256i zero = _mm256_setzero_si256();
36 const __m256i one = _mm256_set1_epi16(1);
37 const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
38 __m256i clamped, mask;
39
40 mask = _mm256_cmpgt_epi16(u, max);
41 clamped = _mm256_andnot_si256(mask, u);
42 mask = _mm256_and_si256(mask, max);
43 clamped = _mm256_or_si256(mask, clamped);
44 mask = _mm256_cmpgt_epi16(clamped, zero);
45 clamped = _mm256_and_si256(clamped, mask);
46
47 return clamped;
48 }
49
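// Reconstruct 16 pixels of one row: widen the 16-bit prediction to 32 bits,
// add the two 8-lane residual vectors, pack back to 16 bits with unsigned
// saturation, and clamp to the bit depth.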
50 static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
51 __m256i res0, __m256i res1,
52 const int bd) {
53 __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
54 __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
55
56 x0 = _mm256_add_epi32(res0, x0);
57 x1 = _mm256_add_epi32(res1, x1);
58 x0 = _mm256_packus_epi32(x0, x1);
59 x0 = _mm256_permute4x64_epi64(x0, 0xd8);
60 x0 = highbd_clamp_epi16_avx2(x0, bd);
61 return x0;
62 }
63
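// Add a 16-wide column of residuals to the prediction in `output` and store
// the reconstructed pixels. Row j of the residual is split across in[j]
// (left 8 lanes) and in[j + height] (right 8 lanes); `flipud` walks the
// residual rows bottom-up.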
64 static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
65 int stride, int flipud,
66 int height, const int bd) {
67 int j = flipud ? (height - 1) : 0;
68 const int step = flipud ? -1 : 1;
69 for (int i = 0; i < height; ++i, j += step) {
70 __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
71 __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
72
73 _mm256_storeu_si256((__m256i *)(output + i * stride), u);
74 }
75 }
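// Reconstruct 8 pixels: the prediction is already widened to 32 bits, so just
// add the residual, pack to 16 bits with unsigned saturation and clamp to bd.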
76 static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res,
77 const int bd) {
78 __m256i x0 = pred;
79 x0 = _mm256_add_epi32(res, x0);
80 x0 = _mm256_packus_epi32(x0, x0);
81 x0 = _mm256_permute4x64_epi64(x0, 0xd8);
82 x0 = highbd_clamp_epi16_avx2(x0, bd);
83 return x0;
84 }
85
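// Add an 8-wide column of residuals to the prediction and store the 8
// reconstructed pixels per row; `flipud` walks the residual rows bottom-up.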
86 static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output,
87 int stride, int flipud,
88 int height, const int bd) {
89 int j = flipud ? (height - 1) : 0;
90 __m128i temp;
91 const int step = flipud ? -1 : 1;
92 for (int i = 0; i < height; ++i, j += step) {
93 temp = _mm_loadu_si128((__m128i const *)(output + i * stride));
94 __m256i v = _mm256_cvtepi16_epi32(temp);
95 __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd);
96 __m128i u1 = _mm256_castsi256_si128(u);
97 _mm_storeu_si128((__m128i *)(output + i * stride), u1);
98 }
99 }
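// Round-shift a pair of values where the second operand is negated:
// out0 = clamp((in0 + offset) >> shift), out1 = clamp((offset - in1) >> shift),
// with offset = (1 << shift) >> 1.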
100 static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
101 __m256i *out1, const __m256i *clamp_lo,
102 const __m256i *clamp_hi, int shift) {
103 __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
104 __m256i a0 = _mm256_add_epi32(offset, in0);
105 __m256i a1 = _mm256_sub_epi32(offset, in1);
106
107 a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
108 a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
109
110 a0 = _mm256_max_epi32(a0, *clamp_lo);
111 a0 = _mm256_min_epi32(a0, *clamp_hi);
112 a1 = _mm256_max_epi32(a1, *clamp_lo);
113 a1 = _mm256_min_epi32(a1, *clamp_hi);
114
115 *out0 = a0;
116 *out1 = a1;
117 }
118
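// Transpose an 8x8 tile of 32-bit coefficients held in eight __m256i
// registers, using 32/64-bit unpacks followed by 128-bit lane permutes.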
119 static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
120 __m256i u0, u1, u2, u3, u4, u5, u6, u7;
121 __m256i x0, x1;
122
123 u0 = _mm256_unpacklo_epi32(in[0], in[1]);
124 u1 = _mm256_unpackhi_epi32(in[0], in[1]);
125
126 u2 = _mm256_unpacklo_epi32(in[2], in[3]);
127 u3 = _mm256_unpackhi_epi32(in[2], in[3]);
128
129 u4 = _mm256_unpacklo_epi32(in[4], in[5]);
130 u5 = _mm256_unpackhi_epi32(in[4], in[5]);
131
132 u6 = _mm256_unpacklo_epi32(in[6], in[7]);
133 u7 = _mm256_unpackhi_epi32(in[6], in[7]);
134
135 x0 = _mm256_unpacklo_epi64(u0, u2);
136 x1 = _mm256_unpacklo_epi64(u4, u6);
137 out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
138 out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
139
140 x0 = _mm256_unpackhi_epi64(u0, u2);
141 x1 = _mm256_unpackhi_epi64(u4, u6);
142 out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
143 out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
144
145 x0 = _mm256_unpacklo_epi64(u1, u3);
146 x1 = _mm256_unpacklo_epi64(u5, u7);
147 out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
148 out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
149
150 x0 = _mm256_unpackhi_epi64(u1, u3);
151 x1 = _mm256_unpackhi_epi64(u5, u7);
152 out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
153 out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
154 }
155
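// Same as transpose_8x8_avx2, but reads the input rows in reverse order so
// the result is the transpose of the vertically flipped tile.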
156 static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) {
157 __m256i u0, u1, u2, u3, u4, u5, u6, u7;
158 __m256i x0, x1;
159
160 u0 = _mm256_unpacklo_epi32(in[7], in[6]);
161 u1 = _mm256_unpackhi_epi32(in[7], in[6]);
162
163 u2 = _mm256_unpacklo_epi32(in[5], in[4]);
164 u3 = _mm256_unpackhi_epi32(in[5], in[4]);
165
166 u4 = _mm256_unpacklo_epi32(in[3], in[2]);
167 u5 = _mm256_unpackhi_epi32(in[3], in[2]);
168
169 u6 = _mm256_unpacklo_epi32(in[1], in[0]);
170 u7 = _mm256_unpackhi_epi32(in[1], in[0]);
171
172 x0 = _mm256_unpacklo_epi64(u0, u2);
173 x1 = _mm256_unpacklo_epi64(u4, u6);
174 out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
175 out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
176
177 x0 = _mm256_unpackhi_epi64(u0, u2);
178 x1 = _mm256_unpackhi_epi64(u4, u6);
179 out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
180 out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
181
182 x0 = _mm256_unpacklo_epi64(u1, u3);
183 x1 = _mm256_unpacklo_epi64(u5, u7);
184 out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
185 out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
186
187 x0 = _mm256_unpackhi_epi64(u1, u3);
188 x1 = _mm256_unpackhi_epi64(u5, u7);
189 out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
190 out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
191 }
192
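// Load `size` groups of 8 contiguous 32-bit coefficients, each group
// `input_stride` coefficients apart. For example (illustrative only), the
// first 8-column strip of a row-major 32x32 block can be loaded with
//   load_buffer_32x32(coeff, in, 32, 32);
// so that in[i] holds coefficients 0..7 of row i.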
193 static void load_buffer_32x32(const int32_t *coeff, __m256i *in,
194                               int input_stride, int size) {
195 int i;
196 for (i = 0; i < size; ++i) {
197     in[i] = _mm256_loadu_si256((const __m256i *)(coeff + i * input_stride));
198 }
199 }
200
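// One-input half butterfly: returns (w0 * n0 + rounding) >> bit per lane.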
201 static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
202 const __m256i *rounding, int bit) {
203 __m256i x;
204 x = _mm256_mullo_epi32(*w0, *n0);
205 x = _mm256_add_epi32(x, *rounding);
206 x = _mm256_srai_epi32(x, bit);
207 return x;
208 }
209
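// Half butterfly: returns (w0 * n0 + w1 * n1 + rounding) >> bit per lane,
// i.e. one output of a 2x2 rotation by the given cosine weights.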
210 static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
211 const __m256i *w1, const __m256i *n1,
212 const __m256i *rounding, int bit) {
213 __m256i x, y;
214
215 x = _mm256_mullo_epi32(*w0, *n0);
216 y = _mm256_mullo_epi32(*w1, *n1);
217 x = _mm256_add_epi32(x, y);
218 x = _mm256_add_epi32(x, *rounding);
219 x = _mm256_srai_epi32(x, bit);
220 return x;
221 }
222
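// Butterfly add/sub with clamping: out0 = clamp(in0 + in1),
// out1 = clamp(in0 - in1).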
223 static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
224 __m256i *out1, const __m256i *clamp_lo,
225 const __m256i *clamp_hi) {
226 __m256i a0 = _mm256_add_epi32(in0, in1);
227 __m256i a1 = _mm256_sub_epi32(in0, in1);
228
229 a0 = _mm256_max_epi32(a0, *clamp_lo);
230 a0 = _mm256_min_epi32(a0, *clamp_hi);
231 a1 = _mm256_max_epi32(a1, *clamp_lo);
232 a1 = _mm256_min_epi32(a1, *clamp_hi);
233
234 *out0 = a0;
235 *out1 = a1;
236 }
237
238 static void addsub_no_clamp_avx2(const __m256i in0, const __m256i in1,
239 __m256i *out0, __m256i *out1) {
240 __m256i a0 = _mm256_add_epi32(in0, in1);
241 __m256i a1 = _mm256_sub_epi32(in0, in1);
242
243 *out0 = a0;
244 *out1 = a1;
245 }
246
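// Butterfly add/sub followed by a rounding right shift and clamp, used when
// producing the shifted outputs of the final stage.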
247 static void addsub_shift_avx2(const __m256i in0, const __m256i in1,
248 __m256i *out0, __m256i *out1,
249 const __m256i *clamp_lo, const __m256i *clamp_hi,
250 int shift) {
251 __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
252 __m256i in0_w_offset = _mm256_add_epi32(in0, offset);
253 __m256i a0 = _mm256_add_epi32(in0_w_offset, in1);
254 __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1);
255
256 a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
257 a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
258
259 a0 = _mm256_max_epi32(a0, *clamp_lo);
260 a0 = _mm256_min_epi32(a0, *clamp_hi);
261 a1 = _mm256_max_epi32(a1, *clamp_lo);
262 a1 = _mm256_min_epi32(a1, *clamp_hi);
263
264 *out0 = a0;
265 *out1 = a1;
266 }
267
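// The idct32_stage*_avx2 helpers below each implement one stage of the
// 32-point inverse DCT butterfly network; they are shared by the
// idct32_low8/low16 fast paths and take the cosine constants by pointer.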
268 static INLINE void idct32_stage4_avx2(
269 __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
270 const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
271 const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
272 const __m256i *rounding, int bit) {
273 __m256i temp1, temp2;
274 temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
275 bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
276 bf1[17] = temp1;
277
278 temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
279 bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
280 bf1[18] = temp2;
281
282 temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
283 bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
284 bf1[21] = temp1;
285
286 temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
287 bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
288 bf1[22] = temp2;
289 }
290
291 static INLINE void idct32_stage5_avx2(
292 __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
293 const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
294 const __m256i *clamp_hi, const __m256i *rounding, int bit) {
295 __m256i temp1, temp2;
296 temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
297 bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
298 bf1[9] = temp1;
299
300 temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
301 bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
302 bf1[10] = temp2;
303
304 addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
305 addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
306 addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
307 addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
308 addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
309 addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
310 addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
311 addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
312 }
313
314 static INLINE void idct32_stage6_avx2(
315 __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
316 const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
317 const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
318 const __m256i *rounding, int bit) {
319 __m256i temp1, temp2;
320 temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
321 bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
322 bf1[5] = temp1;
323
324 addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
325 addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
326 addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
327 addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
328
329 temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
330 bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
331 bf1[18] = temp1;
332 temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
333 bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
334 bf1[19] = temp2;
335 temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
336 bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
337 bf1[20] = temp1;
338 temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
339 bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
340 bf1[21] = temp2;
341 }
342
343 static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
344 const __m256i *cospi32,
345 const __m256i *clamp_lo,
346 const __m256i *clamp_hi,
347 const __m256i *rounding, int bit) {
348 __m256i temp1, temp2;
349 addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
350 addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
351 addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
352 addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
353
354 temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
355 bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
356 bf1[10] = temp1;
357 temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
358 bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
359 bf1[11] = temp2;
360
361 addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
362 addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
363 addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
364 addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
365 addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
366 addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
367 addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
368 addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
369 }
370
371 static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
372 const __m256i *cospi32,
373 const __m256i *clamp_lo,
374 const __m256i *clamp_hi,
375 const __m256i *rounding, int bit) {
376 __m256i temp1, temp2;
377 addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
378 addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
379 addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
380 addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
381 addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
382 addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
383 addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
384 addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
385
386 temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
387 bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
388 bf1[20] = temp1;
389 temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
390 bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
391 bf1[21] = temp2;
392 temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
393 bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
394 bf1[22] = temp1;
395 temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
396 bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
397 bf1[23] = temp2;
398 }
399
400 static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
401 const int do_cols, const int bd,
402 const int out_shift,
403 const int log_range) {
404 if (do_cols) {
405 addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31);
406 addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30);
407 addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29);
408 addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28);
409 addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27);
410 addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26);
411 addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25);
412 addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24);
413 addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23);
414 addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22);
415 addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21);
416 addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20);
417 addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19);
418 addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18);
419 addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17);
420 addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16);
421 } else {
422 const int log_range_out = AOMMAX(16, bd + 6);
423 const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
424 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
425 const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
426 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
427
428 addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
429 &clamp_hi_out, out_shift);
430 addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
431 &clamp_hi_out, out_shift);
432 addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
433 &clamp_hi_out, out_shift);
434 addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
435 &clamp_hi_out, out_shift);
436 addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
437 &clamp_hi_out, out_shift);
438 addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
439 &clamp_hi_out, out_shift);
440 addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
441 &clamp_hi_out, out_shift);
442 addsub_shift_avx2(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
443 &clamp_hi_out, out_shift);
444 addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
445 &clamp_hi_out, out_shift);
446 addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
447 &clamp_hi_out, out_shift);
448 addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
449 &clamp_hi_out, out_shift);
450 addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
451 &clamp_hi_out, out_shift);
452 addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
453 &clamp_hi_out, out_shift);
454 addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
455 &clamp_hi_out, out_shift);
456 addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
457 &clamp_hi_out, out_shift);
458 addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
459 &clamp_hi_out, out_shift);
460 }
461 }
462
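// DC-only path: with only in[0] non-zero the 32-point IDCT reduces to a
// single multiply by cospi[32]; the rounded result is replicated to all 32
// outputs.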
463 static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
464 int bd, int out_shift) {
465 const int32_t *cospi = cospi_arr(bit);
466 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
467 const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
468 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
469 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
470 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
471 __m256i x;
472 // stage 0
473 // stage 1
474 // stage 2
475 // stage 3
476 // stage 4
477 // stage 5
478 x = _mm256_mullo_epi32(in[0], cospi32);
479 x = _mm256_add_epi32(x, rounding);
480 x = _mm256_srai_epi32(x, bit);
481
482 // stage 6
483 // stage 7
484 // stage 8
485 // stage 9
486 if (do_cols) {
487 x = _mm256_max_epi32(x, clamp_lo);
488 x = _mm256_min_epi32(x, clamp_hi);
489 } else {
490 const int log_range_out = AOMMAX(16, bd + 6);
491 const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
492 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
493 const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
494 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
495 __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
496 x = _mm256_add_epi32(offset, x);
497 x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
498 x = _mm256_max_epi32(x, clamp_lo_out);
499 x = _mm256_min_epi32(x, clamp_hi_out);
500 }
501
502 out[0] = x;
503 out[1] = x;
504 out[2] = x;
505 out[3] = x;
506 out[4] = x;
507 out[5] = x;
508 out[6] = x;
509 out[7] = x;
510 out[8] = x;
511 out[9] = x;
512 out[10] = x;
513 out[11] = x;
514 out[12] = x;
515 out[13] = x;
516 out[14] = x;
517 out[15] = x;
518 out[16] = x;
519 out[17] = x;
520 out[18] = x;
521 out[19] = x;
522 out[20] = x;
523 out[21] = x;
524 out[22] = x;
525 out[23] = x;
526 out[24] = x;
527 out[25] = x;
528 out[26] = x;
529 out[27] = x;
530 out[28] = x;
531 out[29] = x;
532 out[30] = x;
533 out[31] = x;
534 }
535
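// Fast path for when only in[0]..in[7] carry non-zero coefficients; the
// early stages use the single-input butterfly form.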
536 static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
537 int bd, int out_shift) {
538 const int32_t *cospi = cospi_arr(bit);
539 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
540 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
541 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
542 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
543 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
544 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
545 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
546 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
547 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
548 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
549 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
550 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
551 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
552 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
553 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
554 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
555 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
556 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
557 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
558 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
559 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
560 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
561 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
562 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
563 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
564 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
565 const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
566 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
567 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
568 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
569 __m256i bf1[32];
570
571 {
572 // stage 0
573 // stage 1
574 bf1[0] = in[0];
575 bf1[4] = in[4];
576 bf1[8] = in[2];
577 bf1[12] = in[6];
578 bf1[16] = in[1];
579 bf1[20] = in[5];
580 bf1[24] = in[3];
581 bf1[28] = in[7];
582
583 // stage 2
584 bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
585 bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
586 bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
587 bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
588 bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
589 bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
590 bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
591 bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
592
593 // stage 3
594 bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
595 bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
596
597 bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
598 bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
599 bf1[17] = bf1[16];
600 bf1[18] = bf1[19];
601 bf1[21] = bf1[20];
602 bf1[22] = bf1[23];
603 bf1[25] = bf1[24];
604 bf1[26] = bf1[27];
605 bf1[29] = bf1[28];
606 bf1[30] = bf1[31];
607
608 // stage 4
609 bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
610 bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
611
612 bf1[9] = bf1[8];
613 bf1[10] = bf1[11];
614 bf1[13] = bf1[12];
615 bf1[14] = bf1[15];
616
617 idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
618 &cospi24, &cospi40, &cospim24, &rounding, bit);
619
620 // stage 5
621 bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
622 bf1[1] = bf1[0];
623 bf1[5] = bf1[4];
624 bf1[6] = bf1[7];
625
626 idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
627 &clamp_hi, &rounding, bit);
628
629 // stage 6
630 bf1[3] = bf1[0];
631 bf1[2] = bf1[1];
632
633 idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
634 &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
635
636 // stage 7
637 idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
638 &rounding, bit);
639
640 // stage 8
641 idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
642 &rounding, bit);
643
644 // stage 9
645 idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
646 }
647 }
648
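// Fast path for when only in[0]..in[15] carry non-zero coefficients.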
649 static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
650 int bd, int out_shift) {
651 const int32_t *cospi = cospi_arr(bit);
652 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
653 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
654 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
655 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
656 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
657 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
658 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
659 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
660 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
661 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
662 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
663 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
664 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
665 const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
666 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
667 const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
668 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
669 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
670 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
671 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
672 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
673 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
674 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
675 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
676 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
677 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
678 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
679 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
680 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
681 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
682 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
683 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
684 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
685 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
686 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
687 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
688 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
689 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
690 const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
691 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
692 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
693 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
694 __m256i bf1[32];
695
696 {
697 // stage 0
698 // stage 1
699 bf1[0] = in[0];
700 bf1[2] = in[8];
701 bf1[4] = in[4];
702 bf1[6] = in[12];
703 bf1[8] = in[2];
704 bf1[10] = in[10];
705 bf1[12] = in[6];
706 bf1[14] = in[14];
707 bf1[16] = in[1];
708 bf1[18] = in[9];
709 bf1[20] = in[5];
710 bf1[22] = in[13];
711 bf1[24] = in[3];
712 bf1[26] = in[11];
713 bf1[28] = in[7];
714 bf1[30] = in[15];
715
716 // stage 2
717 bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
718 bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
719 bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
720 bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
721 bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
722 bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
723 bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
724 bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
725 bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
726 bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
727 bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
728 bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
729 bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
730 bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
731 bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
732 bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
733
734 // stage 3
735 bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
736 bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
737 bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
738 bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
739 bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
740 bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
741 bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
742 bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
743
744 addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
745 addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
746 addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
747 addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
748 addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
749 addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
750 addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
751 addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
752
753 // stage 4
754 bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
755 bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
756 bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
757 bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
758
759 addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
760 addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
761 addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
762 addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
763
764 idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
765 &cospi24, &cospi40, &cospim24, &rounding, bit);
766
767 // stage 5
768 bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
769 bf1[1] = bf1[0];
770 bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
771 bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
772
773 addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
774 addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
775
776 idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
777 &clamp_hi, &rounding, bit);
778
779 // stage 6
780 addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
781 addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
782
783 idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
784 &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
785
786 // stage 7
787 idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
788 &rounding, bit);
789
790 // stage 8
791 idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
792 &rounding, bit);
793
794 // stage 9
795 idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
796 }
797 }
798
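// Full 32-point inverse DCT over one column of eight 32-bit lanes, following
// the nine-stage butterfly network; bf0/bf1 ping-pong between stages.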
799 static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
800 int out_shift) {
801 const int32_t *cospi = cospi_arr(bit);
802 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
803 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
804 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
805 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
806 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
807 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
808 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
809 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
810 const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
811 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
812 const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
813 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
814 const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
815 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
816 const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
817 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
818 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
819 const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
820 const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
821 const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
822 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
823 const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
824 const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
825 const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
826 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
827 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
828 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
829 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
830 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
831 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
832 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
833 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
834 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
835 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
836 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
837 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
838 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
839 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
840 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
841 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
842 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
843 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
844 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
845 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
846 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
847 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
848 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
849 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
850 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
851 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
852 const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
853 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
854 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
855 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
856 __m256i bf1[32], bf0[32];
857
858 {
859 // stage 0
860 // stage 1
861 bf1[0] = in[0];
862 bf1[1] = in[16];
863 bf1[2] = in[8];
864 bf1[3] = in[24];
865 bf1[4] = in[4];
866 bf1[5] = in[20];
867 bf1[6] = in[12];
868 bf1[7] = in[28];
869 bf1[8] = in[2];
870 bf1[9] = in[18];
871 bf1[10] = in[10];
872 bf1[11] = in[26];
873 bf1[12] = in[6];
874 bf1[13] = in[22];
875 bf1[14] = in[14];
876 bf1[15] = in[30];
877 bf1[16] = in[1];
878 bf1[17] = in[17];
879 bf1[18] = in[9];
880 bf1[19] = in[25];
881 bf1[20] = in[5];
882 bf1[21] = in[21];
883 bf1[22] = in[13];
884 bf1[23] = in[29];
885 bf1[24] = in[3];
886 bf1[25] = in[19];
887 bf1[26] = in[11];
888 bf1[27] = in[27];
889 bf1[28] = in[7];
890 bf1[29] = in[23];
891 bf1[30] = in[15];
892 bf1[31] = in[31];
893
894 // stage 2
895 bf0[0] = bf1[0];
896 bf0[1] = bf1[1];
897 bf0[2] = bf1[2];
898 bf0[3] = bf1[3];
899 bf0[4] = bf1[4];
900 bf0[5] = bf1[5];
901 bf0[6] = bf1[6];
902 bf0[7] = bf1[7];
903 bf0[8] = bf1[8];
904 bf0[9] = bf1[9];
905 bf0[10] = bf1[10];
906 bf0[11] = bf1[11];
907 bf0[12] = bf1[12];
908 bf0[13] = bf1[13];
909 bf0[14] = bf1[14];
910 bf0[15] = bf1[15];
911 bf0[16] =
912 half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
913 bf0[17] =
914 half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
915 bf0[18] =
916 half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
917 bf0[19] =
918 half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
919 bf0[20] =
920 half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
921 bf0[21] =
922 half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
923 bf0[22] =
924 half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
925 bf0[23] =
926 half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
927 bf0[24] =
928 half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
929 bf0[25] =
930 half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
931 bf0[26] =
932 half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
933 bf0[27] =
934 half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
935 bf0[28] =
936 half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
937 bf0[29] =
938 half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
939 bf0[30] =
940 half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
941 bf0[31] =
942 half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
943
944 // stage 3
945 bf1[0] = bf0[0];
946 bf1[1] = bf0[1];
947 bf1[2] = bf0[2];
948 bf1[3] = bf0[3];
949 bf1[4] = bf0[4];
950 bf1[5] = bf0[5];
951 bf1[6] = bf0[6];
952 bf1[7] = bf0[7];
953 bf1[8] =
954 half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
955 bf1[9] =
956 half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
957 bf1[10] =
958 half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
959 bf1[11] =
960 half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
961 bf1[12] =
962 half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
963 bf1[13] =
964 half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
965 bf1[14] =
966 half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
967 bf1[15] =
968 half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
969
970 addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
971 addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
972 addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
973 addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
974 addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
975 addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
976 addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
977 addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
978
979 // stage 4
980 bf0[0] = bf1[0];
981 bf0[1] = bf1[1];
982 bf0[2] = bf1[2];
983 bf0[3] = bf1[3];
984 bf0[4] =
985 half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
986 bf0[5] =
987 half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
988 bf0[6] =
989 half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
990 bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
991
992 addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
993 addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
994 addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
995 addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
996
997 bf0[16] = bf1[16];
998 bf0[17] =
999 half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
1000 bf0[18] =
1001 half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
1002 bf0[19] = bf1[19];
1003 bf0[20] = bf1[20];
1004 bf0[21] =
1005 half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
1006 bf0[22] =
1007 half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
1008 bf0[23] = bf1[23];
1009 bf0[24] = bf1[24];
1010 bf0[25] =
1011 half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
1012 bf0[26] =
1013 half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
1014 bf0[27] = bf1[27];
1015 bf0[28] = bf1[28];
1016 bf0[29] =
1017 half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
1018 bf0[30] =
1019 half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
1020 bf0[31] = bf1[31];
1021
1022 // stage 5
1023 bf1[0] =
1024 half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
1025 bf1[1] =
1026 half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
1027 bf1[2] =
1028 half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
1029 bf1[3] =
1030 half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
1031 addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
1032 addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
1033 bf1[8] = bf0[8];
1034 bf1[9] =
1035 half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
1036 bf1[10] =
1037 half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
1038 bf1[11] = bf0[11];
1039 bf1[12] = bf0[12];
1040 bf1[13] =
1041 half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
1042 bf1[14] =
1043 half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
1044 bf1[15] = bf0[15];
1045 addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
1046 addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
1047 addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
1048 addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
1049 addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
1050 addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
1051 addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
1052 addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
1053
1054 // stage 6
1055 addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
1056 addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
1057 bf0[4] = bf1[4];
1058 bf0[5] =
1059 half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
1060 bf0[6] =
1061 half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
1062 bf0[7] = bf1[7];
1063 addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
1064 addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
1065 addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
1066 addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
1067 bf0[16] = bf1[16];
1068 bf0[17] = bf1[17];
1069 bf0[18] =
1070 half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
1071 bf0[19] =
1072 half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
1073 bf0[20] =
1074 half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
1075 bf0[21] =
1076 half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
1077 bf0[22] = bf1[22];
1078 bf0[23] = bf1[23];
1079 bf0[24] = bf1[24];
1080 bf0[25] = bf1[25];
1081 bf0[26] =
1082 half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
1083 bf0[27] =
1084 half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
1085 bf0[28] =
1086 half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
1087 bf0[29] =
1088 half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
1089 bf0[30] = bf1[30];
1090 bf0[31] = bf1[31];
1091
1092 // stage 7
1093 addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
1094 addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
1095 addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
1096 addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
1097 bf1[8] = bf0[8];
1098 bf1[9] = bf0[9];
1099 bf1[10] =
1100 half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
1101 bf1[11] =
1102 half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
1103 bf1[12] =
1104 half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
1105 bf1[13] =
1106 half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
1107 bf1[14] = bf0[14];
1108 bf1[15] = bf0[15];
1109 addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
1110 addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
1111 addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
1112 addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
1113 addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
1114 addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
1115 addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
1116 addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
1117
1118 // stage 8
1119 addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
1120 addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
1121 addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
1122 addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
1123 addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
1124 addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
1125 addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
1126 addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
1127 bf0[16] = bf1[16];
1128 bf0[17] = bf1[17];
1129 bf0[18] = bf1[18];
1130 bf0[19] = bf1[19];
1131 bf0[20] =
1132 half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
1133 bf0[21] =
1134 half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
1135 bf0[22] =
1136 half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
1137 bf0[23] =
1138 half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
1139 bf0[24] =
1140 half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
1141 bf0[25] =
1142 half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
1143 bf0[26] =
1144 half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
1145 bf0[27] =
1146 half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
1147 bf0[28] = bf1[28];
1148 bf0[29] = bf1[29];
1149 bf0[30] = bf1[30];
1150 bf0[31] = bf1[31];
1151
1152 // stage 9
1153 if (do_cols) {
1154 addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31);
1155 addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30);
1156 addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29);
1157 addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28);
1158 addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27);
1159 addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26);
1160 addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25);
1161 addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24);
1162 addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23);
1163 addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22);
1164 addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21);
1165 addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20);
1166 addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19);
1167 addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18);
1168 addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17);
1169 addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16);
1170 } else {
1171 const int log_range_out = AOMMAX(16, bd + 6);
1172 const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
1173 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
1174 const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
1175 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
1176
1177 addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
1178 &clamp_hi_out, out_shift);
1179 addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
1180 &clamp_hi_out, out_shift);
1181 addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
1182 &clamp_hi_out, out_shift);
1183 addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
1184 &clamp_hi_out, out_shift);
1185 addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
1186 &clamp_hi_out, out_shift);
1187 addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
1188 &clamp_hi_out, out_shift);
1189 addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
1190 &clamp_hi_out, out_shift);
1191 addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
1192 &clamp_hi_out, out_shift);
1193 addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
1194 &clamp_hi_out, out_shift);
1195 addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
1196 &clamp_hi_out, out_shift);
1197 addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
1198 &clamp_hi_out, out_shift);
1199 addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
1200 &clamp_hi_out, out_shift);
1201 addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
1202 &clamp_hi_out, out_shift);
1203 addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
1204 &clamp_hi_out, out_shift);
1205 addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
1206 &clamp_hi_out, out_shift);
1207 addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
1208 &clamp_hi_out, out_shift);
1209 }
1210 }
1211 }
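// DC-only path of the 16-point inverse DCT: a single multiply by cospi[32],
// replicated to all 16 outputs.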
1212 static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1213 int bd, int out_shift) {
1214 const int32_t *cospi = cospi_arr(bit);
1215 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1216 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1217 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1218 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1219 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1220
1221 {
1222 // stage 0
1223 // stage 1
1224 // stage 2
1225 // stage 3
1226 // stage 4
1227 in[0] = _mm256_mullo_epi32(in[0], cospi32);
1228 in[0] = _mm256_add_epi32(in[0], rnding);
1229 in[0] = _mm256_srai_epi32(in[0], bit);
1230
1231 // stage 5
1232 // stage 6
1233 // stage 7
1234 if (do_cols) {
1235 in[0] = _mm256_max_epi32(in[0], clamp_lo);
1236 in[0] = _mm256_min_epi32(in[0], clamp_hi);
1237 } else {
1238 const int log_range_out = AOMMAX(16, bd + 6);
1239 const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
1240 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
1241 const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
1242 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
1243 __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
1244 in[0] = _mm256_add_epi32(in[0], offset);
1245 in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
1246 in[0] = _mm256_max_epi32(in[0], clamp_lo_out);
1247 in[0] = _mm256_min_epi32(in[0], clamp_hi_out);
1248 }
1249
1250 out[0] = in[0];
1251 out[1] = in[0];
1252 out[2] = in[0];
1253 out[3] = in[0];
1254 out[4] = in[0];
1255 out[5] = in[0];
1256 out[6] = in[0];
1257 out[7] = in[0];
1258 out[8] = in[0];
1259 out[9] = in[0];
1260 out[10] = in[0];
1261 out[11] = in[0];
1262 out[12] = in[0];
1263 out[13] = in[0];
1264 out[14] = in[0];
1265 out[15] = in[0];
1266 }
1267 }
1268
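// Fast path of the 16-point inverse DCT for when only in[0]..in[7] are
// non-zero.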
1269 static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1270 int bd, int out_shift) {
1271 const int32_t *cospi = cospi_arr(bit);
1272 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1273 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1274 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1275 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1276 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1277 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1278 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1279 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1280 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1281 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1282 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1283 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1284 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1285 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1286 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1287 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
1288 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
1289 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1290 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1291 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1292 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1293 __m256i u[16], x, y;
1294
1295 {
1296 // stage 0
1297 // stage 1
1298 u[0] = in[0];
1299 u[2] = in[4];
1300 u[4] = in[2];
1301 u[6] = in[6];
1302 u[8] = in[1];
1303 u[10] = in[5];
1304 u[12] = in[3];
1305 u[14] = in[7];
1306
1307 // stage 2
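// half_btf_0_avx2(w, x, rnd, bit) is the one-input butterfly used here,
// producing (w * x + rnd) >> bit; half_btf_avx2 is the two-input form.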
1308 u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
1309 u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
1310
1311 u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
1312 u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
1313
1314 u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
1315 u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
1316
1317 u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
1318 u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
1319
1320 // stage 3
1321 u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
1322 u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
1323 u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
1324 u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
1325
1326 addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1327 addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1328 addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1329 addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1330
1331 // stage 4
1332 x = _mm256_mullo_epi32(u[0], cospi32);
1333 u[0] = _mm256_add_epi32(x, rnding);
1334 u[0] = _mm256_srai_epi32(u[0], bit);
1335 u[1] = u[0];
1336
1337 u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
1338 u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
1339
1340 addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
1341 addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
1342
1343 x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
1344 u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
1345 u[9] = x;
1346 y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
1347 u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
1348 u[10] = y;
1349
1350 // stage 5
1351 addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1352 addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1353
1354 x = _mm256_mullo_epi32(u[5], cospi32);
1355 y = _mm256_mullo_epi32(u[6], cospi32);
1356 u[5] = _mm256_sub_epi32(y, x);
1357 u[5] = _mm256_add_epi32(u[5], rnding);
1358 u[5] = _mm256_srai_epi32(u[5], bit);
1359
1360 u[6] = _mm256_add_epi32(y, x);
1361 u[6] = _mm256_add_epi32(u[6], rnding);
1362 u[6] = _mm256_srai_epi32(u[6], bit);
1363
1364 addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1365 addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1366 addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1367 addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1368
1369 // stage 6
1370 addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
1371 addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
1372 addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
1373 addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
1374
1375 x = _mm256_mullo_epi32(u[10], cospi32);
1376 y = _mm256_mullo_epi32(u[13], cospi32);
1377 u[10] = _mm256_sub_epi32(y, x);
1378 u[10] = _mm256_add_epi32(u[10], rnding);
1379 u[10] = _mm256_srai_epi32(u[10], bit);
1380
1381 u[13] = _mm256_add_epi32(x, y);
1382 u[13] = _mm256_add_epi32(u[13], rnding);
1383 u[13] = _mm256_srai_epi32(u[13], bit);
1384
1385 x = _mm256_mullo_epi32(u[11], cospi32);
1386 y = _mm256_mullo_epi32(u[12], cospi32);
1387 u[11] = _mm256_sub_epi32(y, x);
1388 u[11] = _mm256_add_epi32(u[11], rnding);
1389 u[11] = _mm256_srai_epi32(u[11], bit);
1390
1391 u[12] = _mm256_add_epi32(x, y);
1392 u[12] = _mm256_add_epi32(u[12], rnding);
1393 u[12] = _mm256_srai_epi32(u[12], bit);
1394 // stage 7
1395 if (do_cols) {
1396 addsub_no_clamp_avx2(u[0], u[15], out + 0, out + 15);
1397 addsub_no_clamp_avx2(u[1], u[14], out + 1, out + 14);
1398 addsub_no_clamp_avx2(u[2], u[13], out + 2, out + 13);
1399 addsub_no_clamp_avx2(u[3], u[12], out + 3, out + 12);
1400 addsub_no_clamp_avx2(u[4], u[11], out + 4, out + 11);
1401 addsub_no_clamp_avx2(u[5], u[10], out + 5, out + 10);
1402 addsub_no_clamp_avx2(u[6], u[9], out + 6, out + 9);
1403 addsub_no_clamp_avx2(u[7], u[8], out + 7, out + 8);
1404 } else {
1405 const int log_range_out = AOMMAX(16, bd + 6);
1406 const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
1407 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
1408 const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
1409 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
1410
1411 addsub_shift_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
1412 &clamp_hi_out, out_shift);
1413 addsub_shift_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
1414 &clamp_hi_out, out_shift);
1415 addsub_shift_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
1416 &clamp_hi_out, out_shift);
1417 addsub_shift_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
1418 &clamp_hi_out, out_shift);
1419 addsub_shift_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
1420 &clamp_hi_out, out_shift);
1421 addsub_shift_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
1422 &clamp_hi_out, out_shift);
1423 addsub_shift_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
1424 &clamp_hi_out, out_shift);
1425 addsub_shift_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
1426 &clamp_hi_out, out_shift);
1427 }
1428 }
1429 }
1430
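// Full 16-point inverse DCT butterfly; each __m256i carries 8 32-bit
// coefficients, so one call processes 8 lanes in parallel.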
1431 static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
1432 int out_shift) {
1433 const int32_t *cospi = cospi_arr(bit);
1434 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1435 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
1436 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1437 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
1438 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1439 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1440 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
1441 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1442 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
1443 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
1444 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
1445 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1446 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1447 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
1448 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1449 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1450 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1451 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1452 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1453 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1454 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1455 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1456 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1457 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1458 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1459 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1460 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1461 __m256i u[16], v[16], x, y;
1462
1463 {
1464 // stage 0
1465 // stage 1
1466 u[0] = in[0];
1467 u[1] = in[8];
1468 u[2] = in[4];
1469 u[3] = in[12];
1470 u[4] = in[2];
1471 u[5] = in[10];
1472 u[6] = in[6];
1473 u[7] = in[14];
1474 u[8] = in[1];
1475 u[9] = in[9];
1476 u[10] = in[5];
1477 u[11] = in[13];
1478 u[12] = in[3];
1479 u[13] = in[11];
1480 u[14] = in[7];
1481 u[15] = in[15];
1482
1483 // stage 2
1484 v[0] = u[0];
1485 v[1] = u[1];
1486 v[2] = u[2];
1487 v[3] = u[3];
1488 v[4] = u[4];
1489 v[5] = u[5];
1490 v[6] = u[6];
1491 v[7] = u[7];
1492
1493 v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
1494 v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
1495 v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
1496 v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
1497 v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
1498 v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
1499 v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
1500 v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
1501
1502 // stage 3
1503 u[0] = v[0];
1504 u[1] = v[1];
1505 u[2] = v[2];
1506 u[3] = v[3];
1507 u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
1508 u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
1509 u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
1510 u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
1511 addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1512 addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1513 addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1514 addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1515
1516 // stage 4
1517 x = _mm256_mullo_epi32(u[0], cospi32);
1518 y = _mm256_mullo_epi32(u[1], cospi32);
1519 v[0] = _mm256_add_epi32(x, y);
1520 v[0] = _mm256_add_epi32(v[0], rnding);
1521 v[0] = _mm256_srai_epi32(v[0], bit);
1522
1523 v[1] = _mm256_sub_epi32(x, y);
1524 v[1] = _mm256_add_epi32(v[1], rnding);
1525 v[1] = _mm256_srai_epi32(v[1], bit);
1526
1527 v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
1528 v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
1529 addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
1530 addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
1531 v[8] = u[8];
1532 v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
1533 v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
1534 v[11] = u[11];
1535 v[12] = u[12];
1536 v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
1537 v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
1538 v[15] = u[15];
1539
1540 // stage 5
1541 addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1542 addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1543 u[4] = v[4];
1544
1545 x = _mm256_mullo_epi32(v[5], cospi32);
1546 y = _mm256_mullo_epi32(v[6], cospi32);
1547 u[5] = _mm256_sub_epi32(y, x);
1548 u[5] = _mm256_add_epi32(u[5], rnding);
1549 u[5] = _mm256_srai_epi32(u[5], bit);
1550
1551 u[6] = _mm256_add_epi32(y, x);
1552 u[6] = _mm256_add_epi32(u[6], rnding);
1553 u[6] = _mm256_srai_epi32(u[6], bit);
1554
1555 u[7] = v[7];
1556 addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1557 addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1558 addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1559 addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1560
1561 // stage 6
1562 addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
1563 addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
1564 addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
1565 addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
1566 v[8] = u[8];
1567 v[9] = u[9];
1568
1569 x = _mm256_mullo_epi32(u[10], cospi32);
1570 y = _mm256_mullo_epi32(u[13], cospi32);
1571 v[10] = _mm256_sub_epi32(y, x);
1572 v[10] = _mm256_add_epi32(v[10], rnding);
1573 v[10] = _mm256_srai_epi32(v[10], bit);
1574
1575 v[13] = _mm256_add_epi32(x, y);
1576 v[13] = _mm256_add_epi32(v[13], rnding);
1577 v[13] = _mm256_srai_epi32(v[13], bit);
1578
1579 x = _mm256_mullo_epi32(u[11], cospi32);
1580 y = _mm256_mullo_epi32(u[12], cospi32);
1581 v[11] = _mm256_sub_epi32(y, x);
1582 v[11] = _mm256_add_epi32(v[11], rnding);
1583 v[11] = _mm256_srai_epi32(v[11], bit);
1584
1585 v[12] = _mm256_add_epi32(x, y);
1586 v[12] = _mm256_add_epi32(v[12], rnding);
1587 v[12] = _mm256_srai_epi32(v[12], bit);
1588
1589 v[14] = u[14];
1590 v[15] = u[15];
1591
1592 // stage 7
1593 if (do_cols) {
1594 addsub_no_clamp_avx2(v[0], v[15], out + 0, out + 15);
1595 addsub_no_clamp_avx2(v[1], v[14], out + 1, out + 14);
1596 addsub_no_clamp_avx2(v[2], v[13], out + 2, out + 13);
1597 addsub_no_clamp_avx2(v[3], v[12], out + 3, out + 12);
1598 addsub_no_clamp_avx2(v[4], v[11], out + 4, out + 11);
1599 addsub_no_clamp_avx2(v[5], v[10], out + 5, out + 10);
1600 addsub_no_clamp_avx2(v[6], v[9], out + 6, out + 9);
1601 addsub_no_clamp_avx2(v[7], v[8], out + 7, out + 8);
1602 } else {
1603 const int log_range_out = AOMMAX(16, bd + 6);
1604 const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
1605 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
1606 const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
1607 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
1608
1609 addsub_shift_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
1610 &clamp_hi_out, out_shift);
1611 addsub_shift_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
1612 &clamp_hi_out, out_shift);
1613 addsub_shift_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
1614 &clamp_hi_out, out_shift);
1615 addsub_shift_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
1616 &clamp_hi_out, out_shift);
1617 addsub_shift_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
1618 &clamp_hi_out, out_shift);
1619 addsub_shift_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
1620 &clamp_hi_out, out_shift);
1621 addsub_shift_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
1622 &clamp_hi_out, out_shift);
1623 addsub_shift_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
1624 &clamp_hi_out, out_shift);
1625 }
1626 }
1627 }
1628
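// Fast path of the 16-point inverse ADST for inputs where only in[0] is
// nonzero.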
1629 static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1630 int bd, int out_shift) {
1631 const int32_t *cospi = cospi_arr(bit);
1632 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
1633 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
1634 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1635 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1636 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1637 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1638 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1639 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1640 const __m256i zero = _mm256_setzero_si256();
1641 __m256i v[16], x, y, temp1, temp2;
1642
1643 // Only in[0] is nonzero; all 16 outputs are derived from it below.
1644 {
1645 // stage 0
1646 // stage 1
1647 // stage 2
1648 x = _mm256_mullo_epi32(in[0], cospi62);
1649 v[0] = _mm256_add_epi32(x, rnding);
1650 v[0] = _mm256_srai_epi32(v[0], bit);
1651
1652 x = _mm256_mullo_epi32(in[0], cospi2);
1653 v[1] = _mm256_sub_epi32(zero, x);
1654 v[1] = _mm256_add_epi32(v[1], rnding);
1655 v[1] = _mm256_srai_epi32(v[1], bit);
1656
1657 // stage 3
1658 v[8] = v[0];
1659 v[9] = v[1];
1660
1661 // stage 4
1662 temp1 = _mm256_mullo_epi32(v[8], cospi8);
1663 x = _mm256_mullo_epi32(v[9], cospi56);
1664 temp1 = _mm256_add_epi32(temp1, x);
1665 temp1 = _mm256_add_epi32(temp1, rnding);
1666 temp1 = _mm256_srai_epi32(temp1, bit);
1667
1668 temp2 = _mm256_mullo_epi32(v[8], cospi56);
1669 x = _mm256_mullo_epi32(v[9], cospi8);
1670 temp2 = _mm256_sub_epi32(temp2, x);
1671 temp2 = _mm256_add_epi32(temp2, rnding);
1672 temp2 = _mm256_srai_epi32(temp2, bit);
1673 v[8] = temp1;
1674 v[9] = temp2;
1675
1676 // stage 5
1677 v[4] = v[0];
1678 v[5] = v[1];
1679 v[12] = v[8];
1680 v[13] = v[9];
1681
1682 // stage 6
1683 temp1 = _mm256_mullo_epi32(v[4], cospi16);
1684 x = _mm256_mullo_epi32(v[5], cospi48);
1685 temp1 = _mm256_add_epi32(temp1, x);
1686 temp1 = _mm256_add_epi32(temp1, rnding);
1687 temp1 = _mm256_srai_epi32(temp1, bit);
1688
1689 temp2 = _mm256_mullo_epi32(v[4], cospi48);
1690 x = _mm256_mullo_epi32(v[5], cospi16);
1691 temp2 = _mm256_sub_epi32(temp2, x);
1692 temp2 = _mm256_add_epi32(temp2, rnding);
1693 temp2 = _mm256_srai_epi32(temp2, bit);
1694 v[4] = temp1;
1695 v[5] = temp2;
1696
1697 temp1 = _mm256_mullo_epi32(v[12], cospi16);
1698 x = _mm256_mullo_epi32(v[13], cospi48);
1699 temp1 = _mm256_add_epi32(temp1, x);
1700 temp1 = _mm256_add_epi32(temp1, rnding);
1701 temp1 = _mm256_srai_epi32(temp1, bit);
1702
1703 temp2 = _mm256_mullo_epi32(v[12], cospi48);
1704 x = _mm256_mullo_epi32(v[13], cospi16);
1705 temp2 = _mm256_sub_epi32(temp2, x);
1706 temp2 = _mm256_add_epi32(temp2, rnding);
1707 temp2 = _mm256_srai_epi32(temp2, bit);
1708 v[12] = temp1;
1709 v[13] = temp2;
1710
1711 // stage 7
1712 v[2] = v[0];
1713 v[3] = v[1];
1714 v[6] = v[4];
1715 v[7] = v[5];
1716 v[10] = v[8];
1717 v[11] = v[9];
1718 v[14] = v[12];
1719 v[15] = v[13];
1720
1721 // stage 8
1722 y = _mm256_mullo_epi32(v[2], cospi32);
1723 x = _mm256_mullo_epi32(v[3], cospi32);
1724 v[2] = _mm256_add_epi32(y, x);
1725 v[2] = _mm256_add_epi32(v[2], rnding);
1726 v[2] = _mm256_srai_epi32(v[2], bit);
1727
1728 v[3] = _mm256_sub_epi32(y, x);
1729 v[3] = _mm256_add_epi32(v[3], rnding);
1730 v[3] = _mm256_srai_epi32(v[3], bit);
1731
1732 y = _mm256_mullo_epi32(v[6], cospi32);
1733 x = _mm256_mullo_epi32(v[7], cospi32);
1734 v[6] = _mm256_add_epi32(y, x);
1735 v[6] = _mm256_add_epi32(v[6], rnding);
1736 v[6] = _mm256_srai_epi32(v[6], bit);
1737
1738 v[7] = _mm256_sub_epi32(y, x);
1739 v[7] = _mm256_add_epi32(v[7], rnding);
1740 v[7] = _mm256_srai_epi32(v[7], bit);
1741
1742 y = _mm256_mullo_epi32(v[10], cospi32);
1743 x = _mm256_mullo_epi32(v[11], cospi32);
1744 v[10] = _mm256_add_epi32(y, x);
1745 v[10] = _mm256_add_epi32(v[10], rnding);
1746 v[10] = _mm256_srai_epi32(v[10], bit);
1747
1748 v[11] = _mm256_sub_epi32(y, x);
1749 v[11] = _mm256_add_epi32(v[11], rnding);
1750 v[11] = _mm256_srai_epi32(v[11], bit);
1751
1752 y = _mm256_mullo_epi32(v[14], cospi32);
1753 x = _mm256_mullo_epi32(v[15], cospi32);
1754 v[14] = _mm256_add_epi32(y, x);
1755 v[14] = _mm256_add_epi32(v[14], rnding);
1756 v[14] = _mm256_srai_epi32(v[14], bit);
1757
1758 v[15] = _mm256_sub_epi32(y, x);
1759 v[15] = _mm256_add_epi32(v[15], rnding);
1760 v[15] = _mm256_srai_epi32(v[15], bit);
1761
1762 // stage 9
1763 if (do_cols) {
1764 out[0] = v[0];
1765 out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
1766 out[2] = v[12];
1767 out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
1768 out[4] = v[6];
1769 out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
1770 out[6] = v[10];
1771 out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
1772 out[8] = v[3];
1773 out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
1774 out[10] = v[15];
1775 out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
1776 out[12] = v[5];
1777 out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
1778 out[14] = v[9];
1779 out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
1780 } else {
1781 const int log_range_out = AOMMAX(16, bd + 6);
1782 const __m256i clamp_lo_out =
1783 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1784 const __m256i clamp_hi_out =
1785 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1786
1787 neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1788 out_shift);
1789 neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
1790 &clamp_hi_out, out_shift);
1791 neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
1792 &clamp_hi_out, out_shift);
1793 neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
1794 &clamp_hi_out, out_shift);
1795 neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
1796 &clamp_hi_out, out_shift);
1797 neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
1798 &clamp_hi_out, out_shift);
1799 neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
1800 &clamp_hi_out, out_shift);
1801 neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
1802 &clamp_hi_out, out_shift);
1803 }
1804 }
1805 }
1806
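// 16-point inverse ADST for inputs where only the first 8 coefficients can be
// nonzero.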
1807 static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1808 int bd, int out_shift) {
1809 const int32_t *cospi = cospi_arr(bit);
1810 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
1811 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
1812 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
1813 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
1814 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
1815 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
1816 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
1817 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
1818 const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
1819 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
1820 const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
1821 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
1822 const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
1823 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
1824 const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
1825 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
1826 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1827 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1828 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1829 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1830 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
1831 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
1832 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1833 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1834 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1835 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1836 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1837 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1838 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1839 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1840 __m256i u[16], x, y;
1841
1842 {
1843 // stage 0
1844 // stage 1
1845 // stage 2
1846 __m256i zero = _mm256_setzero_si256();
1847 x = _mm256_mullo_epi32(in[0], cospi62);
1848 u[0] = _mm256_add_epi32(x, rnding);
1849 u[0] = _mm256_srai_epi32(u[0], bit);
1850
1851 x = _mm256_mullo_epi32(in[0], cospi2);
1852 u[1] = _mm256_sub_epi32(zero, x);
1853 u[1] = _mm256_add_epi32(u[1], rnding);
1854 u[1] = _mm256_srai_epi32(u[1], bit);
1855
1856 x = _mm256_mullo_epi32(in[2], cospi54);
1857 u[2] = _mm256_add_epi32(x, rnding);
1858 u[2] = _mm256_srai_epi32(u[2], bit);
1859
1860 x = _mm256_mullo_epi32(in[2], cospi10);
1861 u[3] = _mm256_sub_epi32(zero, x);
1862 u[3] = _mm256_add_epi32(u[3], rnding);
1863 u[3] = _mm256_srai_epi32(u[3], bit);
1864
1865 x = _mm256_mullo_epi32(in[4], cospi46);
1866 u[4] = _mm256_add_epi32(x, rnding);
1867 u[4] = _mm256_srai_epi32(u[4], bit);
1868
1869 x = _mm256_mullo_epi32(in[4], cospi18);
1870 u[5] = _mm256_sub_epi32(zero, x);
1871 u[5] = _mm256_add_epi32(u[5], rnding);
1872 u[5] = _mm256_srai_epi32(u[5], bit);
1873
1874 x = _mm256_mullo_epi32(in[6], cospi38);
1875 u[6] = _mm256_add_epi32(x, rnding);
1876 u[6] = _mm256_srai_epi32(u[6], bit);
1877
1878 x = _mm256_mullo_epi32(in[6], cospi26);
1879 u[7] = _mm256_sub_epi32(zero, x);
1880 u[7] = _mm256_add_epi32(u[7], rnding);
1881 u[7] = _mm256_srai_epi32(u[7], bit);
1882
1883 u[8] = _mm256_mullo_epi32(in[7], cospi34);
1884 u[8] = _mm256_add_epi32(u[8], rnding);
1885 u[8] = _mm256_srai_epi32(u[8], bit);
1886
1887 u[9] = _mm256_mullo_epi32(in[7], cospi30);
1888 u[9] = _mm256_add_epi32(u[9], rnding);
1889 u[9] = _mm256_srai_epi32(u[9], bit);
1890
1891 u[10] = _mm256_mullo_epi32(in[5], cospi42);
1892 u[10] = _mm256_add_epi32(u[10], rnding);
1893 u[10] = _mm256_srai_epi32(u[10], bit);
1894
1895 u[11] = _mm256_mullo_epi32(in[5], cospi22);
1896 u[11] = _mm256_add_epi32(u[11], rnding);
1897 u[11] = _mm256_srai_epi32(u[11], bit);
1898
1899 u[12] = _mm256_mullo_epi32(in[3], cospi50);
1900 u[12] = _mm256_add_epi32(u[12], rnding);
1901 u[12] = _mm256_srai_epi32(u[12], bit);
1902
1903 u[13] = _mm256_mullo_epi32(in[3], cospi14);
1904 u[13] = _mm256_add_epi32(u[13], rnding);
1905 u[13] = _mm256_srai_epi32(u[13], bit);
1906
1907 u[14] = _mm256_mullo_epi32(in[1], cospi58);
1908 u[14] = _mm256_add_epi32(u[14], rnding);
1909 u[14] = _mm256_srai_epi32(u[14], bit);
1910
1911 u[15] = _mm256_mullo_epi32(in[1], cospi6);
1912 u[15] = _mm256_add_epi32(u[15], rnding);
1913 u[15] = _mm256_srai_epi32(u[15], bit);
1914
1915 // stage 3
1916 addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
1917 addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
1918 addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
1919 addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
1920 addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
1921 addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
1922 addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
1923 addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
1924
1925 // stage 4
1926 y = _mm256_mullo_epi32(u[8], cospi56);
1927 x = _mm256_mullo_epi32(u[9], cospi56);
1928 u[8] = _mm256_mullo_epi32(u[8], cospi8);
1929 u[8] = _mm256_add_epi32(u[8], x);
1930 u[8] = _mm256_add_epi32(u[8], rnding);
1931 u[8] = _mm256_srai_epi32(u[8], bit);
1932
1933 x = _mm256_mullo_epi32(u[9], cospi8);
1934 u[9] = _mm256_sub_epi32(y, x);
1935 u[9] = _mm256_add_epi32(u[9], rnding);
1936 u[9] = _mm256_srai_epi32(u[9], bit);
1937
1938 x = _mm256_mullo_epi32(u[11], cospi24);
1939 y = _mm256_mullo_epi32(u[10], cospi24);
1940 u[10] = _mm256_mullo_epi32(u[10], cospi40);
1941 u[10] = _mm256_add_epi32(u[10], x);
1942 u[10] = _mm256_add_epi32(u[10], rnding);
1943 u[10] = _mm256_srai_epi32(u[10], bit);
1944
1945 x = _mm256_mullo_epi32(u[11], cospi40);
1946 u[11] = _mm256_sub_epi32(y, x);
1947 u[11] = _mm256_add_epi32(u[11], rnding);
1948 u[11] = _mm256_srai_epi32(u[11], bit);
1949
1950 x = _mm256_mullo_epi32(u[13], cospi8);
1951 y = _mm256_mullo_epi32(u[12], cospi8);
1952 u[12] = _mm256_mullo_epi32(u[12], cospim56);
1953 u[12] = _mm256_add_epi32(u[12], x);
1954 u[12] = _mm256_add_epi32(u[12], rnding);
1955 u[12] = _mm256_srai_epi32(u[12], bit);
1956
1957 x = _mm256_mullo_epi32(u[13], cospim56);
1958 u[13] = _mm256_sub_epi32(y, x);
1959 u[13] = _mm256_add_epi32(u[13], rnding);
1960 u[13] = _mm256_srai_epi32(u[13], bit);
1961
1962 x = _mm256_mullo_epi32(u[15], cospi40);
1963 y = _mm256_mullo_epi32(u[14], cospi40);
1964 u[14] = _mm256_mullo_epi32(u[14], cospim24);
1965 u[14] = _mm256_add_epi32(u[14], x);
1966 u[14] = _mm256_add_epi32(u[14], rnding);
1967 u[14] = _mm256_srai_epi32(u[14], bit);
1968
1969 x = _mm256_mullo_epi32(u[15], cospim24);
1970 u[15] = _mm256_sub_epi32(y, x);
1971 u[15] = _mm256_add_epi32(u[15], rnding);
1972 u[15] = _mm256_srai_epi32(u[15], bit);
1973
1974 // stage 5
1975 addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
1976 addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
1977 addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
1978 addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
1979 addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
1980 addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
1981 addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
1982 addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
1983
1984 // stage 6
1985 x = _mm256_mullo_epi32(u[5], cospi48);
1986 y = _mm256_mullo_epi32(u[4], cospi48);
1987 u[4] = _mm256_mullo_epi32(u[4], cospi16);
1988 u[4] = _mm256_add_epi32(u[4], x);
1989 u[4] = _mm256_add_epi32(u[4], rnding);
1990 u[4] = _mm256_srai_epi32(u[4], bit);
1991
1992 x = _mm256_mullo_epi32(u[5], cospi16);
1993 u[5] = _mm256_sub_epi32(y, x);
1994 u[5] = _mm256_add_epi32(u[5], rnding);
1995 u[5] = _mm256_srai_epi32(u[5], bit);
1996
1997 x = _mm256_mullo_epi32(u[7], cospi16);
1998 y = _mm256_mullo_epi32(u[6], cospi16);
1999 u[6] = _mm256_mullo_epi32(u[6], cospim48);
2000 u[6] = _mm256_add_epi32(u[6], x);
2001 u[6] = _mm256_add_epi32(u[6], rnding);
2002 u[6] = _mm256_srai_epi32(u[6], bit);
2003
2004 x = _mm256_mullo_epi32(u[7], cospim48);
2005 u[7] = _mm256_sub_epi32(y, x);
2006 u[7] = _mm256_add_epi32(u[7], rnding);
2007 u[7] = _mm256_srai_epi32(u[7], bit);
2008
2009 x = _mm256_mullo_epi32(u[13], cospi48);
2010 y = _mm256_mullo_epi32(u[12], cospi48);
2011 u[12] = _mm256_mullo_epi32(u[12], cospi16);
2012 u[12] = _mm256_add_epi32(u[12], x);
2013 u[12] = _mm256_add_epi32(u[12], rnding);
2014 u[12] = _mm256_srai_epi32(u[12], bit);
2015
2016 x = _mm256_mullo_epi32(u[13], cospi16);
2017 u[13] = _mm256_sub_epi32(y, x);
2018 u[13] = _mm256_add_epi32(u[13], rnding);
2019 u[13] = _mm256_srai_epi32(u[13], bit);
2020
2021 x = _mm256_mullo_epi32(u[15], cospi16);
2022 y = _mm256_mullo_epi32(u[14], cospi16);
2023 u[14] = _mm256_mullo_epi32(u[14], cospim48);
2024 u[14] = _mm256_add_epi32(u[14], x);
2025 u[14] = _mm256_add_epi32(u[14], rnding);
2026 u[14] = _mm256_srai_epi32(u[14], bit);
2027
2028 x = _mm256_mullo_epi32(u[15], cospim48);
2029 u[15] = _mm256_sub_epi32(y, x);
2030 u[15] = _mm256_add_epi32(u[15], rnding);
2031 u[15] = _mm256_srai_epi32(u[15], bit);
2032
2033 // stage 7
2034 addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2035 addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2036 addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2037 addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2038 addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2039 addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2040 addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2041 addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2042
2043 // stage 8
2044 y = _mm256_mullo_epi32(u[2], cospi32);
2045 x = _mm256_mullo_epi32(u[3], cospi32);
2046 u[2] = _mm256_add_epi32(y, x);
2047 u[2] = _mm256_add_epi32(u[2], rnding);
2048 u[2] = _mm256_srai_epi32(u[2], bit);
2049
2050 u[3] = _mm256_sub_epi32(y, x);
2051 u[3] = _mm256_add_epi32(u[3], rnding);
2052 u[3] = _mm256_srai_epi32(u[3], bit);
2053 y = _mm256_mullo_epi32(u[6], cospi32);
2054 x = _mm256_mullo_epi32(u[7], cospi32);
2055 u[6] = _mm256_add_epi32(y, x);
2056 u[6] = _mm256_add_epi32(u[6], rnding);
2057 u[6] = _mm256_srai_epi32(u[6], bit);
2058
2059 u[7] = _mm256_sub_epi32(y, x);
2060 u[7] = _mm256_add_epi32(u[7], rnding);
2061 u[7] = _mm256_srai_epi32(u[7], bit);
2062
2063 y = _mm256_mullo_epi32(u[10], cospi32);
2064 x = _mm256_mullo_epi32(u[11], cospi32);
2065 u[10] = _mm256_add_epi32(y, x);
2066 u[10] = _mm256_add_epi32(u[10], rnding);
2067 u[10] = _mm256_srai_epi32(u[10], bit);
2068
2069 u[11] = _mm256_sub_epi32(y, x);
2070 u[11] = _mm256_add_epi32(u[11], rnding);
2071 u[11] = _mm256_srai_epi32(u[11], bit);
2072
2073 y = _mm256_mullo_epi32(u[14], cospi32);
2074 x = _mm256_mullo_epi32(u[15], cospi32);
2075 u[14] = _mm256_add_epi32(y, x);
2076 u[14] = _mm256_add_epi32(u[14], rnding);
2077 u[14] = _mm256_srai_epi32(u[14], bit);
2078
2079 u[15] = _mm256_sub_epi32(y, x);
2080 u[15] = _mm256_add_epi32(u[15], rnding);
2081 u[15] = _mm256_srai_epi32(u[15], bit);
2082
2083 // stage 9
2084 if (do_cols) {
2085 out[0] = u[0];
2086 out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
2087 out[2] = u[12];
2088 out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
2089 out[4] = u[6];
2090 out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
2091 out[6] = u[10];
2092 out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
2093 out[8] = u[3];
2094 out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
2095 out[10] = u[15];
2096 out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
2097 out[12] = u[5];
2098 out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
2099 out[14] = u[9];
2100 out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
2101 } else {
2102 const int log_range_out = AOMMAX(16, bd + 6);
2103 const __m256i clamp_lo_out =
2104 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2105 const __m256i clamp_hi_out =
2106 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2107
2108 neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2109 out_shift);
2110 neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
2111 &clamp_hi_out, out_shift);
2112 neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
2113 &clamp_hi_out, out_shift);
2114 neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
2115 &clamp_hi_out, out_shift);
2116 neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
2117 &clamp_hi_out, out_shift);
2118 neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
2119 &clamp_hi_out, out_shift);
2120 neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
2121 &clamp_hi_out, out_shift);
2122 neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
2123 &clamp_hi_out, out_shift);
2124 }
2125 }
2126 }
2127
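// Full 16-point inverse ADST butterfly.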
2128 static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2129 int bd, int out_shift) {
2130 const int32_t *cospi = cospi_arr(bit);
2131 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
2132 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
2133 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
2134 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
2135 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
2136 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
2137 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
2138 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
2139 const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
2140 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
2141 const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
2142 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
2143 const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
2144 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
2145 const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
2146 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
2147 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
2148 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
2149 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
2150 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
2151 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
2152 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
2153 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2154 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2155 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
2156 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2157 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2158 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2159 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2160 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2161 __m256i u[16], v[16], x, y;
2162
2163 {
2164 // stage 0
2165 // stage 1
2166 // stage 2
2167 v[0] = _mm256_mullo_epi32(in[15], cospi2);
2168 x = _mm256_mullo_epi32(in[0], cospi62);
2169 v[0] = _mm256_add_epi32(v[0], x);
2170 v[0] = _mm256_add_epi32(v[0], rnding);
2171 v[0] = _mm256_srai_epi32(v[0], bit);
2172
2173 v[1] = _mm256_mullo_epi32(in[15], cospi62);
2174 x = _mm256_mullo_epi32(in[0], cospi2);
2175 v[1] = _mm256_sub_epi32(v[1], x);
2176 v[1] = _mm256_add_epi32(v[1], rnding);
2177 v[1] = _mm256_srai_epi32(v[1], bit);
2178
2179 v[2] = _mm256_mullo_epi32(in[13], cospi10);
2180 x = _mm256_mullo_epi32(in[2], cospi54);
2181 v[2] = _mm256_add_epi32(v[2], x);
2182 v[2] = _mm256_add_epi32(v[2], rnding);
2183 v[2] = _mm256_srai_epi32(v[2], bit);
2184
2185 v[3] = _mm256_mullo_epi32(in[13], cospi54);
2186 x = _mm256_mullo_epi32(in[2], cospi10);
2187 v[3] = _mm256_sub_epi32(v[3], x);
2188 v[3] = _mm256_add_epi32(v[3], rnding);
2189 v[3] = _mm256_srai_epi32(v[3], bit);
2190
2191 v[4] = _mm256_mullo_epi32(in[11], cospi18);
2192 x = _mm256_mullo_epi32(in[4], cospi46);
2193 v[4] = _mm256_add_epi32(v[4], x);
2194 v[4] = _mm256_add_epi32(v[4], rnding);
2195 v[4] = _mm256_srai_epi32(v[4], bit);
2196
2197 v[5] = _mm256_mullo_epi32(in[11], cospi46);
2198 x = _mm256_mullo_epi32(in[4], cospi18);
2199 v[5] = _mm256_sub_epi32(v[5], x);
2200 v[5] = _mm256_add_epi32(v[5], rnding);
2201 v[5] = _mm256_srai_epi32(v[5], bit);
2202
2203 v[6] = _mm256_mullo_epi32(in[9], cospi26);
2204 x = _mm256_mullo_epi32(in[6], cospi38);
2205 v[6] = _mm256_add_epi32(v[6], x);
2206 v[6] = _mm256_add_epi32(v[6], rnding);
2207 v[6] = _mm256_srai_epi32(v[6], bit);
2208
2209 v[7] = _mm256_mullo_epi32(in[9], cospi38);
2210 x = _mm256_mullo_epi32(in[6], cospi26);
2211 v[7] = _mm256_sub_epi32(v[7], x);
2212 v[7] = _mm256_add_epi32(v[7], rnding);
2213 v[7] = _mm256_srai_epi32(v[7], bit);
2214
2215 v[8] = _mm256_mullo_epi32(in[7], cospi34);
2216 x = _mm256_mullo_epi32(in[8], cospi30);
2217 v[8] = _mm256_add_epi32(v[8], x);
2218 v[8] = _mm256_add_epi32(v[8], rnding);
2219 v[8] = _mm256_srai_epi32(v[8], bit);
2220
2221 v[9] = _mm256_mullo_epi32(in[7], cospi30);
2222 x = _mm256_mullo_epi32(in[8], cospi34);
2223 v[9] = _mm256_sub_epi32(v[9], x);
2224 v[9] = _mm256_add_epi32(v[9], rnding);
2225 v[9] = _mm256_srai_epi32(v[9], bit);
2226
2227 v[10] = _mm256_mullo_epi32(in[5], cospi42);
2228 x = _mm256_mullo_epi32(in[10], cospi22);
2229 v[10] = _mm256_add_epi32(v[10], x);
2230 v[10] = _mm256_add_epi32(v[10], rnding);
2231 v[10] = _mm256_srai_epi32(v[10], bit);
2232
2233 v[11] = _mm256_mullo_epi32(in[5], cospi22);
2234 x = _mm256_mullo_epi32(in[10], cospi42);
2235 v[11] = _mm256_sub_epi32(v[11], x);
2236 v[11] = _mm256_add_epi32(v[11], rnding);
2237 v[11] = _mm256_srai_epi32(v[11], bit);
2238
2239 v[12] = _mm256_mullo_epi32(in[3], cospi50);
2240 x = _mm256_mullo_epi32(in[12], cospi14);
2241 v[12] = _mm256_add_epi32(v[12], x);
2242 v[12] = _mm256_add_epi32(v[12], rnding);
2243 v[12] = _mm256_srai_epi32(v[12], bit);
2244
2245 v[13] = _mm256_mullo_epi32(in[3], cospi14);
2246 x = _mm256_mullo_epi32(in[12], cospi50);
2247 v[13] = _mm256_sub_epi32(v[13], x);
2248 v[13] = _mm256_add_epi32(v[13], rnding);
2249 v[13] = _mm256_srai_epi32(v[13], bit);
2250
2251 v[14] = _mm256_mullo_epi32(in[1], cospi58);
2252 x = _mm256_mullo_epi32(in[14], cospi6);
2253 v[14] = _mm256_add_epi32(v[14], x);
2254 v[14] = _mm256_add_epi32(v[14], rnding);
2255 v[14] = _mm256_srai_epi32(v[14], bit);
2256
2257 v[15] = _mm256_mullo_epi32(in[1], cospi6);
2258 x = _mm256_mullo_epi32(in[14], cospi58);
2259 v[15] = _mm256_sub_epi32(v[15], x);
2260 v[15] = _mm256_add_epi32(v[15], rnding);
2261 v[15] = _mm256_srai_epi32(v[15], bit);
2262
2263 // stage 3
2264 addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2265 addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2266 addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2267 addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2268 addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2269 addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2270 addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2271 addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2272
2273 // stage 4
2274 v[0] = u[0];
2275 v[1] = u[1];
2276 v[2] = u[2];
2277 v[3] = u[3];
2278 v[4] = u[4];
2279 v[5] = u[5];
2280 v[6] = u[6];
2281 v[7] = u[7];
2282
2283 v[8] = _mm256_mullo_epi32(u[8], cospi8);
2284 x = _mm256_mullo_epi32(u[9], cospi56);
2285 v[8] = _mm256_add_epi32(v[8], x);
2286 v[8] = _mm256_add_epi32(v[8], rnding);
2287 v[8] = _mm256_srai_epi32(v[8], bit);
2288
2289 v[9] = _mm256_mullo_epi32(u[8], cospi56);
2290 x = _mm256_mullo_epi32(u[9], cospi8);
2291 v[9] = _mm256_sub_epi32(v[9], x);
2292 v[9] = _mm256_add_epi32(v[9], rnding);
2293 v[9] = _mm256_srai_epi32(v[9], bit);
2294
2295 v[10] = _mm256_mullo_epi32(u[10], cospi40);
2296 x = _mm256_mullo_epi32(u[11], cospi24);
2297 v[10] = _mm256_add_epi32(v[10], x);
2298 v[10] = _mm256_add_epi32(v[10], rnding);
2299 v[10] = _mm256_srai_epi32(v[10], bit);
2300
2301 v[11] = _mm256_mullo_epi32(u[10], cospi24);
2302 x = _mm256_mullo_epi32(u[11], cospi40);
2303 v[11] = _mm256_sub_epi32(v[11], x);
2304 v[11] = _mm256_add_epi32(v[11], rnding);
2305 v[11] = _mm256_srai_epi32(v[11], bit);
2306
2307 v[12] = _mm256_mullo_epi32(u[12], cospim56);
2308 x = _mm256_mullo_epi32(u[13], cospi8);
2309 v[12] = _mm256_add_epi32(v[12], x);
2310 v[12] = _mm256_add_epi32(v[12], rnding);
2311 v[12] = _mm256_srai_epi32(v[12], bit);
2312
2313 v[13] = _mm256_mullo_epi32(u[12], cospi8);
2314 x = _mm256_mullo_epi32(u[13], cospim56);
2315 v[13] = _mm256_sub_epi32(v[13], x);
2316 v[13] = _mm256_add_epi32(v[13], rnding);
2317 v[13] = _mm256_srai_epi32(v[13], bit);
2318
2319 v[14] = _mm256_mullo_epi32(u[14], cospim24);
2320 x = _mm256_mullo_epi32(u[15], cospi40);
2321 v[14] = _mm256_add_epi32(v[14], x);
2322 v[14] = _mm256_add_epi32(v[14], rnding);
2323 v[14] = _mm256_srai_epi32(v[14], bit);
2324
2325 v[15] = _mm256_mullo_epi32(u[14], cospi40);
2326 x = _mm256_mullo_epi32(u[15], cospim24);
2327 v[15] = _mm256_sub_epi32(v[15], x);
2328 v[15] = _mm256_add_epi32(v[15], rnding);
2329 v[15] = _mm256_srai_epi32(v[15], bit);
2330
2331 // stage 5
2332 addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2333 addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2334 addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2335 addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2336 addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2337 addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2338 addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2339 addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2340
2341 // stage 6
2342 v[0] = u[0];
2343 v[1] = u[1];
2344 v[2] = u[2];
2345 v[3] = u[3];
2346
2347 v[4] = _mm256_mullo_epi32(u[4], cospi16);
2348 x = _mm256_mullo_epi32(u[5], cospi48);
2349 v[4] = _mm256_add_epi32(v[4], x);
2350 v[4] = _mm256_add_epi32(v[4], rnding);
2351 v[4] = _mm256_srai_epi32(v[4], bit);
2352
2353 v[5] = _mm256_mullo_epi32(u[4], cospi48);
2354 x = _mm256_mullo_epi32(u[5], cospi16);
2355 v[5] = _mm256_sub_epi32(v[5], x);
2356 v[5] = _mm256_add_epi32(v[5], rnding);
2357 v[5] = _mm256_srai_epi32(v[5], bit);
2358
2359 v[6] = _mm256_mullo_epi32(u[6], cospim48);
2360 x = _mm256_mullo_epi32(u[7], cospi16);
2361 v[6] = _mm256_add_epi32(v[6], x);
2362 v[6] = _mm256_add_epi32(v[6], rnding);
2363 v[6] = _mm256_srai_epi32(v[6], bit);
2364
2365 v[7] = _mm256_mullo_epi32(u[6], cospi16);
2366 x = _mm256_mullo_epi32(u[7], cospim48);
2367 v[7] = _mm256_sub_epi32(v[7], x);
2368 v[7] = _mm256_add_epi32(v[7], rnding);
2369 v[7] = _mm256_srai_epi32(v[7], bit);
2370
2371 v[8] = u[8];
2372 v[9] = u[9];
2373 v[10] = u[10];
2374 v[11] = u[11];
2375
2376 v[12] = _mm256_mullo_epi32(u[12], cospi16);
2377 x = _mm256_mullo_epi32(u[13], cospi48);
2378 v[12] = _mm256_add_epi32(v[12], x);
2379 v[12] = _mm256_add_epi32(v[12], rnding);
2380 v[12] = _mm256_srai_epi32(v[12], bit);
2381
2382 v[13] = _mm256_mullo_epi32(u[12], cospi48);
2383 x = _mm256_mullo_epi32(u[13], cospi16);
2384 v[13] = _mm256_sub_epi32(v[13], x);
2385 v[13] = _mm256_add_epi32(v[13], rnding);
2386 v[13] = _mm256_srai_epi32(v[13], bit);
2387
2388 v[14] = _mm256_mullo_epi32(u[14], cospim48);
2389 x = _mm256_mullo_epi32(u[15], cospi16);
2390 v[14] = _mm256_add_epi32(v[14], x);
2391 v[14] = _mm256_add_epi32(v[14], rnding);
2392 v[14] = _mm256_srai_epi32(v[14], bit);
2393
2394 v[15] = _mm256_mullo_epi32(u[14], cospi16);
2395 x = _mm256_mullo_epi32(u[15], cospim48);
2396 v[15] = _mm256_sub_epi32(v[15], x);
2397 v[15] = _mm256_add_epi32(v[15], rnding);
2398 v[15] = _mm256_srai_epi32(v[15], bit);
2399
2400 // stage 7
2401 addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2402 addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2403 addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2404 addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2405 addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2406 addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2407 addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2408 addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2409
2410 // stage 8
2411 v[0] = u[0];
2412 v[1] = u[1];
2413
2414 y = _mm256_mullo_epi32(u[2], cospi32);
2415 x = _mm256_mullo_epi32(u[3], cospi32);
2416 v[2] = _mm256_add_epi32(y, x);
2417 v[2] = _mm256_add_epi32(v[2], rnding);
2418 v[2] = _mm256_srai_epi32(v[2], bit);
2419
2420 v[3] = _mm256_sub_epi32(y, x);
2421 v[3] = _mm256_add_epi32(v[3], rnding);
2422 v[3] = _mm256_srai_epi32(v[3], bit);
2423
2424 v[4] = u[4];
2425 v[5] = u[5];
2426
2427 y = _mm256_mullo_epi32(u[6], cospi32);
2428 x = _mm256_mullo_epi32(u[7], cospi32);
2429 v[6] = _mm256_add_epi32(y, x);
2430 v[6] = _mm256_add_epi32(v[6], rnding);
2431 v[6] = _mm256_srai_epi32(v[6], bit);
2432
2433 v[7] = _mm256_sub_epi32(y, x);
2434 v[7] = _mm256_add_epi32(v[7], rnding);
2435 v[7] = _mm256_srai_epi32(v[7], bit);
2436
2437 v[8] = u[8];
2438 v[9] = u[9];
2439
2440 y = _mm256_mullo_epi32(u[10], cospi32);
2441 x = _mm256_mullo_epi32(u[11], cospi32);
2442 v[10] = _mm256_add_epi32(y, x);
2443 v[10] = _mm256_add_epi32(v[10], rnding);
2444 v[10] = _mm256_srai_epi32(v[10], bit);
2445
2446 v[11] = _mm256_sub_epi32(y, x);
2447 v[11] = _mm256_add_epi32(v[11], rnding);
2448 v[11] = _mm256_srai_epi32(v[11], bit);
2449
2450 v[12] = u[12];
2451 v[13] = u[13];
2452
2453 y = _mm256_mullo_epi32(u[14], cospi32);
2454 x = _mm256_mullo_epi32(u[15], cospi32);
2455 v[14] = _mm256_add_epi32(y, x);
2456 v[14] = _mm256_add_epi32(v[14], rnding);
2457 v[14] = _mm256_srai_epi32(v[14], bit);
2458
2459 v[15] = _mm256_sub_epi32(y, x);
2460 v[15] = _mm256_add_epi32(v[15], rnding);
2461 v[15] = _mm256_srai_epi32(v[15], bit);
2462
2463 // stage 9
2464 if (do_cols) {
2465 out[0] = v[0];
2466 out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
2467 out[2] = v[12];
2468 out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
2469 out[4] = v[6];
2470 out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
2471 out[6] = v[10];
2472 out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
2473 out[8] = v[3];
2474 out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
2475 out[10] = v[15];
2476 out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
2477 out[12] = v[5];
2478 out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
2479 out[14] = v[9];
2480 out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
2481 } else {
2482 const int log_range_out = AOMMAX(16, bd + 6);
2483 const __m256i clamp_lo_out =
2484 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2485 const __m256i clamp_hi_out =
2486 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2487
2488 neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2489 out_shift);
2490 neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
2491 &clamp_hi_out, out_shift);
2492 neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
2493 &clamp_hi_out, out_shift);
2494 neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
2495 &clamp_hi_out, out_shift);
2496 neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
2497 &clamp_hi_out, out_shift);
2498 neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
2499 &clamp_hi_out, out_shift);
2500 neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
2501 &clamp_hi_out, out_shift);
2502 neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
2503 &clamp_hi_out, out_shift);
2504 }
2505 }
2506 }
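// Fast path of the 8-point inverse DCT (8x8 transform) for a DC-only input:
// the single product in[0] * cospi[32] is rounded, optionally shifted, and
// broadcast to all eight outputs.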
2507 static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2508 int bd, int out_shift) {
2509 const int32_t *cospi = cospi_arr(bit);
2510 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2511 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2512 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2513 __m256i x;
2514
2515 // stage 0
2516 // stage 1
2517 // stage 2
2518 // stage 3
2519 x = _mm256_mullo_epi32(in[0], cospi32);
2520 x = _mm256_add_epi32(x, rnding);
2521 x = _mm256_srai_epi32(x, bit);
2522
2523 // stage 4
2524 // stage 5
2525 if (!do_cols) {
2526 const int log_range_out = AOMMAX(16, bd + 6);
2527 const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
2528 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2529 const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
2530 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2531
2532 __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
2533 x = _mm256_add_epi32(x, offset);
2534 x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
2535 x = _mm256_max_epi32(x, clamp_lo_out);
2536 x = _mm256_min_epi32(x, clamp_hi_out);
2537 }
2538
2539 out[0] = x;
2540 out[1] = x;
2541 out[2] = x;
2542 out[3] = x;
2543 out[4] = x;
2544 out[5] = x;
2545 out[6] = x;
2546 out[7] = x;
2547 }
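// Full 8-point inverse DCT butterfly for the 8x8 transform.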
2548 static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2549 int bd, int out_shift) {
2550 const int32_t *cospi = cospi_arr(bit);
2551 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
2552 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
2553 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
2554 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
2555 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
2556 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
2557 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2558 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2559 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
2560 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2561 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2562 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2563 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2564 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2565 __m256i u0, u1, u2, u3, u4, u5, u6, u7;
2566 __m256i v0, v1, v2, v3, v4, v5, v6, v7;
2567 __m256i x, y;
2568
2569 // stage 0
2570 // stage 1
2571 // stage 2
2572 u0 = in[0];
2573 u1 = in[4];
2574 u2 = in[2];
2575 u3 = in[6];
2576
2577 x = _mm256_mullo_epi32(in[1], cospi56);
2578 y = _mm256_mullo_epi32(in[7], cospim8);
2579 u4 = _mm256_add_epi32(x, y);
2580 u4 = _mm256_add_epi32(u4, rnding);
2581 u4 = _mm256_srai_epi32(u4, bit);
2582
2583 x = _mm256_mullo_epi32(in[1], cospi8);
2584 y = _mm256_mullo_epi32(in[7], cospi56);
2585 u7 = _mm256_add_epi32(x, y);
2586 u7 = _mm256_add_epi32(u7, rnding);
2587 u7 = _mm256_srai_epi32(u7, bit);
2588
2589 x = _mm256_mullo_epi32(in[5], cospi24);
2590 y = _mm256_mullo_epi32(in[3], cospim40);
2591 u5 = _mm256_add_epi32(x, y);
2592 u5 = _mm256_add_epi32(u5, rnding);
2593 u5 = _mm256_srai_epi32(u5, bit);
2594
2595 x = _mm256_mullo_epi32(in[5], cospi40);
2596 y = _mm256_mullo_epi32(in[3], cospi24);
2597 u6 = _mm256_add_epi32(x, y);
2598 u6 = _mm256_add_epi32(u6, rnding);
2599 u6 = _mm256_srai_epi32(u6, bit);
2600
2601 // stage 3
2602 x = _mm256_mullo_epi32(u0, cospi32);
2603 y = _mm256_mullo_epi32(u1, cospi32);
2604 v0 = _mm256_add_epi32(x, y);
2605 v0 = _mm256_add_epi32(v0, rnding);
2606 v0 = _mm256_srai_epi32(v0, bit);
2607
2608 v1 = _mm256_sub_epi32(x, y);
2609 v1 = _mm256_add_epi32(v1, rnding);
2610 v1 = _mm256_srai_epi32(v1, bit);
2611
2612 x = _mm256_mullo_epi32(u2, cospi48);
2613 y = _mm256_mullo_epi32(u3, cospim16);
2614 v2 = _mm256_add_epi32(x, y);
2615 v2 = _mm256_add_epi32(v2, rnding);
2616 v2 = _mm256_srai_epi32(v2, bit);
2617
2618 x = _mm256_mullo_epi32(u2, cospi16);
2619 y = _mm256_mullo_epi32(u3, cospi48);
2620 v3 = _mm256_add_epi32(x, y);
2621 v3 = _mm256_add_epi32(v3, rnding);
2622 v3 = _mm256_srai_epi32(v3, bit);
2623
2624 addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
2625 addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
2626
2627 // stage 4
2628 addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
2629 addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
2630 u4 = v4;
2631 u7 = v7;
2632
2633 x = _mm256_mullo_epi32(v5, cospi32);
2634 y = _mm256_mullo_epi32(v6, cospi32);
2635 u6 = _mm256_add_epi32(y, x);
2636 u6 = _mm256_add_epi32(u6, rnding);
2637 u6 = _mm256_srai_epi32(u6, bit);
2638
2639 u5 = _mm256_sub_epi32(y, x);
2640 u5 = _mm256_add_epi32(u5, rnding);
2641 u5 = _mm256_srai_epi32(u5, bit);
2642
2643 // stage 5
2644 if (do_cols) {
2645 addsub_no_clamp_avx2(u0, u7, out + 0, out + 7);
2646 addsub_no_clamp_avx2(u1, u6, out + 1, out + 6);
2647 addsub_no_clamp_avx2(u2, u5, out + 2, out + 5);
2648 addsub_no_clamp_avx2(u3, u4, out + 3, out + 4);
2649 } else {
2650 const int log_range_out = AOMMAX(16, bd + 6);
2651 const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
2652 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2653 const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
2654 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2655 addsub_shift_avx2(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
2656 out_shift);
2657 addsub_shift_avx2(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
2658 out_shift);
2659 addsub_shift_avx2(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
2660 out_shift);
2661 addsub_shift_avx2(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
2662 out_shift);
2663 }
2664 }
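// Fast path of the 8-point inverse ADST for inputs where only in[0] is
// nonzero.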
2665 static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2666 int bd, int out_shift) {
2667 const int32_t *cospi = cospi_arr(bit);
2668 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
2669 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
2670 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2671 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2672 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2673 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2674 const __m256i kZero = _mm256_setzero_si256();
2675 __m256i u[8], x;
2676
2677 // stage 0
2678 // stage 1
2679 // stage 2
2680
2681 x = _mm256_mullo_epi32(in[0], cospi60);
2682 u[0] = _mm256_add_epi32(x, rnding);
2683 u[0] = _mm256_srai_epi32(u[0], bit);
2684
2685 x = _mm256_mullo_epi32(in[0], cospi4);
2686 u[1] = _mm256_sub_epi32(kZero, x);
2687 u[1] = _mm256_add_epi32(u[1], rnding);
2688 u[1] = _mm256_srai_epi32(u[1], bit);
2689
2690 // stage 3
2691 // stage 4
2692 __m256i temp1, temp2;
2693 temp1 = _mm256_mullo_epi32(u[0], cospi16);
2694 x = _mm256_mullo_epi32(u[1], cospi48);
2695 temp1 = _mm256_add_epi32(temp1, x);
2696 temp1 = _mm256_add_epi32(temp1, rnding);
2697 temp1 = _mm256_srai_epi32(temp1, bit);
2698 u[4] = temp1;
2699
2700 temp2 = _mm256_mullo_epi32(u[0], cospi48);
2701 x = _mm256_mullo_epi32(u[1], cospi16);
2702 u[5] = _mm256_sub_epi32(temp2, x);
2703 u[5] = _mm256_add_epi32(u[5], rnding);
2704 u[5] = _mm256_srai_epi32(u[5], bit);
2705
2706 // stage 5
2707 // stage 6
2708 temp1 = _mm256_mullo_epi32(u[0], cospi32);
2709 x = _mm256_mullo_epi32(u[1], cospi32);
2710 u[2] = _mm256_add_epi32(temp1, x);
2711 u[2] = _mm256_add_epi32(u[2], rnding);
2712 u[2] = _mm256_srai_epi32(u[2], bit);
2713
2714 u[3] = _mm256_sub_epi32(temp1, x);
2715 u[3] = _mm256_add_epi32(u[3], rnding);
2716 u[3] = _mm256_srai_epi32(u[3], bit);
2717
2718 temp1 = _mm256_mullo_epi32(u[4], cospi32);
2719 x = _mm256_mullo_epi32(u[5], cospi32);
2720 u[6] = _mm256_add_epi32(temp1, x);
2721 u[6] = _mm256_add_epi32(u[6], rnding);
2722 u[6] = _mm256_srai_epi32(u[6], bit);
2723
2724 u[7] = _mm256_sub_epi32(temp1, x);
2725 u[7] = _mm256_add_epi32(u[7], rnding);
2726 u[7] = _mm256_srai_epi32(u[7], bit);
2727
2728 // stage 7
2729 if (do_cols) {
2730 out[0] = u[0];
2731 out[1] = _mm256_sub_epi32(kZero, u[4]);
2732 out[2] = u[6];
2733 out[3] = _mm256_sub_epi32(kZero, u[2]);
2734 out[4] = u[3];
2735 out[5] = _mm256_sub_epi32(kZero, u[7]);
2736 out[6] = u[5];
2737 out[7] = _mm256_sub_epi32(kZero, u[1]);
2738 } else {
2739 const int log_range_out = AOMMAX(16, bd + 6);
2740 const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2741 const __m256i clamp_hi_out =
2742 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2743
2744 neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2745 out_shift);
2746 neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
2747 out_shift);
2748 neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
2749 out_shift);
2750 neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
2751 out_shift);
2752 }
2753 }
2754
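// Full 8-point inverse ADST on 32-bit coefficients (one __m256i of 8 lanes per
// row). Intermediates are clamped to the range derived from bd and do_cols;
// the last stage either negates in place (column pass) or applies out_shift
// with a tighter output clamp (row pass).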
2755 static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2756 int bd, int out_shift) {
2757 const int32_t *cospi = cospi_arr(bit);
2758 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
2759 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
2760 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
2761 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
2762 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
2763 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
2764 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
2765 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
2766 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2767 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2768 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
2769 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2770 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2771 const __m256i kZero = _mm256_setzero_si256();
2772 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2773 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2774 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2775 __m256i u[8], v[8], x;
2776
2777 // stage 0
2778 // stage 1
2779 // stage 2
2780
2781 u[0] = _mm256_mullo_epi32(in[7], cospi4);
2782 x = _mm256_mullo_epi32(in[0], cospi60);
2783 u[0] = _mm256_add_epi32(u[0], x);
2784 u[0] = _mm256_add_epi32(u[0], rnding);
2785 u[0] = _mm256_srai_epi32(u[0], bit);
2786
2787 u[1] = _mm256_mullo_epi32(in[7], cospi60);
2788 x = _mm256_mullo_epi32(in[0], cospi4);
2789 u[1] = _mm256_sub_epi32(u[1], x);
2790 u[1] = _mm256_add_epi32(u[1], rnding);
2791 u[1] = _mm256_srai_epi32(u[1], bit);
2792
2793 u[2] = _mm256_mullo_epi32(in[5], cospi20);
2794 x = _mm256_mullo_epi32(in[2], cospi44);
2795 u[2] = _mm256_add_epi32(u[2], x);
2796 u[2] = _mm256_add_epi32(u[2], rnding);
2797 u[2] = _mm256_srai_epi32(u[2], bit);
2798
2799 u[3] = _mm256_mullo_epi32(in[5], cospi44);
2800 x = _mm256_mullo_epi32(in[2], cospi20);
2801 u[3] = _mm256_sub_epi32(u[3], x);
2802 u[3] = _mm256_add_epi32(u[3], rnding);
2803 u[3] = _mm256_srai_epi32(u[3], bit);
2804
2805 u[4] = _mm256_mullo_epi32(in[3], cospi36);
2806 x = _mm256_mullo_epi32(in[4], cospi28);
2807 u[4] = _mm256_add_epi32(u[4], x);
2808 u[4] = _mm256_add_epi32(u[4], rnding);
2809 u[4] = _mm256_srai_epi32(u[4], bit);
2810
2811 u[5] = _mm256_mullo_epi32(in[3], cospi28);
2812 x = _mm256_mullo_epi32(in[4], cospi36);
2813 u[5] = _mm256_sub_epi32(u[5], x);
2814 u[5] = _mm256_add_epi32(u[5], rnding);
2815 u[5] = _mm256_srai_epi32(u[5], bit);
2816
2817 u[6] = _mm256_mullo_epi32(in[1], cospi52);
2818 x = _mm256_mullo_epi32(in[6], cospi12);
2819 u[6] = _mm256_add_epi32(u[6], x);
2820 u[6] = _mm256_add_epi32(u[6], rnding);
2821 u[6] = _mm256_srai_epi32(u[6], bit);
2822
2823 u[7] = _mm256_mullo_epi32(in[1], cospi12);
2824 x = _mm256_mullo_epi32(in[6], cospi52);
2825 u[7] = _mm256_sub_epi32(u[7], x);
2826 u[7] = _mm256_add_epi32(u[7], rnding);
2827 u[7] = _mm256_srai_epi32(u[7], bit);
2828
2829 // stage 3
2830 addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
2831 addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
2832 addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
2833 addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
2834
2835 // stage 4
2836 u[0] = v[0];
2837 u[1] = v[1];
2838 u[2] = v[2];
2839 u[3] = v[3];
2840
2841 u[4] = _mm256_mullo_epi32(v[4], cospi16);
2842 x = _mm256_mullo_epi32(v[5], cospi48);
2843 u[4] = _mm256_add_epi32(u[4], x);
2844 u[4] = _mm256_add_epi32(u[4], rnding);
2845 u[4] = _mm256_srai_epi32(u[4], bit);
2846
2847 u[5] = _mm256_mullo_epi32(v[4], cospi48);
2848 x = _mm256_mullo_epi32(v[5], cospi16);
2849 u[5] = _mm256_sub_epi32(u[5], x);
2850 u[5] = _mm256_add_epi32(u[5], rnding);
2851 u[5] = _mm256_srai_epi32(u[5], bit);
2852
2853 u[6] = _mm256_mullo_epi32(v[6], cospim48);
2854 x = _mm256_mullo_epi32(v[7], cospi16);
2855 u[6] = _mm256_add_epi32(u[6], x);
2856 u[6] = _mm256_add_epi32(u[6], rnding);
2857 u[6] = _mm256_srai_epi32(u[6], bit);
2858
2859 u[7] = _mm256_mullo_epi32(v[6], cospi16);
2860 x = _mm256_mullo_epi32(v[7], cospim48);
2861 u[7] = _mm256_sub_epi32(u[7], x);
2862 u[7] = _mm256_add_epi32(u[7], rnding);
2863 u[7] = _mm256_srai_epi32(u[7], bit);
2864
2865 // stage 5
2866 addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
2867 addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
2868 addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
2869 addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
2870
2871 // stage 6
2872 u[0] = v[0];
2873 u[1] = v[1];
2874 u[4] = v[4];
2875 u[5] = v[5];
2876
2877 v[0] = _mm256_mullo_epi32(v[2], cospi32);
2878 x = _mm256_mullo_epi32(v[3], cospi32);
2879 u[2] = _mm256_add_epi32(v[0], x);
2880 u[2] = _mm256_add_epi32(u[2], rnding);
2881 u[2] = _mm256_srai_epi32(u[2], bit);
2882
2883 u[3] = _mm256_sub_epi32(v[0], x);
2884 u[3] = _mm256_add_epi32(u[3], rnding);
2885 u[3] = _mm256_srai_epi32(u[3], bit);
2886
2887 v[0] = _mm256_mullo_epi32(v[6], cospi32);
2888 x = _mm256_mullo_epi32(v[7], cospi32);
2889 u[6] = _mm256_add_epi32(v[0], x);
2890 u[6] = _mm256_add_epi32(u[6], rnding);
2891 u[6] = _mm256_srai_epi32(u[6], bit);
2892
2893 u[7] = _mm256_sub_epi32(v[0], x);
2894 u[7] = _mm256_add_epi32(u[7], rnding);
2895 u[7] = _mm256_srai_epi32(u[7], bit);
2896
2897 // stage 7
2898 if (do_cols) {
2899 out[0] = u[0];
2900 out[1] = _mm256_sub_epi32(kZero, u[4]);
2901 out[2] = u[6];
2902 out[3] = _mm256_sub_epi32(kZero, u[2]);
2903 out[4] = u[3];
2904 out[5] = _mm256_sub_epi32(kZero, u[7]);
2905 out[6] = u[5];
2906 out[7] = _mm256_sub_epi32(kZero, u[1]);
2907 } else {
2908 const int log_range_out = AOMMAX(16, bd + 6);
2909 const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2910 const __m256i clamp_hi_out =
2911 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2912
2913 neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2914 out_shift);
2915 neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
2916 out_shift);
2917 neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
2918 out_shift);
2919 neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
2920 out_shift);
2921 }
2922 }
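// Stage 8 of the 64-point inverse DCT, factored out so the reduced low8/low16
// variants below can share it: cospi32 rotations on u[10..13], add/sub
// butterflies on u[16..31], and cospi16/cospi48-based rotations on
// u[36..43] / u[52..59].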
2923 static INLINE void idct64_stage8_avx2(
2924 __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
2925 const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
2926 const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
2927 const __m256i *rnding, int bit) {
2928 int i;
2929 __m256i temp1, temp2, temp3, temp4;
2930 temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit);
2931 u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit);
2932 u[10] = temp1;
2933 temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit);
2934 u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit);
2935 u[11] = temp2;
2936
2937 for (i = 16; i < 20; ++i) {
2938 addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
2939 addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
2940 }
2941
2942 temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit);
2943 temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit);
2944 temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit);
2945 temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit);
2946 u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit);
2947 u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit);
2948 u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit);
2949 u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit);
2950 u[36] = temp1;
2951 u[37] = temp2;
2952 u[38] = temp3;
2953 u[39] = temp4;
2954
2955 temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit);
2956 temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit);
2957 temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit);
2958 temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit);
2959 u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit);
2960 u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit);
2961 u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit);
2962 u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit);
2963 u[40] = temp1;
2964 u[41] = temp2;
2965 u[42] = temp3;
2966 u[43] = temp4;
2967 }
2968
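// Stage 9 of the 64-point inverse DCT: add/sub butterflies on u[0..15],
// cospi32 rotations on u[20..27], and butterflies pairing u[32..39] with
// u[40..47] and u[48..55] with u[56..63].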
2969 static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32,
2970 const __m256i *cospi32,
2971 const __m256i *clamp_lo,
2972 const __m256i *clamp_hi,
2973 const __m256i *rnding, int bit) {
2974 int i;
2975 __m256i temp1, temp2, temp3, temp4;
2976 for (i = 0; i < 8; ++i) {
2977 addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
2978 }
2979
2980 temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit);
2981 temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit);
2982 temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit);
2983 temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit);
2984 u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit);
2985 u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit);
2986 u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit);
2987 u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit);
2988 u[20] = temp1;
2989 u[21] = temp2;
2990 u[22] = temp3;
2991 u[23] = temp4;
2992 for (i = 32; i < 40; i++) {
2993 addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
2994 }
2995
2996 for (i = 48; i < 56; i++) {
2997 addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
2998 }
2999 }
3000
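// Stage 10 of the 64-point inverse DCT: add/sub butterflies on u[0..31] and
// cospi32 rotations on u[40..55].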
3001 static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32,
3002 const __m256i *cospi32,
3003 const __m256i *clamp_lo,
3004 const __m256i *clamp_hi,
3005 const __m256i *rnding, int bit) {
3006 __m256i temp1, temp2, temp3, temp4;
3007 for (int i = 0; i < 16; i++) {
3008 addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
3009 }
3010
3011 temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit);
3012 temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit);
3013 temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit);
3014 temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit);
3015 u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit);
3016 u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit);
3017 u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit);
3018 u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit);
3019 u[40] = temp1;
3020 u[41] = temp2;
3021 u[42] = temp3;
3022 u[43] = temp4;
3023
3024 temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit);
3025 temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit);
3026 temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit);
3027 temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit);
3028 u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit);
3029 u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit);
3030 u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit);
3031 u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit);
3032 u[44] = temp1;
3033 u[45] = temp2;
3034 u[46] = temp3;
3035 u[47] = temp4;
3036 }
3037
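// Stage 11 (final stage) of the 64-point inverse DCT: u[i] +/- u[63 - i].
// The column pass writes the sums unclamped; the row pass applies out_shift
// and clamps to the output range.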
3038 static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols,
3039 int bd, int out_shift,
3040 const int log_range) {
3041 if (do_cols) {
3042 for (int i = 0; i < 32; i++) {
3043 addsub_no_clamp_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)]);
3044 }
3045 } else {
3046 const int log_range_out = AOMMAX(16, bd + 6);
3047 const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
3048 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
3049 const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
3050 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
3051
3052 for (int i = 0; i < 32; i++) {
3053 addsub_shift_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)],
3054 &clamp_lo_out, &clamp_hi_out, out_shift);
3055 }
3056 }
3057 }
3058
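// 64-point inverse DCT when only the DC coefficient is non-zero: the whole
// transform collapses to x = in[0] * cospi32 (rounded), which is clamped
// (and shifted by out_shift in the row pass) and broadcast to all 64 outputs.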
3059 static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
3060 int bd, int out_shift) {
3061 const int32_t *cospi = cospi_arr(bit);
3062 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3063 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3064 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3065 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3066
3067 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3068
3069 {
3070 __m256i x;
3071
3072 // stage 1
3073 // stage 2
3074 // stage 3
3075 // stage 4
3076 // stage 5
3077 // stage 6
3078 x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);
3079
3080 // stage 8
3081 // stage 9
3082 // stage 10
3083 // stage 11
3084 if (do_cols) {
3085 x = _mm256_max_epi32(x, clamp_lo);
3086 x = _mm256_min_epi32(x, clamp_hi);
3087 } else {
3088 const int log_range_out = AOMMAX(16, bd + 6);
3089 const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
3090 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
3091 const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
3092 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
3093
3094 __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
3095 x = _mm256_add_epi32(x, offset);
3096 x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
3097
3098 x = _mm256_max_epi32(x, clamp_lo_out);
3099 x = _mm256_min_epi32(x, clamp_hi_out);
3100 }
3101
3102    for (int i = 0; i < 64; ++i) out[i] = x;
3166 }
3167 }
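// 64-point inverse DCT when only the first 8 input coefficients
// (in[0]..in[7]) can be non-zero; stages 1-7 are reduced accordingly and
// stages 8-11 reuse the shared helpers above.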
3168 static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
3169 int bd, int out_shift) {
3170 int i, j;
3171 const int32_t *cospi = cospi_arr(bit);
3172 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3173 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3174 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3175 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3176
3177 const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3178 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3179 const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3180 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3181 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3182 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3183 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3184 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3185 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3186 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3187 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3188 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3189 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3190 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3191 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3192 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3193 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3194 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3195 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3196 const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3197 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3198 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3199 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3200 const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3201 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3202 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3203 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3204 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3205 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3206 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3207 const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3208 const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3209 const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3210 const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3211 const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3212 const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3213 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3214 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3215
3216 {
3217 __m256i u[64];
3218
3219 // stage 1
3220 u[0] = in[0];
3221 u[8] = in[4];
3222 u[16] = in[2];
3223 u[24] = in[6];
3224 u[32] = in[1];
3225 u[40] = in[5];
3226 u[48] = in[3];
3227 u[56] = in[7];
3228
3229 // stage 2
3230 u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3231 u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3232 u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3233 u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3234 u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3235 u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3236 u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3237 u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3238
3239 // stage 3
3240 u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
3241 u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
3242 u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
3243 u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
3244 u[33] = u[32];
3245 u[38] = u[39];
3246 u[41] = u[40];
3247 u[46] = u[47];
3248 u[49] = u[48];
3249 u[54] = u[55];
3250 u[57] = u[56];
3251 u[62] = u[63];
3252
3253 // stage 4
3254 __m256i temp1, temp2;
3255 u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3256 u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3257 u[17] = u[16];
3258 u[22] = u[23];
3259 u[25] = u[24];
3260 u[30] = u[31];
3261
3262 temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3263 u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3264 u[33] = temp1;
3265
3266 temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3267 u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3268 u[57] = temp2;
3269
3270 temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3271 u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3272 u[41] = temp1;
3273
3274 temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3275 u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3276 u[46] = temp2;
3277
3278 // stage 5
3279 u[9] = u[8];
3280 u[14] = u[15];
3281
3282 temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3283 u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3284 u[17] = temp1;
3285
3286 temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3287 u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3288 u[22] = temp2;
3289
3290 u[35] = u[32];
3291 u[34] = u[33];
3292 u[36] = u[39];
3293 u[37] = u[38];
3294 u[43] = u[40];
3295 u[42] = u[41];
3296 u[44] = u[47];
3297 u[45] = u[46];
3298 u[51] = u[48];
3299 u[50] = u[49];
3300 u[52] = u[55];
3301 u[53] = u[54];
3302 u[59] = u[56];
3303 u[58] = u[57];
3304 u[60] = u[63];
3305 u[61] = u[62];
3306
3307 // stage 6
3308 temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3309 u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3310 u[0] = temp1;
3311
3312 temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3313 u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3314 u[9] = temp2;
3315 u[19] = u[16];
3316 u[18] = u[17];
3317 u[20] = u[23];
3318 u[21] = u[22];
3319 u[27] = u[24];
3320 u[26] = u[25];
3321 u[28] = u[31];
3322 u[29] = u[30];
3323
3324 temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3325 u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3326 u[34] = temp1;
3327 temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3328 u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3329 u[35] = temp2;
3330 temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3331 u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3332 u[36] = temp1;
3333 temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3334 u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3335 u[37] = temp2;
3336 temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3337 u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3338 u[42] = temp1;
3339 temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3340 u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3341 u[43] = temp2;
3342 temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3343 u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3344 u[44] = temp1;
3345 temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3346 u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3347 u[45] = temp2;
3348
3349 // stage 7
3350 u[3] = u[0];
3351 u[2] = u[1];
3352 u[11] = u[8];
3353 u[10] = u[9];
3354 u[12] = u[15];
3355 u[13] = u[14];
3356
3357 temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3358 u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3359 u[18] = temp1;
3360 temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3361 u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3362 u[19] = temp2;
3363 temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3364 u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3365 u[20] = temp1;
3366 temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3367 u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3368 u[21] = temp2;
3369 for (i = 32; i < 64; i += 16) {
3370 for (j = i; j < i + 4; j++) {
3371 addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3372 addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3373 &clamp_hi);
3374 }
3375 }
3376
3377 // stage 8
3378 u[7] = u[0];
3379 u[6] = u[1];
3380 u[5] = u[2];
3381 u[4] = u[3];
3383
3384 idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3385 &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3386
3387 // stage 9
3388 idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3389 bit);
3390
3391 // stage 10
3392 idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3393 bit);
3394
3395 // stage 11
3396 idct64_stage11_avx2(u, out, do_cols, bd, out_shift, log_range);
3397 }
3398 }
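// 64-point inverse DCT when only the first 16 input coefficients can be
// non-zero; same structure as the low8 variant, with the extra odd/even
// terms those coefficients contribute.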
3399 static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
3400 int bd, int out_shift) {
3401 int i, j;
3402 const int32_t *cospi = cospi_arr(bit);
3403 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3404 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3405 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3406 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3407
3408 const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3409 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3410 const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3411 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3412 const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3413 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3414 const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3415 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3416 const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
3417 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
3418 const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
3419 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3420 const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
3421 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
3422 const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
3423 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3424 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3425 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3426 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3427 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3428 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
3429 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3430 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3431 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3432 const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
3433 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
3434 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
3435 const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
3436 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3437 const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3438 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3439 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3440 const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3441
3442 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3443 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3444 const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3445 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3446 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3447 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3448 const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3449 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3450 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3451 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3452 const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
3453 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3454 const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
3455 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
3456 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3457 const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
3458 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3459 const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3460 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3461 const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
3462 const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3463
3464 {
3465 __m256i u[64];
3466 __m256i tmp1, tmp2, tmp3, tmp4;
3467 // stage 1
3468 u[0] = in[0];
3469 u[32] = in[1];
3470 u[36] = in[9];
3471 u[40] = in[5];
3472 u[44] = in[13];
3473 u[48] = in[3];
3474 u[52] = in[11];
3475 u[56] = in[7];
3476 u[60] = in[15];
3477 u[16] = in[2];
3478 u[20] = in[10];
3479 u[24] = in[6];
3480 u[28] = in[14];
3481 u[4] = in[8];
3482 u[8] = in[4];
3483 u[12] = in[12];
3484
3485 // stage 2
3486 u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3487 u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3488 u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
3489 u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
3490 u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
3491 u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
3492 u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3493 u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3494 u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3495 u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3496 u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
3497 u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
3498 u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3499 u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3500 u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
3501 u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
3502
3503 // stage 3
3504 u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
3505 u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
3506 u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit);
3507 u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit);
3508 u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit);
3509 u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit);
3510 u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
3511 u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
3512 u[33] = u[32];
3513 u[34] = u[35];
3514 u[37] = u[36];
3515 u[38] = u[39];
3516 u[41] = u[40];
3517 u[42] = u[43];
3518 u[45] = u[44];
3519 u[46] = u[47];
3520 u[49] = u[48];
3521 u[50] = u[51];
3522 u[53] = u[52];
3523 u[54] = u[55];
3524 u[57] = u[56];
3525 u[58] = u[59];
3526 u[61] = u[60];
3527 u[62] = u[63];
3528
3529 // stage 4
3530 u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3531 u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3532 u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
3533 u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
3534
3535 u[17] = u[16];
3536 u[18] = u[19];
3537 u[21] = u[20];
3538 u[22] = u[23];
3539 u[25] = u[24];
3540 u[26] = u[27];
3541 u[29] = u[28];
3542 u[30] = u[31];
3543
3544 tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3545 tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3546 tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3547 tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3548 u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3549 u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3550 u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3551 u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3552 u[33] = tmp1;
3553 u[34] = tmp2;
3554 u[37] = tmp3;
3555 u[38] = tmp4;
3556
3557 tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3558 tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3559 tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3560 tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3561 u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3562 u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3563 u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3564 u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3565 u[41] = tmp1;
3566 u[42] = tmp2;
3567 u[45] = tmp3;
3568 u[46] = tmp4;
3569
3570 // stage 5
3571 u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
3572 u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
3573
3574 u[9] = u[8];
3575 u[10] = u[11];
3576 u[13] = u[12];
3577 u[14] = u[15];
3578
3579 tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3580 tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
3581 tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
3582 tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3583 u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3584 u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
3585 u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
3586 u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3587 u[17] = tmp1;
3588 u[18] = tmp2;
3589 u[21] = tmp3;
3590 u[22] = tmp4;
3591
3592 for (i = 32; i < 64; i += 8) {
3593 addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3594 &clamp_hi);
3595 addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3596 &clamp_hi);
3597
3598 addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3599 &clamp_hi);
3600 addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3601 &clamp_hi);
3602 }
3603
3604 // stage 6
3605 tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3606 u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3607 u[0] = tmp1;
3608 u[5] = u[4];
3609 u[6] = u[7];
3610
3611 tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3612 u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3613 u[9] = tmp1;
3614 tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3615 u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3616 u[10] = tmp2;
3617
3618 for (i = 16; i < 32; i += 8) {
3619 addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3620 &clamp_hi);
3621 addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3622 &clamp_hi);
3623
3624 addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3625 &clamp_hi);
3626 addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3627 &clamp_hi);
3628 }
3629
3630 tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3631 tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3632 tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3633 tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3634 u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3635 u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3636 u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3637 u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3638 u[34] = tmp1;
3639 u[35] = tmp2;
3640 u[36] = tmp3;
3641 u[37] = tmp4;
3642
3643 tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3644 tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3645 tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3646 tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3647 u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3648 u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3649 u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3650 u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3651 u[42] = tmp1;
3652 u[43] = tmp2;
3653 u[44] = tmp3;
3654 u[45] = tmp4;
3655
3656 // stage 7
3657 u[3] = u[0];
3658 u[2] = u[1];
3659 tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
3660 u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
3661 u[5] = tmp1;
3662 addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3663 addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3664 addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3665 addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3666
3667 tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3668 tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3669 tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3670 tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3671 u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3672 u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3673 u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3674 u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3675 u[18] = tmp1;
3676 u[19] = tmp2;
3677 u[20] = tmp3;
3678 u[21] = tmp4;
3679
3680 for (i = 32; i < 64; i += 16) {
3681 for (j = i; j < i + 4; j++) {
3682 addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3683 addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3684 &clamp_hi);
3685 }
3686 }
3687
3688 // stage 8
3689 for (i = 0; i < 4; ++i) {
3690 addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
3691 }
3692
3693 idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3694 &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3695
3696 // stage 9
3697 idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3698 bit);
3699
3700 // stage 10
3701 idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3702 bit);
3703
3704 // stage 11
3705 idct64_stage11_avx2(u, out, do_cols, bd, out_shift, log_range);
3706 }
3707 }
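// Full 64-point inverse DCT. Only in[0]..in[31] are read, since AV1 keeps
// just the lower 32 coefficients for 64-length transforms; the later stages
// are written out inline here rather than through the stage helpers above.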
3708 static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
3709 int out_shift) {
3710 int i, j;
3711 const int32_t *cospi = cospi_arr(bit);
3712 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3713 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3714 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3715 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3716
3717 const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3718 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3719 const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3720 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3721 const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3722 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3723 const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3724 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3725 const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
3726 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
3727 const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
3728 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3729 const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
3730 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
3731 const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
3732 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3733 const __m256i cospi17 = _mm256_set1_epi32(cospi[17]);
3734 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
3735 const __m256i cospi19 = _mm256_set1_epi32(cospi[19]);
3736 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3737 const __m256i cospi21 = _mm256_set1_epi32(cospi[21]);
3738 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
3739 const __m256i cospi23 = _mm256_set1_epi32(cospi[23]);
3740 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3741 const __m256i cospi25 = _mm256_set1_epi32(cospi[25]);
3742 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
3743 const __m256i cospi27 = _mm256_set1_epi32(cospi[27]);
3744 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3745 const __m256i cospi29 = _mm256_set1_epi32(cospi[29]);
3746 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
3747 const __m256i cospi31 = _mm256_set1_epi32(cospi[31]);
3748 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3749 const __m256i cospi35 = _mm256_set1_epi32(cospi[35]);
3750 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
3751 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
3752 const __m256i cospi39 = _mm256_set1_epi32(cospi[39]);
3753 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3754 const __m256i cospi43 = _mm256_set1_epi32(cospi[43]);
3755 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3756 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
3757 const __m256i cospi47 = _mm256_set1_epi32(cospi[47]);
3758 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3759 const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
3760 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
3761 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
3762 const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
3763 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3764 const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3765 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3766 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3767 const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3768
3769 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3770 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3771 const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3772 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3773 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3774 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3775 const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3776 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3777 const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]);
3778 const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
3779 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3780 const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]);
3781 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3782 const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]);
3783 const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
3784 const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
3785 const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]);
3786 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3787 const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
3788 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
3789 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3790 const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
3791 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3792 const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3793 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3794 const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
3795 const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3796
3797 {
3798 __m256i u[64], v[64];
3799
3800 // stage 1
3801 u[32] = in[1];
3802 u[34] = in[17];
3803 u[36] = in[9];
3804 u[38] = in[25];
3805 u[40] = in[5];
3806 u[42] = in[21];
3807 u[44] = in[13];
3808 u[46] = in[29];
3809 u[48] = in[3];
3810 u[50] = in[19];
3811 u[52] = in[11];
3812 u[54] = in[27];
3813 u[56] = in[7];
3814 u[58] = in[23];
3815 u[60] = in[15];
3816 u[62] = in[31];
3817
3818 v[16] = in[2];
3819 v[18] = in[18];
3820 v[20] = in[10];
3821 v[22] = in[26];
3822 v[24] = in[6];
3823 v[26] = in[22];
3824 v[28] = in[14];
3825 v[30] = in[30];
3826
3827 u[8] = in[4];
3828 u[10] = in[20];
3829 u[12] = in[12];
3830 u[14] = in[28];
3831
3832 v[4] = in[8];
3833 v[6] = in[24];
3834
3835 u[0] = in[0];
3836 u[2] = in[16];
3837
3838 // stage 2
3839 v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3840 v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit);
3841 v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit);
3842 v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
3843 v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
3844 v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit);
3845 v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit);
3846 v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3847 v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3848 v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit);
3849 v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit);
3850 v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
3851 v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
3852 v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit);
3853 v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit);
3854 v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3855 v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3856 v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit);
3857 v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit);
3858 v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
3859 v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
3860 v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit);
3861 v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit);
3862 v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3863 v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3864 v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit);
3865 v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit);
3866 v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
3867 v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
3868 v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit);
3869 v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit);
3870 v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3871
3872 // stage 3
3873 u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit);
3874 u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit);
3875 u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit);
3876 u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit);
3877 u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit);
3878 u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit);
3879 u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit);
3880 u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit);
3881 u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit);
3882 u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit);
3883 u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit);
3884 u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit);
3885 u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit);
3886 u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit);
3887 u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit);
3888 u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit);
3889
3890 for (i = 32; i < 64; i += 4) {
3891 addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3892 &clamp_hi);
3893 addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3894 &clamp_hi);
3895 }
3896
3897 // stage 4
3898 v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3899 v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
3900 v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
3901 v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
3902 v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
3903 v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
3904 v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
3905 v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3906
3907 for (i = 16; i < 32; i += 4) {
3908 addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
3909 &clamp_hi);
3910 addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
3911 &clamp_hi);
3912 }
3913
3914 for (i = 32; i < 64; i += 4) {
3915 v[i + 0] = u[i + 0];
3916 v[i + 3] = u[i + 3];
3917 }
3918
3919 v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3920 v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3921 v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3922 v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3923 v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3924 v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3925 v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3926 v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3927 v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3928 v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3929 v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3930 v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3931 v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3932 v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3933 v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3934 v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3935
3936 // stage 5
3937 u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit);
3938 u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit);
3939 u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit);
3940 u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit);
3941
3942 for (i = 8; i < 16; i += 4) {
3943 addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3944 &clamp_hi);
3945 addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3946 &clamp_hi);
3947 }
3948
3949 for (i = 16; i < 32; i += 4) {
3950 u[i + 0] = v[i + 0];
3951 u[i + 3] = v[i + 3];
3952 }
3953
3954 u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
3955 u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
3956 u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
3957 u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
3958 u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
3959 u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
3960 u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
3961 u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
3962
3963 for (i = 32; i < 64; i += 8) {
3964 addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3965 &clamp_hi);
3966 addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3967 &clamp_hi);
3968
3969 addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3970 &clamp_hi);
3971 addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3972 &clamp_hi);
3973 }
3974
3975 // stage 6
3976 v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3977 v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3978 v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
3979 v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
3980
3981 addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
3982 addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
3983
3984 for (i = 8; i < 16; i += 4) {
3985 v[i + 0] = u[i + 0];
3986 v[i + 3] = u[i + 3];
3987 }
3988
3989 v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3990 v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3991 v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3992 v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3993
3994 for (i = 16; i < 32; i += 8) {
3995 addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
3996 &clamp_hi);
3997 addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
3998 &clamp_hi);
3999
4000 addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
4001 &clamp_hi);
4002 addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
4003 &clamp_hi);
4004 }
4005
4006 for (i = 32; i < 64; i += 8) {
4007 v[i + 0] = u[i + 0];
4008 v[i + 1] = u[i + 1];
4009 v[i + 6] = u[i + 6];
4010 v[i + 7] = u[i + 7];
4011 }
4012
4013 v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
4014 v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
4015 v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
4016 v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
4017 v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
4018 v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
4019 v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
4020 v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
4021 v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
4022 v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
4023 v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
4024 v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
4025 v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
4026 v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
4027 v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
4028 v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
4029
4030 // stage 7
4031 addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
4032 addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
4033
4034 u[4] = v[4];
4035 u[7] = v[7];
4036 u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
4037 u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
4038
4039 addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
4040 addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
4041 addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
4042 addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
4043
4044 for (i = 16; i < 32; i += 8) {
4045 u[i + 0] = v[i + 0];
4046 u[i + 1] = v[i + 1];
4047 u[i + 6] = v[i + 6];
4048 u[i + 7] = v[i + 7];
4049 }
4050
4051 u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
4052 u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
4053 u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
4054 u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
4055 u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
4056 u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
4057 u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
4058 u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
4059
4060 for (i = 32; i < 64; i += 16) {
4061 for (j = i; j < i + 4; j++) {
4062 addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
4063 addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
4064 &clamp_hi);
4065 }
4066 }
4067
4068 // stage 8
4069 for (i = 0; i < 4; ++i) {
4070 addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
4071 }
4072
4073 v[8] = u[8];
4074 v[9] = u[9];
4075 v[14] = u[14];
4076 v[15] = u[15];
4077
4078 v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
4079 v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
4080 v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
4081 v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
4082
4083 for (i = 16; i < 20; ++i) {
4084 addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
4085 addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
4086 &clamp_hi);
4087 }
4088
4089 for (i = 32; i < 36; ++i) {
4090 v[i] = u[i];
4091 v[i + 12] = u[i + 12];
4092 v[i + 16] = u[i + 16];
4093 v[i + 28] = u[i + 28];
4094 }
4095
4096 v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
4097 v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
4098 v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
4099 v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
4100 v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
4101 v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
4102 v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
4103 v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
4104 v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
4105 v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
4106 v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
4107 v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
4108 v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
4109 v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
4110 v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
4111 v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
4112
4113 // stage 9
4114 for (i = 0; i < 8; ++i) {
4115 addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
4116 }
4117
4118 for (i = 16; i < 20; ++i) {
4119 u[i] = v[i];
4120 u[i + 12] = v[i + 12];
4121 }
4122
4123 u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
4124 u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
4125 u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
4126 u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
4127 u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
4128 u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
4129 u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
4130 u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
4131
4132 for (i = 32; i < 40; i++) {
4133 addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
4134 }
4135
4136 for (i = 48; i < 56; i++) {
4137 addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
4138 }
4139
4140 // stage 10
4141 for (i = 0; i < 16; i++) {
4142 addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
4143 }
4144
4145 for (i = 32; i < 40; i++) v[i] = u[i];
4146
4147 v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
4148 v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
4149 v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
4150 v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
4151 v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
4152 v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
4153 v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
4154 v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
4155 v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
4156 v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
4157 v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
4158 v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
4159 v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
4160 v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
4161 v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
4162 v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
4163
4164 for (i = 56; i < 64; i++) v[i] = u[i];
4165
4166 // stage 11
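  // For the column pass (do_cols) the final sums/differences are written
  // as-is; for the row pass they are shifted down by out_shift and clamped
  // to the intermediate range derived from bd.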
4167 if (do_cols) {
4168 for (i = 0; i < 32; i++) {
4169 addsub_no_clamp_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
4170 }
4171 } else {
4172 const int log_range_out = AOMMAX(16, bd + 6);
4173 const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
4174 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
4175 const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
4176 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
4177
4178 for (i = 0; i < 32; i++) {
4179 addsub_shift_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
4180 &clamp_lo_out, &clamp_hi_out, out_shift);
4181 }
4182 }
4183 }
4184 }
4185 typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
4186 int do_cols, int bd, int out_shift);
4187
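// 1-D inverse transform kernels indexed by [1-D transform length index]
// [1-D tx type][eob bucket]. The low1/low8/low16 variants are reduced
// kernels used when only the first 1/8/16 coefficients of a row or column
// can be non-zero; NULL entries are guarded by the asserts in the dispatch
// below.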
4188 static const transform_1d_avx2
4189 highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
4190 {
4191 { NULL, NULL, NULL, NULL },
4192 { NULL, NULL, NULL, NULL },
4193 { NULL, NULL, NULL, NULL },
4194 },
4195 {
4196 { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL },
4197 { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL },
4198 { NULL, NULL, NULL, NULL },
4199 },
4200 {
4201 { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
4202 { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
4203 { NULL, NULL, NULL, NULL },
4204 },
4205 { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
4206 { NULL, NULL, NULL, NULL },
4207 { NULL, NULL, NULL, NULL } },
4208
4209 { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 },
4210 { NULL, NULL, NULL, NULL },
4211 { NULL, NULL, NULL, NULL } }
4212 };
4213
4214 static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
4215 uint16_t *output, int stride,
4216 TX_TYPE tx_type,
4217 TX_SIZE tx_size, int eob,
4218 const int bd) {
4219 __m256i buf1[64 * 8];
4220 int eobx, eoby;
4221 get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
4222 const int8_t *shift = inv_txfm_shift_ls[tx_size];
4223 const int txw_idx = get_txw_idx(tx_size);
4224 const int txh_idx = get_txh_idx(tx_size);
4225 const int txfm_size_col = tx_size_wide[tx_size];
4226 const int txfm_size_row = tx_size_high[tx_size];
4227 const int buf_size_w_div8 = txfm_size_col >> 3;
4228 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
4229 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
4230 const int input_stride = AOMMIN(32, txfm_size_col);
4231 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
4232 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
4233 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
4234 const transform_1d_avx2 row_txfm =
4235 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
4236 const transform_1d_avx2 col_txfm =
4237 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
4238
4239 assert(col_txfm != NULL);
4240 assert(row_txfm != NULL);
4241 int ud_flip, lr_flip;
4242 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4243
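  // The 2-D inverse transform is computed as two 1-D passes over 8-wide
  // strips: the row transform runs on the top-left non-zero region selected
  // by (eobx, eoby), its output is transposed into buf1 in column order
  // (honoring lr_flip), and the column transform plus the final down-shift
  // are then applied in place before reconstruction.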
4244   // 1st stage: row transform
4245 for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
4246 __m256i buf0[64];
4247 const int32_t *input_row = input + i * input_stride * 8;
4248 for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
4249 __m256i *buf0_cur = buf0 + j * 8;
4250 load_buffer_32x32(input_row + j * 8, buf0_cur, input_stride, 8);
4251
4252 transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
4253 }
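    // Rectangular blocks (2:1 or 1:2 aspect ratio) are pre-scaled by
    // 1/sqrt(2) here so the two 1-D passes keep the correct overall gain.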
4254 if (rect_type == 1 || rect_type == -1) {
4255 av1_round_shift_rect_array_32_avx2(
4256 buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
4257 }
4258 row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
4259
4260 __m256i *_buf1 = buf1 + i * 8;
4261 if (lr_flip) {
4262 for (int j = 0; j < buf_size_w_div8; ++j) {
4263 transpose_8x8_flip_avx2(
4264 &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
4265 }
4266 } else {
4267 for (int j = 0; j < buf_size_w_div8; ++j) {
4268 transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
4269 }
4270 }
4271 }
4272 // 2nd stage: column transform
4273 for (int i = 0; i < buf_size_w_div8; i++) {
4274 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
4275 inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
4276
4277 av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
4278 buf1 + i * txfm_size_row, txfm_size_row,
4279 -shift[1]);
4280 }
4281
4282   // Write the reconstruction: add the residual to the prediction and
       // clamp to [0, (1 << bd) - 1].
4283 if (txfm_size_col >= 16) {
4284 for (int i = 0; i < (txfm_size_col >> 4); i++) {
4285 highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
4286 output + 16 * i, stride, ud_flip,
4287 txfm_size_row, bd);
4288 }
4289 } else if (txfm_size_col == 8) {
4290 highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row,
4291 bd);
4292 }
4293 }
4294
4295 void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
4296 uint8_t *output, int stride,
4297 TX_TYPE tx_type, TX_SIZE tx_size,
4298 int eob, const int bd) {
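  // DCT/ADST/FLIPADST combinations are handled by the AVX2 path below;
  // identity and the 1-D (H_*/V_*) transform types fall back to the SSE4.1
  // implementation.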
4299 switch (tx_type) {
4300 case DCT_DCT:
4301 case ADST_DCT:
4302 case DCT_ADST:
4303 case ADST_ADST:
4304 case FLIPADST_DCT:
4305 case DCT_FLIPADST:
4306 case FLIPADST_FLIPADST:
4307 case ADST_FLIPADST:
4308 case FLIPADST_ADST:
4309 highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
4310 stride, tx_type, tx_size, eob, bd);
4311 break;
4312 case IDTX:
4313 case H_DCT:
4314 case H_ADST:
4315 case H_FLIPADST:
4316 case V_DCT:
4317 case V_ADST:
4318 case V_FLIPADST:
4319 av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
4320 tx_size, eob, bd);
4321 break;
4322 default: assert(0); break;
4323 }
4324 }
4325 void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
4326 int stride, const TxfmParam *txfm_param) {
4327 assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
4328 const TX_SIZE tx_size = txfm_param->tx_size;
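  // Transforms with a 4-sample dimension use the narrower SSE4.1 kernels;
  // all remaining sizes go through the AVX2 universe dispatch.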
4329 switch (tx_size) {
4330 case TX_4X8:
4331 av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
4332 break;
4333 case TX_8X4:
4334 av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
4335 break;
4336 case TX_4X4:
4337 av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
4338 break;
4339 case TX_16X4:
4340 av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
4341 break;
4342 case TX_4X16:
4343 av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
4344 break;
4345 default:
4346 av1_highbd_inv_txfm2d_add_universe_avx2(
4347 input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
4348 txfm_param->eob, txfm_param->bd);
4349 break;
4350 }
4351 }
4352