1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include "config/aom_config.h"
13 #include "config/av1_rtcd.h"
14
15 #include "av1/common/av1_inv_txfm1d_cfg.h"
16 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
17 #include "av1/common/x86/av1_txfm_sse2.h"
18
19 // TODO(venkatsanampudi@ittiam.com): move this to header file
20
21 // Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
// Fixed-point (4096 == 1.0) scaling factors: 5793 ~= 4096 * sqrt(2), so the
// entries are sqrt(2)^1 .. sqrt(2)^5 as listed in the comment above.
static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
                                          4 * 5793 };
24
25 // TODO(binpengsmail@gmail.com): replace some for loop with do {} while
26
// 4-point inverse transform on 16-bit coefficients, 8 lanes per __m128i.
// `input` and `output` each hold 4 vectors; input is consumed in
// bit-reversed order {0, 2, 1, 3}.
// NOTE(review): `cos_bit` and `__rounding` look unused here; presumably they
// are referenced by name from inside the btf_16_sse2 macro -- confirm before
// removing either local.
static void idct4_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  // Cosine constant table for the working precision.
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit cosine pairs for the butterfly rotations below.
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

  // stage 1: load input in bit-reversed order
  __m128i x[4];
  x[0] = input[0];
  x[1] = input[2];
  x[2] = input[1];
  x[3] = input[3];

  // stage 2: butterfly rotations
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);

  // stage 3: final add/sub into mirrored output pairs
  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
}
52
// 4-point inverse transform, narrow variant: identical dataflow to
// idct4_sse2 but uses the btf_16_4p_sse2 macro (presumably a 4-lane
// butterfly -- confirm against av1_txfm_sse2.h).
// NOTE(review): `cos_bit` and `__rounding` look unused here; presumably they
// are referenced by name inside the btf_16_4p_sse2 macro.
static void idct4_w4_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit cosine pairs for the butterfly rotations below.
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

  // stage 1: load input in bit-reversed order
  __m128i x[4];
  x[0] = input[0];
  x[1] = input[2];
  x[2] = input[1];
  x[3] = input[3];

  // stage 2: butterfly rotations
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);

  // stage 3: final add/sub into mirrored output pairs
  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
}
78
idct8_low1_ssse3(const __m128i * input,__m128i * output)79 static void idct8_low1_ssse3(const __m128i *input, __m128i *output) {
80 const int32_t *cospi = cospi_arr(INV_COS_BIT);
81
82 // stage 1
83 __m128i x[2];
84 x[0] = input[0];
85
86 // stage 2
87 // stage 3
88 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
89
90 // stage 4
91 // stage 5
92 output[0] = x[0];
93 output[7] = x[0];
94 output[1] = x[1];
95 output[6] = x[1];
96 output[2] = x[1];
97 output[5] = x[1];
98 output[3] = x[0];
99 output[4] = x[0];
100 }
101
// Full 8-point inverse transform, 8 lanes of 16-bit coefficients per
// __m128i.  Input is consumed in bit-reversed order {0,4,2,6,1,5,3,7}.
// NOTE(review): `cos_bit` and `__rounding` look unused; presumably they are
// referenced by name inside btf_16_sse2 -- confirm before removing.
static void idct8_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit cosine pairs for the butterfly rotations below.
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: load input in bit-reversed order
  __m128i x[8];
  x[0] = input[0];
  x[1] = input[4];
  x[2] = input[2];
  x[3] = input[6];
  x[4] = input[1];
  x[5] = input[5];
  x[6] = input[3];
  x[7] = input[7];

  // stage 2: rotations on the odd half
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);

  // stage 3: rotations on the even half, add/sub on the odd half
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);

  // stage 4
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);

  // stage 5: final add/sub into mirrored output pairs
  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}
149
// 8-point inverse transform, narrow variant: same dataflow as idct8_sse2
// but rotations use the btf_16_4p_sse2 macro (presumably a 4-lane
// butterfly -- confirm against av1_txfm_sse2.h).
// NOTE(review): `cos_bit` and `__rounding` look unused; presumably they are
// referenced by name inside btf_16_4p_sse2.
static void idct8_w4_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit cosine pairs for the butterfly rotations below.
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: load input in bit-reversed order
  __m128i x[8];
  x[0] = input[0];
  x[1] = input[4];
  x[2] = input[2];
  x[3] = input[6];
  x[4] = input[1];
  x[5] = input[5];
  x[6] = input[3];
  x[7] = input[7];

  // stage 2: rotations on the odd half
  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);

  // stage 3: rotations on the even half, add/sub on the odd half
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);

  // stage 4
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);

  // stage 5: final add/sub into mirrored output pairs
  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}
197
// Stage 5 of the 16-point inverse transform: add/sub merges on x[0..3] and
// x[8..15] plus a cospi[32] rotation on x[5]/x[6].  Shared by the full and
// reduced-coefficient idct16 paths.
// NOTE(review): `__rounding` and `cos_bit` appear unused here; presumably
// they are consumed by name inside btf_16_sse2 -- confirm before changing
// the signature.
static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
}
211
// Stage 6 of the 16-point inverse transform: add/sub merges on x[0..7] and
// cospi[32] rotations on the x[10..13] pairs.
// NOTE(review): `__rounding` and `cos_bit` appear unused here; presumably
// they are consumed by name inside btf_16_sse2.
static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
}
224
idct16_stage7_sse2(__m128i * output,__m128i * x)225 static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
226 btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
227 btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
228 btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
229 btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
230 btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
231 btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
232 btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
233 btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
234 }
235
idct16_low1_ssse3(const __m128i * input,__m128i * output)236 static void idct16_low1_ssse3(const __m128i *input, __m128i *output) {
237 const int32_t *cospi = cospi_arr(INV_COS_BIT);
238
239 // stage 1
240 __m128i x[2];
241 x[0] = input[0];
242
243 // stage 2
244 // stage 3
245 // stage 4
246 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
247
248 // stage 5
249 // stage 6
250 // stage 7
251 output[0] = x[0];
252 output[15] = x[0];
253 output[1] = x[1];
254 output[14] = x[1];
255 output[2] = x[1];
256 output[13] = x[1];
257 output[3] = x[0];
258 output[12] = x[0];
259 output[4] = x[0];
260 output[11] = x[0];
261 output[5] = x[1];
262 output[10] = x[1];
263 output[6] = x[1];
264 output[9] = x[1];
265 output[7] = x[0];
266 output[8] = x[0];
267 }
268
// 16-point inverse transform reading only the first 8 input vectors
// (positions 0..7), for blocks whose higher-frequency coefficients are
// zero.  Single-input half-butterflies (btf_16_ssse3) replace the full
// rotations of idct16_sse2 for the stages where one operand is zero.
// NOTE(review): `cos_bit` and `__rounding` appear unused; presumably they
// are referenced by name inside btf_16_sse2 within the stage helpers and
// here -- confirm before removing.
static void idct16_low8_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

  // stage 1: load the 8 available inputs into their butterfly slots
  __m128i x[16];
  x[0] = input[0];
  x[2] = input[4];
  x[4] = input[2];
  x[6] = input[6];
  x[8] = input[1];
  x[10] = input[5];
  x[12] = input[3];
  x[14] = input[7];

  // stage 2: half-butterflies on the odd half
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);

  // stage 3
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stages 5-7 are shared with the full-input path
  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage7_sse2(output, x);
}
314
// Full 16-point inverse transform, 8 lanes of 16-bit coefficients per
// __m128i.  Input is consumed in bit-reversed order
// {0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15}; stages 5-7 are delegated to the
// shared helpers above.
// NOTE(review): `cos_bit` and `__rounding` look unused; presumably they are
// referenced by name inside btf_16_sse2 -- confirm before removing.
static void idct16_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit cosine pairs for the butterfly rotations below.
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

  // stage 1: load input in bit-reversed order
  __m128i x[16];
  x[0] = input[0];
  x[1] = input[8];
  x[2] = input[4];
  x[3] = input[12];
  x[4] = input[2];
  x[5] = input[10];
  x[6] = input[6];
  x[7] = input[14];
  x[8] = input[1];
  x[9] = input[9];
  x[10] = input[5];
  x[11] = input[13];
  x[12] = input[3];
  x[13] = input[11];
  x[14] = input[7];
  x[15] = input[15];

  // stage 2: rotations on the odd half
  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);

  // stage 3
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stage 5~7
  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage7_sse2(output, x);
}
386
// 16-point inverse transform, narrow variant: same dataflow as idct16_sse2
// but rotations use btf_16_4p_sse2.  Stages 5 and 6 are written out inline
// here (instead of calling idct16_stage5/6_sse2) because those helpers use
// the full-width btf_16_sse2 macro.
// NOTE(review): `cos_bit` and `__rounding` look unused; presumably they are
// referenced by name inside btf_16_4p_sse2 -- confirm before removing.
static void idct16_w4_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit cosine pairs for the butterfly rotations below.
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: load input in bit-reversed order
  __m128i x[16];
  x[0] = input[0];
  x[1] = input[8];
  x[2] = input[4];
  x[3] = input[12];
  x[4] = input[2];
  x[5] = input[10];
  x[6] = input[6];
  x[7] = input[14];
  x[8] = input[1];
  x[9] = input[9];
  x[10] = input[5];
  x[11] = input[13];
  x[12] = input[3];
  x[13] = input[11];
  x[14] = input[7];
  x[15] = input[15];

  // stage 2: rotations on the odd half
  btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);

  // stage 3
  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stage 5 (inline 4-lane version of idct16_stage5_sse2)
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);

  // stage 6 (inline 4-lane version of idct16_stage6_sse2)
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);

  // stage 7
  idct16_stage7_sse2(output, x);
}
474
idct32_high16_stage3_sse2(__m128i * x)475 static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
476 btf_16_adds_subs_sse2(x[16], x[17]);
477 btf_16_subs_adds_sse2(x[19], x[18]);
478 btf_16_adds_subs_sse2(x[20], x[21]);
479 btf_16_subs_adds_sse2(x[23], x[22]);
480 btf_16_adds_subs_sse2(x[24], x[25]);
481 btf_16_subs_adds_sse2(x[27], x[26]);
482 btf_16_adds_subs_sse2(x[28], x[29]);
483 btf_16_subs_adds_sse2(x[31], x[30]);
484 }
485
// Stage 4 of the 32-point inverse transform, upper half: four butterfly
// rotations on the (x[17],x[30]) .. (x[22],x[25]) pairs.
// NOTE(review): `__rounding` and `cos_bit` appear unused here; presumably
// they are consumed by name inside btf_16_sse2 -- confirm before changing
// the signature.
static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
}
500
// Stage 5 of the 32-point inverse transform for x[8..31]: rotations on the
// (x[9],x[14]) and (x[10],x[13]) pairs plus add/sub merges on x[16..31].
// NOTE(review): `__rounding` and `cos_bit` appear unused here; presumably
// they are consumed by name inside btf_16_sse2.
static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  btf_16_adds_subs_sse2(x[16], x[19]);
  btf_16_adds_subs_sse2(x[17], x[18]);
  btf_16_subs_adds_sse2(x[23], x[20]);
  btf_16_subs_adds_sse2(x[22], x[21]);
  btf_16_adds_subs_sse2(x[24], x[27]);
  btf_16_adds_subs_sse2(x[25], x[26]);
  btf_16_subs_adds_sse2(x[31], x[28]);
  btf_16_subs_adds_sse2(x[30], x[29]);
}
518
// Stage 6 of the 32-point inverse transform for x[4..31]: a cospi[32]
// rotation on (x[5],x[6]), add/sub merges on x[8..15], and four rotations
// on the x[18..21] / x[26..29] pairs.
// NOTE(review): `__rounding` and `cos_bit` appear unused here; presumably
// they are consumed by name inside btf_16_sse2.
static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
}
537
// Stage 7 of the 32-point inverse transform: add/sub merges on x[0..7] and
// x[16..31], plus cospi[32] rotations on the (x[10],x[13]) and
// (x[11],x[12]) pairs.
// NOTE(review): `__rounding` and `cos_bit` appear unused here; presumably
// they are consumed by name inside btf_16_sse2.
static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  btf_16_adds_subs_sse2(x[16], x[23]);
  btf_16_adds_subs_sse2(x[17], x[22]);
  btf_16_adds_subs_sse2(x[18], x[21]);
  btf_16_adds_subs_sse2(x[19], x[20]);
  btf_16_subs_adds_sse2(x[31], x[24]);
  btf_16_subs_adds_sse2(x[30], x[25]);
  btf_16_subs_adds_sse2(x[29], x[26]);
  btf_16_subs_adds_sse2(x[28], x[27]);
}
558
// Stage 8 of the 32-point inverse transform: add/sub merges across the
// (x[i], x[15-i]) pairs and cospi[32] rotations on the x[20..23] /
// x[24..27] pairs.
// NOTE(review): `__rounding` and `cos_bit` appear unused here; presumably
// they are consumed by name inside btf_16_sse2.
static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[15]);
  btf_16_adds_subs_sse2(x[1], x[14]);
  btf_16_adds_subs_sse2(x[2], x[13]);
  btf_16_adds_subs_sse2(x[3], x[12]);
  btf_16_adds_subs_sse2(x[4], x[11]);
  btf_16_adds_subs_sse2(x[5], x[10]);
  btf_16_adds_subs_sse2(x[6], x[9]);
  btf_16_adds_subs_sse2(x[7], x[8]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
}
577
idct32_stage9_sse2(__m128i * output,__m128i * x)578 static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
579 btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
580 btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
581 btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
582 btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
583 btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
584 btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
585 btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
586 btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
587 btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
588 btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
589 btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
590 btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
591 btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
592 btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
593 btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
594 btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
595 }
596
idct32_low1_ssse3(const __m128i * input,__m128i * output)597 static void idct32_low1_ssse3(const __m128i *input, __m128i *output) {
598 const int32_t *cospi = cospi_arr(INV_COS_BIT);
599
600 // stage 1
601 __m128i x[2];
602 x[0] = input[0];
603
604 // stage 2
605 // stage 3
606 // stage 4
607 // stage 5
608 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
609
610 // stage 6
611 // stage 7
612 // stage 8
613 // stage 9
614 output[0] = x[0];
615 output[31] = x[0];
616 output[1] = x[1];
617 output[30] = x[1];
618 output[2] = x[1];
619 output[29] = x[1];
620 output[3] = x[0];
621 output[28] = x[0];
622 output[4] = x[0];
623 output[27] = x[0];
624 output[5] = x[1];
625 output[26] = x[1];
626 output[6] = x[1];
627 output[25] = x[1];
628 output[7] = x[0];
629 output[24] = x[0];
630 output[8] = x[0];
631 output[23] = x[0];
632 output[9] = x[1];
633 output[22] = x[1];
634 output[10] = x[1];
635 output[21] = x[1];
636 output[11] = x[0];
637 output[20] = x[0];
638 output[12] = x[0];
639 output[19] = x[0];
640 output[13] = x[1];
641 output[18] = x[1];
642 output[14] = x[1];
643 output[17] = x[1];
644 output[15] = x[0];
645 output[16] = x[0];
646 }
647
// 32-point inverse transform reading only the first 8 input vectors, for
// blocks whose higher-frequency coefficients are zero.  Zero-operand
// rotations collapse to single-input half-butterflies (btf_16_ssse3) or to
// plain copies (the x[a] = x[b] assignments below).
// NOTE(review): `cos_bit` and `__rounding` appear unused; presumably they
// are referenced by name inside btf_16_sse2 within the stage helpers --
// confirm before removing.
static void idct32_low8_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: load the 8 available inputs into their butterfly slots
  __m128i x[32];
  x[0] = input[0];
  x[4] = input[4];
  x[8] = input[2];
  x[12] = input[6];
  x[16] = input[1];
  x[20] = input[5];
  x[24] = input[3];
  x[28] = input[7];

  // stage 2: half-butterflies on the odd half
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3: half-butterflies plus copies standing in for add/sub with zero
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];

  // stage 4
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
  // stage 6
  x[3] = x[0];
  x[2] = x[1];
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stages 7-9 are shared with the full-input path
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}
704
// 32-point inverse transform reading only the first 16 input vectors, for
// blocks whose higher-frequency coefficients are zero.  Rotations with a
// zero operand collapse to single-input half-butterflies (btf_16_ssse3).
// NOTE(review): `cos_bit` and `__rounding` appear unused; presumably they
// are referenced by name inside btf_16_sse2 within the stage helpers --
// confirm before removing.
static void idct32_low16_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: load the 16 available inputs into their butterfly slots
  __m128i x[32];
  x[0] = input[0];
  x[2] = input[8];
  x[4] = input[4];
  x[6] = input[12];
  x[8] = input[2];
  x[10] = input[10];
  x[12] = input[6];
  x[14] = input[14];
  x[16] = input[1];
  x[18] = input[9];
  x[20] = input[5];
  x[22] = input[13];
  x[24] = input[3];
  x[26] = input[11];
  x[28] = input[7];
  x[30] = input[15];

  // stage 2: half-butterflies on the odd half
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  idct32_high16_stage3_sse2(x);

  // stage 4
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stages 7-9 are shared with the full-input path
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}
770
// 32-point inverse DCT over 8 columns of 16-bit coefficients (one __m128i
// carries 8 lanes).  'input' holds all 32 coefficients, read below in the
// interleaved order the butterfly network consumes; 'output' receives the
// 32 natural-order results.  Stages 1-6 are inlined; the shared stage 7-9
// helpers finish the transform and write 'output'.
static void idct32_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // 0.5 in Q(INV_COS_BIT); rounding term for the btf_16_sse2 butterflies.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Cosine weights packed as interleaved 16-bit (w0, w1) lane pairs, named
  // cospi_<sign><idx0>_<sign><idx1> after the two table entries they carry.
  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

  // stage 1: permute the input into butterfly order (even/odd split applied
  // recursively, then the odd-indexed coefficients for the high half).
  __m128i x[32];
  x[0] = input[0];
  x[1] = input[16];
  x[2] = input[8];
  x[3] = input[24];
  x[4] = input[4];
  x[5] = input[20];
  x[6] = input[12];
  x[7] = input[28];
  x[8] = input[2];
  x[9] = input[18];
  x[10] = input[10];
  x[11] = input[26];
  x[12] = input[6];
  x[13] = input[22];
  x[14] = input[14];
  x[15] = input[30];
  x[16] = input[1];
  x[17] = input[17];
  x[18] = input[9];
  x[19] = input[25];
  x[20] = input[5];
  x[21] = input[21];
  x[22] = input[13];
  x[23] = input[29];
  x[24] = input[3];
  x[25] = input[19];
  x[26] = input[11];
  x[27] = input[27];
  x[28] = input[7];
  x[29] = input[23];
  x[30] = input[15];
  x[31] = input[31];

  // stage 2: rotations on the high half x[16..31].
  btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
  btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
  btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);

  // stage 3: rotations on x[8..15]; add/sub groups on x[16..31] via helper.
  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
  idct32_high16_stage3_sse2(x);

  // stage 4
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_adds_subs_sse2(x[7], x[6]);
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stages 7-9: shared tail; stage 9 writes the final outputs.
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}
887
// Stage 4 of the 64-point inverse DCT, high half only (x[32..63]): eight
// paired rotations on the elements left in butterfly-pair form by stage 3.
// __rounding and cos_bit are presumably referenced by name inside the
// btf_16_sse2 macro — they are not used textually here; confirm against
// av1_txfm_sse2.h.
static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  // Cosine weights packed as interleaved 16-bit (w0, w1) lane pairs.
  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
  const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
  const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
  // Each call rotates the mirrored pair (x[k], x[95-k]) in place.
  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
  btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
  btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
}
912
// Stage 5 of the 64-point inverse DCT for x[16..63]: four rotations on the
// x[16..31] quarter, then saturating add/sub butterflies over the four
// 8-element groups of x[32..63] (adds on the low pairs of each group,
// mirrored subs on the high pairs).
static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  // Add/sub groups: (base+0,base+3), (base+1,base+2) get sum/diff; the
  // mirrored (base+7,base+4), (base+6,base+5) pairs likewise, for
  // base = 32, 40, 48, 56.
  btf_16_adds_subs_sse2(x[32], x[35]);
  btf_16_adds_subs_sse2(x[33], x[34]);
  btf_16_subs_adds_sse2(x[39], x[36]);
  btf_16_subs_adds_sse2(x[38], x[37]);
  btf_16_adds_subs_sse2(x[40], x[43]);
  btf_16_adds_subs_sse2(x[41], x[42]);
  btf_16_subs_adds_sse2(x[47], x[44]);
  btf_16_subs_adds_sse2(x[46], x[45]);
  btf_16_adds_subs_sse2(x[48], x[51]);
  btf_16_adds_subs_sse2(x[49], x[50]);
  btf_16_subs_adds_sse2(x[55], x[52]);
  btf_16_subs_adds_sse2(x[54], x[53]);
  btf_16_adds_subs_sse2(x[56], x[59]);
  btf_16_adds_subs_sse2(x[57], x[58]);
  btf_16_subs_adds_sse2(x[63], x[60]);
  btf_16_subs_adds_sse2(x[62], x[61]);
}
943
// Stage 6 of the 64-point inverse DCT, high half only (x[32..63]): eight
// rotations, each weight pair applied to two adjacent mirrored element
// pairs (e.g. (34,61) and (35,60) share cospi_m08_p56/cospi_p56_p08).
static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
}
962
idct64_stage6_high48_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)963 static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
964 const __m128i __rounding,
965 int8_t cos_bit) {
966 btf_16_adds_subs_sse2(x[16], x[19]);
967 btf_16_adds_subs_sse2(x[17], x[18]);
968 btf_16_subs_adds_sse2(x[23], x[20]);
969 btf_16_subs_adds_sse2(x[22], x[21]);
970 btf_16_adds_subs_sse2(x[24], x[27]);
971 btf_16_adds_subs_sse2(x[25], x[26]);
972 btf_16_subs_adds_sse2(x[31], x[28]);
973 btf_16_subs_adds_sse2(x[30], x[29]);
974 idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
975 }
976
// Stage 7 of the 64-point inverse DCT for x[16..63]: +/-pi/8-family
// rotations on the middle of x[16..31], then add/sub butterflies pairing
// x[32..39] with x[47..40] and x[48..55] with x[63..56].
static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
  // x[32+i] +/- x[39-i] and the mirrored x[47-i] / x[40+i] pairs.
  btf_16_adds_subs_sse2(x[32], x[39]);
  btf_16_adds_subs_sse2(x[33], x[38]);
  btf_16_adds_subs_sse2(x[34], x[37]);
  btf_16_adds_subs_sse2(x[35], x[36]);
  btf_16_subs_adds_sse2(x[47], x[40]);
  btf_16_subs_adds_sse2(x[46], x[41]);
  btf_16_subs_adds_sse2(x[45], x[42]);
  btf_16_subs_adds_sse2(x[44], x[43]);
  // Same pattern for the x[48..63] group.
  btf_16_adds_subs_sse2(x[48], x[55]);
  btf_16_adds_subs_sse2(x[49], x[54]);
  btf_16_adds_subs_sse2(x[50], x[53]);
  btf_16_adds_subs_sse2(x[51], x[52]);
  btf_16_subs_adds_sse2(x[63], x[56]);
  btf_16_subs_adds_sse2(x[62], x[57]);
  btf_16_subs_adds_sse2(x[61], x[58]);
  btf_16_subs_adds_sse2(x[60], x[59]);
}
1004
// Stage 8 of the 64-point inverse DCT for x[16..63]: add/sub butterflies
// folding x[16..23] against x[31..24], followed by eight rotations across
// the centre of the x[32..63] half.
static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_adds_subs_sse2(x[16], x[23]);
  btf_16_adds_subs_sse2(x[17], x[22]);
  btf_16_adds_subs_sse2(x[18], x[21]);
  btf_16_adds_subs_sse2(x[19], x[20]);
  btf_16_subs_adds_sse2(x[31], x[24]);
  btf_16_subs_adds_sse2(x[30], x[25]);
  btf_16_subs_adds_sse2(x[29], x[26]);
  btf_16_subs_adds_sse2(x[28], x[27]);
  // Rotate the mirrored pairs (36..43, 59..52) with shared weight pairs.
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
}
1028
idct64_stage9_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1029 static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
1030 const __m128i __rounding,
1031 int8_t cos_bit) {
1032 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1033 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1034 btf_16_adds_subs_sse2(x[0], x[15]);
1035 btf_16_adds_subs_sse2(x[1], x[14]);
1036 btf_16_adds_subs_sse2(x[2], x[13]);
1037 btf_16_adds_subs_sse2(x[3], x[12]);
1038 btf_16_adds_subs_sse2(x[4], x[11]);
1039 btf_16_adds_subs_sse2(x[5], x[10]);
1040 btf_16_adds_subs_sse2(x[6], x[9]);
1041 btf_16_adds_subs_sse2(x[7], x[8]);
1042 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
1043 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
1044 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
1045 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
1046 btf_16_adds_subs_sse2(x[32], x[47]);
1047 btf_16_adds_subs_sse2(x[33], x[46]);
1048 btf_16_adds_subs_sse2(x[34], x[45]);
1049 btf_16_adds_subs_sse2(x[35], x[44]);
1050 btf_16_adds_subs_sse2(x[36], x[43]);
1051 btf_16_adds_subs_sse2(x[37], x[42]);
1052 btf_16_adds_subs_sse2(x[38], x[41]);
1053 btf_16_adds_subs_sse2(x[39], x[40]);
1054 btf_16_subs_adds_sse2(x[63], x[48]);
1055 btf_16_subs_adds_sse2(x[62], x[49]);
1056 btf_16_subs_adds_sse2(x[61], x[50]);
1057 btf_16_subs_adds_sse2(x[60], x[51]);
1058 btf_16_subs_adds_sse2(x[59], x[52]);
1059 btf_16_subs_adds_sse2(x[58], x[53]);
1060 btf_16_subs_adds_sse2(x[57], x[54]);
1061 btf_16_subs_adds_sse2(x[56], x[55]);
1062 }
1063
idct64_stage10_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1064 static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
1065 const __m128i __rounding,
1066 int8_t cos_bit) {
1067 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1068 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1069 btf_16_adds_subs_sse2(x[0], x[31]);
1070 btf_16_adds_subs_sse2(x[1], x[30]);
1071 btf_16_adds_subs_sse2(x[2], x[29]);
1072 btf_16_adds_subs_sse2(x[3], x[28]);
1073 btf_16_adds_subs_sse2(x[4], x[27]);
1074 btf_16_adds_subs_sse2(x[5], x[26]);
1075 btf_16_adds_subs_sse2(x[6], x[25]);
1076 btf_16_adds_subs_sse2(x[7], x[24]);
1077 btf_16_adds_subs_sse2(x[8], x[23]);
1078 btf_16_adds_subs_sse2(x[9], x[22]);
1079 btf_16_adds_subs_sse2(x[10], x[21]);
1080 btf_16_adds_subs_sse2(x[11], x[20]);
1081 btf_16_adds_subs_sse2(x[12], x[19]);
1082 btf_16_adds_subs_sse2(x[13], x[18]);
1083 btf_16_adds_subs_sse2(x[14], x[17]);
1084 btf_16_adds_subs_sse2(x[15], x[16]);
1085 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
1086 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
1087 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
1088 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
1089 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
1090 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
1091 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
1092 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
1093 }
1094
idct64_stage11_sse2(__m128i * output,__m128i * x)1095 static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
1096 btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
1097 btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
1098 btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
1099 btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
1100 btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
1101 btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
1102 btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
1103 btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
1104 btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
1105 btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
1106 btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
1107 btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
1108 btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
1109 btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
1110 btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
1111 btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
1112 btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
1113 btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
1114 btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
1115 btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
1116 btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
1117 btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
1118 btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
1119 btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
1120 btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
1121 btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
1122 btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
1123 btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
1124 btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
1125 btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
1126 btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
1127 btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
1128 }
1129
idct64_low1_ssse3(const __m128i * input,__m128i * output)1130 static void idct64_low1_ssse3(const __m128i *input, __m128i *output) {
1131 const int32_t *cospi = cospi_arr(INV_COS_BIT);
1132
1133 // stage 1
1134 __m128i x[32];
1135 x[0] = input[0];
1136
1137 // stage 2
1138 // stage 3
1139 // stage 4
1140 // stage 5
1141 // stage 6
1142 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
1143
1144 // stage 7
1145 // stage 8
1146 // stage 9
1147 // stage 10
1148 // stage 11
1149 output[0] = x[0];
1150 output[63] = x[0];
1151 output[1] = x[1];
1152 output[62] = x[1];
1153 output[2] = x[1];
1154 output[61] = x[1];
1155 output[3] = x[0];
1156 output[60] = x[0];
1157 output[4] = x[0];
1158 output[59] = x[0];
1159 output[5] = x[1];
1160 output[58] = x[1];
1161 output[6] = x[1];
1162 output[57] = x[1];
1163 output[7] = x[0];
1164 output[56] = x[0];
1165 output[8] = x[0];
1166 output[55] = x[0];
1167 output[9] = x[1];
1168 output[54] = x[1];
1169 output[10] = x[1];
1170 output[53] = x[1];
1171 output[11] = x[0];
1172 output[52] = x[0];
1173 output[12] = x[0];
1174 output[51] = x[0];
1175 output[13] = x[1];
1176 output[50] = x[1];
1177 output[14] = x[1];
1178 output[49] = x[1];
1179 output[15] = x[0];
1180 output[48] = x[0];
1181 output[16] = x[0];
1182 output[47] = x[0];
1183 output[17] = x[1];
1184 output[46] = x[1];
1185 output[18] = x[1];
1186 output[45] = x[1];
1187 output[19] = x[0];
1188 output[44] = x[0];
1189 output[20] = x[0];
1190 output[43] = x[0];
1191 output[21] = x[1];
1192 output[42] = x[1];
1193 output[22] = x[1];
1194 output[41] = x[1];
1195 output[23] = x[0];
1196 output[40] = x[0];
1197 output[24] = x[0];
1198 output[39] = x[0];
1199 output[25] = x[1];
1200 output[38] = x[1];
1201 output[26] = x[1];
1202 output[37] = x[1];
1203 output[27] = x[0];
1204 output[36] = x[0];
1205 output[28] = x[0];
1206 output[35] = x[0];
1207 output[29] = x[1];
1208 output[34] = x[1];
1209 output[30] = x[1];
1210 output[33] = x[1];
1211 output[31] = x[0];
1212 output[32] = x[0];
1213 }
1214
// 64-point inverse DCT specialized for inputs where only the first 8
// coefficients (input[0..7]) are non-zero.  Butterflies whose second
// operand would be zero collapse to single-input half-butterflies
// (btf_16_ssse3) or plain lane copies, which is why the early stages mix
// rotations with bare assignments.
static void idct64_low8_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // 0.5 in Q(INV_COS_BIT); rounding term for the btf_16_sse2 butterflies.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: scatter the 8 live coefficients into butterfly positions.
  __m128i x[64];
  x[0] = input[0];
  x[8] = input[4];
  x[16] = input[2];
  x[24] = input[6];
  x[32] = input[1];
  x[40] = input[5];
  x[48] = input[3];
  x[56] = input[7];

  // stage 2: half-butterflies — one live input produces both outputs.
  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3: half-butterflies on x[16]/x[24]; the add/sub stage on the high
  // half degenerates to copies because the partner elements are zero.
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[38] = x[39];
  x[41] = x[40];
  x[46] = x[47];
  x[49] = x[48];
  x[54] = x[55];
  x[57] = x[56];
  x[62] = x[63];

  // stage 4
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  x[17] = x[16];
  x[22] = x[23];
  x[25] = x[24];
  x[30] = x[31];
  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);

  // stage 5
  x[9] = x[8];
  x[14] = x[15];
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  x[35] = x[32];
  x[34] = x[33];
  x[36] = x[39];
  x[37] = x[38];
  x[43] = x[40];
  x[42] = x[41];
  x[44] = x[47];
  x[45] = x[46];
  x[51] = x[48];
  x[50] = x[49];
  x[52] = x[55];
  x[53] = x[54];
  x[59] = x[56];
  x[58] = x[57];
  x[60] = x[63];
  x[61] = x[62];

  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  x[19] = x[16];
  x[18] = x[17];
  x[20] = x[23];
  x[21] = x[22];
  x[27] = x[24];
  x[26] = x[25];
  x[28] = x[31];
  x[29] = x[30];
  idct32_high16_stage3_sse2 is not used here; the shared stage-6 helper
  idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  x[11] = x[8];
  x[10] = x[9];
  x[12] = x[15];
  x[13] = x[14];
  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 8
  x[7] = x[0];
  x[6] = x[1];
  x[5] = x[2];
  x[4] = x[3];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

  // stages 9-11: shared tail; stage 11 writes the outputs.
  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage11_sse2(output, x);
}
1333
// 64-point inverse DCT specialized for inputs where only the first 16
// coefficients (input[0..15]) are non-zero.  As in the low8 variant,
// butterflies with a zero partner become single-input half-butterflies
// (btf_16_ssse3) or lane copies in the early stages.
static void idct64_low16_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // 0.5 in Q(INV_COS_BIT); rounding term for the btf_16_sse2 butterflies.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: scatter the 16 live coefficients into butterfly positions.
  __m128i x[64];
  x[0] = input[0];
  x[4] = input[8];
  x[8] = input[4];
  x[12] = input[12];
  x[16] = input[2];
  x[20] = input[10];
  x[24] = input[6];
  x[28] = input[14];
  x[32] = input[1];
  x[36] = input[9];
  x[40] = input[5];
  x[44] = input[13];
  x[48] = input[3];
  x[52] = input[11];
  x[56] = input[7];
  x[60] = input[15];

  // stage 2: half-butterflies on the x[32..63] half.
  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3: half-butterflies on x[16..31]; add/sub on the high half
  // degenerates to copies (partner elements are zero).
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[34] = x[35];
  x[37] = x[36];
  x[38] = x[39];
  x[41] = x[40];
  x[42] = x[43];
  x[45] = x[44];
  x[46] = x[47];
  x[49] = x[48];
  x[50] = x[51];
  x[53] = x[52];
  x[54] = x[55];
  x[57] = x[56];
  x[58] = x[59];
  x[61] = x[60];
  x[62] = x[63];

  // stage 4
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];
  idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 8
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

  // stages 9-11: shared tail; stage 11 writes the outputs.
  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage11_sse2(output, x);
}
1448
idct64_low32_ssse3(const __m128i * input,__m128i * output)1449 static void idct64_low32_ssse3(const __m128i *input, __m128i *output) {
1450 const int8_t cos_bit = INV_COS_BIT;
1451 const int32_t *cospi = cospi_arr(INV_COS_BIT);
1452 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1453
1454 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1455 const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
1456 const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
1457 const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
1458 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1459
1460 // stage 1
1461 __m128i x[64];
1462 x[0] = input[0];
1463 x[2] = input[16];
1464 x[4] = input[8];
1465 x[6] = input[24];
1466 x[8] = input[4];
1467 x[10] = input[20];
1468 x[12] = input[12];
1469 x[14] = input[28];
1470 x[16] = input[2];
1471 x[18] = input[18];
1472 x[20] = input[10];
1473 x[22] = input[26];
1474 x[24] = input[6];
1475 x[26] = input[22];
1476 x[28] = input[14];
1477 x[30] = input[30];
1478 x[32] = input[1];
1479 x[34] = input[17];
1480 x[36] = input[9];
1481 x[38] = input[25];
1482 x[40] = input[5];
1483 x[42] = input[21];
1484 x[44] = input[13];
1485 x[46] = input[29];
1486 x[48] = input[3];
1487 x[50] = input[19];
1488 x[52] = input[11];
1489 x[54] = input[27];
1490 x[56] = input[7];
1491 x[58] = input[23];
1492 x[60] = input[15];
1493 x[62] = input[31];
1494
1495 // stage 2
1496 btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
1497 btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
1498 btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
1499 btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
1500 btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
1501 btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
1502 btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
1503 btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
1504 btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
1505 btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
1506 btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
1507 btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
1508 btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
1509 btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
1510 btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
1511 btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
1512
1513 // stage 3
1514 btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
1515 btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
1516 btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
1517 btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
1518 btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
1519 btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
1520 btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
1521 btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
1522 btf_16_adds_subs_sse2(x[32], x[33]);
1523 btf_16_subs_adds_sse2(x[35], x[34]);
1524 btf_16_adds_subs_sse2(x[36], x[37]);
1525 btf_16_subs_adds_sse2(x[39], x[38]);
1526 btf_16_adds_subs_sse2(x[40], x[41]);
1527 btf_16_subs_adds_sse2(x[43], x[42]);
1528 btf_16_adds_subs_sse2(x[44], x[45]);
1529 btf_16_subs_adds_sse2(x[47], x[46]);
1530 btf_16_adds_subs_sse2(x[48], x[49]);
1531 btf_16_subs_adds_sse2(x[51], x[50]);
1532 btf_16_adds_subs_sse2(x[52], x[53]);
1533 btf_16_subs_adds_sse2(x[55], x[54]);
1534 btf_16_adds_subs_sse2(x[56], x[57]);
1535 btf_16_subs_adds_sse2(x[59], x[58]);
1536 btf_16_adds_subs_sse2(x[60], x[61]);
1537 btf_16_subs_adds_sse2(x[63], x[62]);
1538
1539 // stage 4
1540 btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
1541 btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
1542 btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
1543 btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
1544 btf_16_adds_subs_sse2(x[16], x[17]);
1545 btf_16_subs_adds_sse2(x[19], x[18]);
1546 btf_16_adds_subs_sse2(x[20], x[21]);
1547 btf_16_subs_adds_sse2(x[23], x[22]);
1548 btf_16_adds_subs_sse2(x[24], x[25]);
1549 btf_16_subs_adds_sse2(x[27], x[26]);
1550 btf_16_adds_subs_sse2(x[28], x[29]);
1551 btf_16_subs_adds_sse2(x[31], x[30]);
1552 idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
1553
1554 // stage 5
1555 btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
1556 btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
1557 btf_16_adds_subs_sse2(x[8], x[9]);
1558 btf_16_subs_adds_sse2(x[11], x[10]);
1559 btf_16_adds_subs_sse2(x[12], x[13]);
1560 btf_16_subs_adds_sse2(x[15], x[14]);
1561 idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
1562
1563 // stage 6
1564 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
1565 btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
1566 btf_16_adds_subs_sse2(x[4], x[5]);
1567 btf_16_subs_adds_sse2(x[7], x[6]);
1568 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
1569 btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
1570 idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
1571
1572 // stage 7
1573 btf_16_adds_subs_sse2(x[0], x[3]);
1574 btf_16_adds_subs_sse2(x[1], x[2]);
1575 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
1576 btf_16_adds_subs_sse2(x[8], x[11]);
1577 btf_16_adds_subs_sse2(x[9], x[10]);
1578 btf_16_subs_adds_sse2(x[15], x[12]);
1579 btf_16_subs_adds_sse2(x[14], x[13]);
1580 idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
1581
1582 // stage 8
1583 btf_16_adds_subs_sse2(x[0], x[7]);
1584 btf_16_adds_subs_sse2(x[1], x[6]);
1585 btf_16_adds_subs_sse2(x[2], x[5]);
1586 btf_16_adds_subs_sse2(x[3], x[4]);
1587 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
1588 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
1589 idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
1590
1591 // stage 9~11
1592 idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
1593 idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
1594 idct64_stage11_sse2(output, x);
1595 }
1596
// 4-point inverse ADST over 8 columns (one __m128i row = 8 int16 lanes).
// Each output is a sum of sinpi-weighted products, computed as 32-bit
// _mm_madd_epi16 dot products and rounded back down to 16 bits at the end.
static void iadst4_sse2(const __m128i *input, __m128i *output) {
  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
  __m128i x0[4];
  x0[0] = input[0];
  x0[1] = input[1];
  x0[2] = input[2];
  x0[3] = input[3];

  // Interleave rows (0,2) and (1,3) so every 32-bit lane holds a pair of
  // 16-bit coefficients, matching the pair constants for _mm_madd_epi16.
  __m128i u[4];
  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
  u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
  u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
  u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);

  // Even/odd indices hold the low/high 4 columns of each dot product.
  __m128i x1[16];
  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
  x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
  x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
  x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
  x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02);  // x1*sin3 + x3*sin2
  x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
  x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04);  // x1*sin3 - x3*sin4
  x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
  x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
  x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
  x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x3*sin3
  x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
  x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
  x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
  x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
  x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);

  __m128i x2[8];
  x2[0] = _mm_add_epi32(x1[0], x1[4]);  // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
  x2[1] = _mm_add_epi32(x1[1], x1[5]);
  x2[2] = _mm_add_epi32(x1[2], x1[6]);  // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
  x2[3] = _mm_add_epi32(x1[3], x1[7]);
  x2[4] = _mm_add_epi32(x1[8], x1[10]);  // x0*sin3 -x2*sin3 +x3*sin3
  x2[5] = _mm_add_epi32(x1[9], x1[11]);
  x2[6] = _mm_add_epi32(x1[12], x1[14]);  // x0*sin4 +x2*sin2 -x1*sin3 -x3*sin1
  x2[7] = _mm_add_epi32(x1[13], x1[15]);

  // Round, shift back to coefficient precision, and saturate-pack the two
  // 4-wide 32-bit halves into one 8-wide 16-bit output row.
  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  for (int i = 0; i < 4; ++i) {
    __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
    __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
    out1 = _mm_srai_epi32(out1, INV_COS_BIT);
    output[i] = _mm_packs_epi32(out0, out1);
  }
}
1656
// 4-point inverse ADST for 4-wide blocks: only the low 4 lanes of each row
// carry data, so a single unpacklo per coefficient pair is enough (half the
// madd work of iadst4_sse2).
static void iadst4_w4_sse2(const __m128i *input, __m128i *output) {
  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
  __m128i x0[4];
  x0[0] = input[0];
  x0[1] = input[1];
  x0[2] = input[2];
  x0[3] = input[3];

  // Pair rows (0,2) and (1,3) into 32-bit lanes for _mm_madd_epi16.
  __m128i u[2];
  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
  u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);

  __m128i x1[8];
  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
  x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
  x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02);  // x1*sin3 + x3*sin2
  x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04);  // x1*sin3 - x3*sin4
  x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
  x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03);    // x3*sin3
  x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
  x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01);  // -x1*sin3 - x3*sin1

  __m128i x2[4];
  x2[0] = _mm_add_epi32(x1[0], x1[2]);  // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
  x2[1] = _mm_add_epi32(x1[1], x1[3]);  // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
  x2[2] = _mm_add_epi32(x1[4], x1[5]);  // x0*sin3 - x2*sin3 + x3*sin3
  x2[3] = _mm_add_epi32(x1[6], x1[7]);  // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1

  // Round/shift and pack; the result is duplicated into both halves but
  // callers only consume the low 4 lanes.
  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  for (int i = 0; i < 4; ++i) {
    __m128i out0 = _mm_add_epi32(x2[i], rounding);
    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
    output[i] = _mm_packs_epi32(out0, out0);
  }
}
1700
// 8-point inverse ADST when only the DC coefficient (input[0]) can be
// nonzero.  With a single live input the butterfly stages collapse: the
// add/sub stages of the full iadst8 reduce to plain copies (other operand
// is zero), leaving only the rotations.
static void iadst8_low1_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __zero = _mm_setzero_si128();
  // __rounding and cos_bit are consumed inside the btf_16_* macros.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1
  __m128i x[8];
  x[1] = input[0];

  // stage 2
  btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);

  // stage 3 (adds/subs with zero -> copies)
  x[4] = x[0];
  x[5] = x[1];

  // stage 4
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);

  // stage 5 (adds/subs with zero -> copies)
  x[2] = x[0];
  x[3] = x[1];
  x[6] = x[4];
  x[7] = x[5];

  // stage 6
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

  // stage 7: output permutation with alternating negation.
  output[0] = x[0];
  output[1] = _mm_subs_epi16(__zero, x[4]);
  output[2] = x[6];
  output[3] = _mm_subs_epi16(__zero, x[2]);
  output[4] = x[3];
  output[5] = _mm_subs_epi16(__zero, x[7]);
  output[6] = x[5];
  output[7] = _mm_subs_epi16(__zero, x[1]);
}
1746
// Full 8-point inverse ADST over 8 columns (one __m128i row = 8 int16
// lanes).  Stage structure mirrors the AV1 reference iadst8: input
// reordering, three rotation stages interleaved with butterfly add/subs,
// then the sign-alternating output permutation.
static void iadst8_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __zero = _mm_setzero_si128();
  // __rounding and cos_bit are consumed inside the btf_16_* macros.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1: reorder input coefficients.
  __m128i x[8];
  x[0] = input[7];
  x[1] = input[0];
  x[2] = input[5];
  x[3] = input[2];
  x[4] = input[3];
  x[5] = input[4];
  x[6] = input[1];
  x[7] = input[6];

  // stage 2
  btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);

  // stage 3
  btf_16_adds_subs_sse2(x[0], x[4]);
  btf_16_adds_subs_sse2(x[1], x[5]);
  btf_16_adds_subs_sse2(x[2], x[6]);
  btf_16_adds_subs_sse2(x[3], x[7]);

  // stage 4
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);

  // stage 5
  btf_16_adds_subs_sse2(x[0], x[2]);
  btf_16_adds_subs_sse2(x[1], x[3]);
  btf_16_adds_subs_sse2(x[4], x[6]);
  btf_16_adds_subs_sse2(x[5], x[7]);

  // stage 6
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

  // stage 7: output permutation with alternating negation.
  output[0] = x[0];
  output[1] = _mm_subs_epi16(__zero, x[4]);
  output[2] = x[6];
  output[3] = _mm_subs_epi16(__zero, x[2]);
  output[4] = x[3];
  output[5] = _mm_subs_epi16(__zero, x[7]);
  output[6] = x[5];
  output[7] = _mm_subs_epi16(__zero, x[1]);
}
1814
// 8-point inverse ADST for 4-wide blocks.  Identical stage structure to
// iadst8_sse2 but the rotations use btf_16_4p_sse2, which only processes
// the low 4 lanes of each row.
static void iadst8_w4_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __zero = _mm_setzero_si128();
  // __rounding and cos_bit are consumed inside the btf_16_* macros.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1: reorder input coefficients.
  __m128i x[8];
  x[0] = input[7];
  x[1] = input[0];
  x[2] = input[5];
  x[3] = input[2];
  x[4] = input[3];
  x[5] = input[4];
  x[6] = input[1];
  x[7] = input[6];

  // stage 2
  btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);

  // stage 3
  btf_16_adds_subs_sse2(x[0], x[4]);
  btf_16_adds_subs_sse2(x[1], x[5]);
  btf_16_adds_subs_sse2(x[2], x[6]);
  btf_16_adds_subs_sse2(x[3], x[7]);

  // stage 4
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);

  // stage 5
  btf_16_adds_subs_sse2(x[0], x[2]);
  btf_16_adds_subs_sse2(x[1], x[3]);
  btf_16_adds_subs_sse2(x[4], x[6]);
  btf_16_adds_subs_sse2(x[5], x[7]);

  // stage 6
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

  // stage 7: output permutation with alternating negation.
  output[0] = x[0];
  output[1] = _mm_subs_epi16(__zero, x[4]);
  output[2] = x[6];
  output[3] = _mm_subs_epi16(__zero, x[2]);
  output[4] = x[3];
  output[5] = _mm_subs_epi16(__zero, x[7]);
  output[6] = x[5];
  output[7] = _mm_subs_epi16(__zero, x[1]);
}
1882
iadst16_stage3_ssse3(__m128i * x)1883 static INLINE void iadst16_stage3_ssse3(__m128i *x) {
1884 btf_16_adds_subs_sse2(x[0], x[8]);
1885 btf_16_adds_subs_sse2(x[1], x[9]);
1886 btf_16_adds_subs_sse2(x[2], x[10]);
1887 btf_16_adds_subs_sse2(x[3], x[11]);
1888 btf_16_adds_subs_sse2(x[4], x[12]);
1889 btf_16_adds_subs_sse2(x[5], x[13]);
1890 btf_16_adds_subs_sse2(x[6], x[14]);
1891 btf_16_adds_subs_sse2(x[7], x[15]);
1892 }
1893
// iadst16 stage 4: four rotations on the upper half of the state
// (x[8..15]) with the 8/56 and 40/24 cospi pairs.  __rounding and cos_bit
// are consumed inside the btf_16_sse2 macro.
static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
                                        const __m128i __rounding,
                                        int8_t cos_bit) {
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
  btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
  btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
}
1908
iadst16_stage5_ssse3(__m128i * x)1909 static INLINE void iadst16_stage5_ssse3(__m128i *x) {
1910 btf_16_adds_subs_sse2(x[0], x[4]);
1911 btf_16_adds_subs_sse2(x[1], x[5]);
1912 btf_16_adds_subs_sse2(x[2], x[6]);
1913 btf_16_adds_subs_sse2(x[3], x[7]);
1914 btf_16_adds_subs_sse2(x[8], x[12]);
1915 btf_16_adds_subs_sse2(x[9], x[13]);
1916 btf_16_adds_subs_sse2(x[10], x[14]);
1917 btf_16_adds_subs_sse2(x[11], x[15]);
1918 }
1919
// iadst16 stage 6: 16/48 cospi rotations applied to the second quarter of
// each half of the state.  __rounding and cos_bit are consumed inside the
// btf_16_sse2 macro.
static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
                                        const __m128i __rounding,
                                        int8_t cos_bit) {
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
}
1931
iadst16_stage7_ssse3(__m128i * x)1932 static INLINE void iadst16_stage7_ssse3(__m128i *x) {
1933 btf_16_adds_subs_sse2(x[0], x[2]);
1934 btf_16_adds_subs_sse2(x[1], x[3]);
1935 btf_16_adds_subs_sse2(x[4], x[6]);
1936 btf_16_adds_subs_sse2(x[5], x[7]);
1937 btf_16_adds_subs_sse2(x[8], x[10]);
1938 btf_16_adds_subs_sse2(x[9], x[11]);
1939 btf_16_adds_subs_sse2(x[12], x[14]);
1940 btf_16_adds_subs_sse2(x[13], x[15]);
1941 }
1942
// iadst16 stage 8: final 32/32 (i.e. 1/sqrt(2)) rotations on every odd
// pair of the state.  __rounding and cos_bit are consumed inside the
// btf_16_sse2 macro.
static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
                                        const __m128i __rounding,
                                        int8_t cos_bit) {
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
}
1953
// iadst16 stage 9: fixed output permutation with alternating negation
// (every odd output is the saturating negation of its source element).
static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
  const __m128i __zero = _mm_setzero_si128();
  output[0] = x[0];
  output[1] = _mm_subs_epi16(__zero, x[8]);
  output[2] = x[12];
  output[3] = _mm_subs_epi16(__zero, x[4]);
  output[4] = x[6];
  output[5] = _mm_subs_epi16(__zero, x[14]);
  output[6] = x[10];
  output[7] = _mm_subs_epi16(__zero, x[2]);
  output[8] = x[3];
  output[9] = _mm_subs_epi16(__zero, x[11]);
  output[10] = x[15];
  output[11] = _mm_subs_epi16(__zero, x[7]);
  output[12] = x[5];
  output[13] = _mm_subs_epi16(__zero, x[13]);
  output[14] = x[9];
  output[15] = _mm_subs_epi16(__zero, x[1]);
}
1973
// 16-point inverse ADST when only the DC coefficient (input[0]) can be
// nonzero.  The butterfly add/sub stages of the full transform reduce to
// plain copies because the paired operands are zero, so only the
// rotations remain.
static void iadst16_low1_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // __rounding and cos_bit are consumed inside the btf_16_* macros.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);

  // stage 1
  __m128i x[16];
  x[1] = input[0];

  // stage 2
  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);

  // stage 3 (adds/subs with zero -> copies)
  x[8] = x[0];
  x[9] = x[1];

  // stage 4
  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);

  // stage 5 (adds/subs with zero -> copies)
  x[4] = x[0];
  x[5] = x[1];
  x[12] = x[8];
  x[13] = x[9];

  // stage 6
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);

  // stage 7 (adds/subs with zero -> copies)
  x[2] = x[0];
  x[3] = x[1];
  x[6] = x[4];
  x[7] = x[5];
  x[10] = x[8];
  x[11] = x[9];
  x[14] = x[12];
  x[15] = x[13];

  // stages 8-9: shared with the full transform.
  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage9_ssse3(output, x);
}
2021
// 16-point inverse ADST when at most the first 8 coefficients are nonzero.
// Stage 2 uses the single-input btf_16_ssse3 form (the partner coefficient
// of each rotation is zero); stages 3-9 are shared with the full version.
static void iadst16_low8_ssse3(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // __rounding and cos_bit are consumed inside the btf_16_* macros.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: reorder the 8 live input coefficients.
  __m128i x[16];
  x[1] = input[0];
  x[3] = input[2];
  x[5] = input[4];
  x[7] = input[6];
  x[8] = input[7];
  x[10] = input[5];
  x[12] = input[3];
  x[14] = input[1];

  // stage 2
  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
  btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
  btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
  btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
  btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
  btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
  btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
  btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);

  // stages 3-9
  iadst16_stage3_ssse3(x);
  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage5_ssse3(x);
  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage7_ssse3(x);
  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage9_ssse3(output, x);
}
// Full 16-point inverse ADST over 8 columns (one __m128i row = 8 int16
// lanes).  Stage 1 reorders the inputs, stage 2 performs the eight initial
// rotations, and stages 3-9 are delegated to the shared stage helpers.
static void iadst16_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // __rounding and cos_bit are consumed inside the btf_16_* macros.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);

  // stage 1: reorder input coefficients.
  __m128i x[16];
  x[0] = input[15];
  x[1] = input[0];
  x[2] = input[13];
  x[3] = input[2];
  x[4] = input[11];
  x[5] = input[4];
  x[6] = input[9];
  x[7] = input[6];
  x[8] = input[7];
  x[9] = input[8];
  x[10] = input[5];
  x[11] = input[10];
  x[12] = input[3];
  x[13] = input[12];
  x[14] = input[1];
  x[15] = input[14];

  // stage 2
  btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
  btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
  btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
  btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
  btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);

  // stage 3~9
  iadst16_stage3_ssse3(x);
  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage5_ssse3(x);
  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage7_ssse3(x);
  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage9_ssse3(output, x);
}
2116
// Full 16-point inverse ADST for 4-wide blocks.  Same stage structure as
// iadst16_sse2, but every rotation uses btf_16_4p_sse2 (low 4 lanes only),
// so the rotation stages are written out inline instead of calling the
// 8-lane stage helpers; the add/sub and permutation stages are shared.
static void iadst16_w4_sse2(const __m128i *input, __m128i *output) {
  const int8_t cos_bit = INV_COS_BIT;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // __rounding and cos_bit are consumed inside the btf_16_* macros.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1: reorder input coefficients.
  __m128i x[16];
  x[0] = input[15];
  x[1] = input[0];
  x[2] = input[13];
  x[3] = input[2];
  x[4] = input[11];
  x[5] = input[4];
  x[6] = input[9];
  x[7] = input[6];
  x[8] = input[7];
  x[9] = input[8];
  x[10] = input[5];
  x[11] = input[10];
  x[12] = input[3];
  x[13] = input[12];
  x[14] = input[1];
  x[15] = input[14];

  // stage 2
  btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
  btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);

  // stage 3
  iadst16_stage3_ssse3(x);

  // stage 4
  btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);

  // stage 5
  iadst16_stage5_ssse3(x);

  // stage 6
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);

  // stage 7
  iadst16_stage7_ssse3(x);

  // stage 8
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);

  // stage 9
  iadst16_stage9_ssse3(output, x);
}
2209
iidentity4_ssse3(const __m128i * input,__m128i * output)2210 static void iidentity4_ssse3(const __m128i *input, __m128i *output) {
2211 const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
2212 const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
2213 for (int i = 0; i < 4; ++i) {
2214 __m128i x = _mm_mulhrs_epi16(input[i], scale);
2215 output[i] = _mm_adds_epi16(x, input[i]);
2216 }
2217 }
2218
// 8-point identity "transform": doubles every coefficient with 16-bit
// saturation (the 8-point identity scale factor is exactly 2).
static void iidentity8_sse2(const __m128i *input, __m128i *output) {
  int i = 0;
  do {
    output[i] = _mm_adds_epi16(input[i], input[i]);
  } while (++i < 8);
}
2224
iidentity16_ssse3(const __m128i * input,__m128i * output)2225 static void iidentity16_ssse3(const __m128i *input, __m128i *output) {
2226 const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
2227 const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
2228 for (int i = 0; i < 16; ++i) {
2229 __m128i x = _mm_mulhrs_epi16(input[i], scale);
2230 __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
2231 output[i] = _mm_adds_epi16(x, srcx2);
2232 }
2233 }
2234
lowbd_get_recon_8x8_sse2(const __m128i pred,__m128i res)2235 static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
2236 __m128i res) {
2237 const __m128i zero = _mm_setzero_si128();
2238 __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
2239 return _mm_packus_epi16(x0, x0);
2240 }
2241
lowbd_write_buffer_4xn_sse2(__m128i * in,uint8_t * output,int stride,int flipud,const int height)2242 static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
2243 int stride, int flipud,
2244 const int height) {
2245 int j = flipud ? (height - 1) : 0;
2246 const int step = flipud ? -1 : 1;
2247 const __m128i zero = _mm_setzero_si128();
2248 for (int i = 0; i < height; ++i, j += step) {
2249 const __m128i v = _mm_cvtsi32_si128(*((int *)(output + i * stride)));
2250 __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
2251 u = _mm_packus_epi16(u, zero);
2252 *((int *)(output + i * stride)) = _mm_cvtsi128_si32(u);
2253 }
2254 }
2255
lowbd_write_buffer_8xn_sse2(__m128i * in,uint8_t * output,int stride,int flipud,const int height)2256 static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
2257 int stride, int flipud,
2258 const int height) {
2259 int j = flipud ? (height - 1) : 0;
2260 const int step = flipud ? -1 : 1;
2261 for (int i = 0; i < height; ++i, j += step) {
2262 const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
2263 const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
2264 _mm_storel_epi64((__m128i *)(output + i * stride), u);
2265 }
2266 }
2267
// 1D inverse-transform kernels that process 8 pixels at a time, indexed as
// [tx size][1D transform type]. The three columns are DCT, ADST and
// identity; NULL marks combinations this table is never used for.
static const transform_1d_ssse3
    lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
      { idct4_sse2, iadst4_sse2, iidentity4_ssse3 },
      { idct8_sse2, iadst8_sse2, iidentity8_sse2 },
      { idct16_sse2, iadst16_sse2, iidentity16_ssse3 },
      { idct32_sse2, NULL, NULL },
      { idct64_low32_ssse3, NULL, NULL },
    };
2277
// Reduced-workload kernels for blocks whose eob lies at DC or within the
// top-left 8x8, 16x16 or 32x32 corner. Indexed as
// [tx size][1D transform type][nonzero-extent index], where the last index
// selects a variant handling progressively more nonzero input (see
// lowbd_txfm_all_1d_zeros_idx). NULL marks unused combinations.
static const transform_1d_ssse3
    lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { idct4_sse2, idct4_sse2, NULL, NULL },
          { iadst4_sse2, iadst4_sse2, NULL, NULL },
          { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL },
      },
      { { idct8_low1_ssse3, idct8_sse2, NULL, NULL },
        { iadst8_low1_ssse3, iadst8_sse2, NULL, NULL },
        { iidentity8_sse2, iidentity8_sse2, NULL, NULL } },
      {
          { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL },
          { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3,
          idct32_sse2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3,
          idct64_low32_ssse3 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
2304
// 1D inverse-transform kernels that process 4 pixels at a time,
// used by the 4x4, 4x8, 4x16, 8x4 and 16x4 paths. Indexed as
// [tx size][1D transform type]; NULL marks unused combinations.
static const transform_1d_ssse3
    lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
      { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 },
      { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 },
      { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 },
      { NULL, NULL, NULL },
      { NULL, NULL, NULL },
    };
2315
iidentity_row_8xn_ssse3(__m128i * out,const int32_t * input,int stride,int shift,int height,int txw_idx,int rect_type)2316 static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
2317 int stride, int shift, int height,
2318 int txw_idx, int rect_type) {
2319 const int32_t *input_row = input;
2320 const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
2321 const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
2322 (1 << (NewSqrt2Bits - shift - 1)));
2323 const __m128i one = _mm_set1_epi16(1);
2324 const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
2325 if (rect_type != 1 && rect_type != -1) {
2326 for (int i = 0; i < height; ++i) {
2327 const __m128i src = load_32bit_to_16bit(input_row);
2328 input_row += stride;
2329 __m128i lo = _mm_unpacklo_epi16(src, one);
2330 __m128i hi = _mm_unpackhi_epi16(src, one);
2331 lo = _mm_madd_epi16(lo, scale_rounding);
2332 hi = _mm_madd_epi16(hi, scale_rounding);
2333 lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
2334 hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
2335 out[i] = _mm_packs_epi32(lo, hi);
2336 }
2337 } else {
2338 const __m128i rect_scale =
2339 _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
2340 for (int i = 0; i < height; ++i) {
2341 __m128i src = load_32bit_to_16bit(input_row);
2342 src = _mm_mulhrs_epi16(src, rect_scale);
2343 input_row += stride;
2344 __m128i lo = _mm_unpacklo_epi16(src, one);
2345 __m128i hi = _mm_unpackhi_epi16(src, one);
2346 lo = _mm_madd_epi16(lo, scale_rounding);
2347 hi = _mm_madd_epi16(hi, scale_rounding);
2348 lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
2349 hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
2350 out[i] = _mm_packs_epi32(lo, hi);
2351 }
2352 }
2353 }
2354
iidentity_col_8xn_ssse3(uint8_t * output,int stride,__m128i * buf,int shift,int height,int txh_idx)2355 static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
2356 __m128i *buf, int shift, int height,
2357 int txh_idx) {
2358 const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
2359 const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
2360 const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
2361 const __m128i one = _mm_set1_epi16(1);
2362 const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
2363 const __m128i zero = _mm_setzero_si128();
2364 for (int h = 0; h < height; ++h) {
2365 __m128i lo = _mm_unpacklo_epi16(buf[h], one);
2366 __m128i hi = _mm_unpackhi_epi16(buf[h], one);
2367 lo = _mm_madd_epi16(lo, scale_coeff);
2368 hi = _mm_madd_epi16(hi, scale_coeff);
2369 lo = _mm_srai_epi32(lo, NewSqrt2Bits);
2370 hi = _mm_srai_epi32(hi, NewSqrt2Bits);
2371 lo = _mm_add_epi32(lo, shift_rounding);
2372 hi = _mm_add_epi32(hi, shift_rounding);
2373 lo = _mm_srai_epi32(lo, -shift);
2374 hi = _mm_srai_epi32(hi, -shift);
2375 __m128i x = _mm_packs_epi32(lo, hi);
2376
2377 const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
2378 x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
2379 const __m128i u = _mm_packus_epi16(x, x);
2380 _mm_storel_epi64((__m128i *)(output), u);
2381 output += stride;
2382 }
2383 }
2384
lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t * input,uint8_t * output,int stride,TX_SIZE tx_size)2385 static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
2386 uint8_t *output, int stride,
2387 TX_SIZE tx_size) {
2388 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
2389 const int txw_idx = get_txw_idx(tx_size);
2390 const int txh_idx = get_txh_idx(tx_size);
2391 const int txfm_size_col = tx_size_wide[tx_size];
2392 const int txfm_size_row = tx_size_high[tx_size];
2393 const int input_stride = AOMMIN(32, txfm_size_col);
2394 const int row_max = AOMMIN(32, txfm_size_row);
2395 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
2396 __m128i buf[32];
2397
2398 for (int i = 0; i < (input_stride >> 3); ++i) {
2399 iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max,
2400 txw_idx, rect_type);
2401 iidentity_col_8xn_ssse3(output + 8 * i, stride, buf, shift[1], row_max,
2402 txh_idx);
2403 }
2404 }
2405
// 2D inverse transform + reconstruction for 4x4 blocks: row transform on
// transposed input, transpose back (with optional left-right flip), column
// transform, final round shift, then add to the prediction.
static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[4];
  const TX_SIZE tx_size = TX_4X4;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  // 4-pixel-wide 1D kernels in both directions.
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
  transpose_16bit_4x4(buf, buf);
  row_txfm(buf, buf);
  // NOTE(review): no round shift after the row transform — presumably
  // shift[0] is 0 for TX_4X4; confirm against av1_inv_txfm_shift_ls.
  if (lr_flip) {
    __m128i temp[4];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_4x4(temp, buf);
  } else {
    transpose_16bit_4x4(buf, buf);
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2441
lowbd_get_recon_16x16_sse2(const __m128i pred,__m128i res0,__m128i res1)2442 static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
2443 __m128i res0, __m128i res1) {
2444 const __m128i zero = _mm_setzero_si128();
2445 __m128i x0 = _mm_unpacklo_epi8(pred, zero);
2446 __m128i x1 = _mm_unpackhi_epi8(pred, zero);
2447 x0 = _mm_adds_epi16(res0, x0);
2448 x1 = _mm_adds_epi16(res1, x1);
2449 return _mm_packus_epi16(x0, x1);
2450 }
2451
lowbd_write_buffer_16xn_sse2(__m128i * in,uint8_t * output,int stride,int flipud,int height)2452 static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output,
2453 int stride, int flipud,
2454 int height) {
2455 int j = flipud ? (height - 1) : 0;
2456 const int step = flipud ? -1 : 1;
2457 for (int i = 0; i < height; ++i, j += step) {
2458 __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
2459 __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
2460 _mm_storeu_si128((__m128i *)(output + i * stride), u);
2461 }
2462 }
2463
round_shift_ssse3(const __m128i * input,__m128i * output,int size)2464 static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output,
2465 int size) {
2466 const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
2467 for (int i = 0; i < size; ++i) {
2468 output[i] = _mm_mulhrs_epi16(input[i], scale);
2469 }
2470 }
2471
// Generic 2D inverse transform (both 1D transforms do real work) for the
// larger block sizes (see the caller comment above
// lowbd_inv_txfm2d_add_universe_ssse3). Row transforms run on 8-row bands,
// results are transposed into a column-major buffer, then column
// transforms run per 8-column strip before the reconstruction add.
static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m128i buf1[64 * 8];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Only rows/cols up to the last nonzero coefficient need processing.
  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Pick reduced-workload kernels based on how far the eob reaches.
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    __m128i buf0[64];
    const int32_t *input_row = input + i * input_stride * 8;
    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
      __m128i *buf0_cur = buf0 + j * 8;
      load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
      // Transpose so each register holds one row of this 8x8 tile.
      transpose_16bit_8x8(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0);
    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    __m128i *_buf1 = buf1 + i * 8;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        __m128i temp[8];
        flip_buf_sse2(buf0 + 8 * j, temp, 8);
        // Strips are stored in reverse order to realize the l/r flip.
        transpose_16bit_8x8(temp,
                            _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
      }
    }
  }
  // Column pass, one 8-wide strip at a time.
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row);
    round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
  }

  // Reconstruction: add residual to prediction, 16 or 8 pixels per row.
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
                                   output + 16 * i, stride, ud_flip,
                                   txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
  }
}
2542
lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t * input,uint8_t * output,int stride,TX_TYPE tx_type,TX_SIZE tx_size,int eob)2543 static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
2544 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
2545 TX_SIZE tx_size, int eob) {
2546 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
2547 int eobx, eoby;
2548 get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
2549 const int txw_idx = get_txw_idx(tx_size);
2550 const int txh_idx = get_txh_idx(tx_size);
2551 const int txfm_size_col = tx_size_wide[tx_size];
2552 const int txfm_size_row = tx_size_high[tx_size];
2553 const int buf_size_w_div8 = (eobx + 8) >> 3;
2554 const int input_stride = AOMMIN(32, txfm_size_col);
2555 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
2556
2557 const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
2558 assert(fun_idx < 5);
2559 const transform_1d_ssse3 col_txfm =
2560 lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
2561
2562 assert(col_txfm != NULL);
2563
2564 int ud_flip, lr_flip;
2565 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2566 for (int i = 0; i < buf_size_w_div8; i++) {
2567 __m128i buf0[64];
2568 iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0],
2569 eoby + 1, txw_idx, rect_type);
2570 col_txfm(buf0, buf0);
2571 __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
2572 int k = ud_flip ? (txfm_size_row - 1) : 0;
2573 const int step = ud_flip ? -1 : 1;
2574 uint8_t *out = output + 8 * i;
2575 for (int j = 0; j < txfm_size_row; ++j, k += step) {
2576 const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
2577 __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
2578 const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
2579 _mm_storel_epi64((__m128i *)(out), u);
2580 out += stride;
2581 }
2582 }
2583 }
2584
// 2D inverse transform where the vertical (column) transform is identity
// (H_DCT / H_ADST / H_FLIPADST): a full 1D kernel runs on 8-row bands, the
// column pass only applies the identity scale + reconstruction add.
static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m128i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Only rows up to the last nonzero coefficient need processing.
  const int buf_size_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Reduced-workload row kernel chosen from how far the eob reaches.
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];

  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_h_div8; i++) {
    __m128i buf0[64];
    const int32_t *input_row = input + i * input_stride * 8;
    // At most 32 (4 strips of 8) input columns are stored per band.
    for (int j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) {
      __m128i *buf0_cur = buf0 + j * 8;
      load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
      transpose_16bit_8x8(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0);
    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    __m128i *_buf1 = buf1;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        __m128i temp[8];
        flip_buf_sse2(buf0 + 8 * j, temp, 8);
        // Strips stored in reverse order to realize the l/r flip.
        transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
      }
    }

    // Column pass: identity scale + round shift + reconstruction add.
    for (int j = 0; j < buf_size_w_div8; ++j) {
      iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
                              buf1 + j * 8, shift[1], 8, txh_idx);
    }
  }
}
2640
2641 // for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
lowbd_inv_txfm2d_add_universe_ssse3(const int32_t * input,uint8_t * output,int stride,TX_TYPE tx_type,TX_SIZE tx_size,int eob)2642 static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
2643 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
2644 TX_SIZE tx_size, int eob) {
2645 switch (tx_type) {
2646 case DCT_DCT:
2647 lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
2648 tx_size, eob);
2649 break;
2650 case IDTX:
2651 lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
2652 break;
2653 case V_DCT:
2654 case V_ADST:
2655 case V_FLIPADST:
2656 lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
2657 tx_size, eob);
2658 break;
2659 case H_DCT:
2660 case H_ADST:
2661 case H_FLIPADST:
2662 lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
2663 tx_size, eob);
2664 break;
2665 default:
2666 lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
2667 tx_size, eob);
2668 break;
2669 }
2670 }
2671
// 2D inverse transform + reconstruction for 4x8 blocks: 8-wide row kernel
// after transpose, 1/sqrt(2) rectangular scale, then 4-wide column kernel.
static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[8];
  const TX_SIZE tx_size = TX_4X8;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
  transpose_16bit_4x8(buf, buf);
  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
  row_txfm(buf, buf);
  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
  if (lr_flip) {
    __m128i temp[4];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_8x4(temp, buf);
  } else {
    transpose_16bit_8x4(buf, buf);
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2709
// 2D inverse transform + reconstruction for 8x4 blocks: 4-wide row kernel
// after transpose, 1/sqrt(2) rectangular scale, then 8-wide column kernel.
static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[8];
  const TX_SIZE tx_size = TX_8X4;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
  transpose_16bit_8x4(buf, buf);
  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
  row_txfm(buf, buf);
  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
  if (lr_flip) {
    __m128i temp[8];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_4x8(temp, buf);
  } else {
    transpose_16bit_4x8(buf, buf);
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2747
// 2D inverse transform + reconstruction for 4x16 blocks. Rows are
// processed in two 8-row groups; the column kernel then runs over the full
// 16-row buffer.
static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
                                            uint8_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size_,
                                            int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[16];
  const TX_SIZE tx_size = TX_4X16;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  const int row_one_loop = 8;
  for (int i = 0; i < 2; ++i) {
    const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
    __m128i *buf_cur = buf + i * row_one_loop;
    load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
                                  row_one_loop);
    transpose_16bit_4x8(buf_cur, buf_cur);
    if (row_txfm == iidentity4_ssse3) {
      // NOTE(review): special case — the identity scale and the row round
      // shift appear to be folded into one madd (scale in the low words,
      // rounding in the high words); confirm against the scalar reference.
      const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1));
      const __m128i ones = _mm_set1_epi16(1);
      for (int j = 0; j < 4; ++j) {
        const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones);
        const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones);
        const __m128i buf_32_lo =
            _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
        const __m128i buf_32_hi =
            _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
        buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
      }
    } else {
      row_txfm(buf_cur, buf_cur);
      round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
    }
    if (lr_flip) {
      __m128i temp[8];
      flip_buf_sse2(buf_cur, temp, txfm_size_col);
      transpose_16bit_8x4(temp, buf_cur);
    } else {
      transpose_16bit_8x4(buf_cur, buf_cur);
    }
  }
  col_txfm(buf, buf);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2805
// 2D inverse transform + reconstruction for 16x4 blocks. Input is loaded
// as two 8-column strips; the row kernel runs on the full 16-wide buffer
// and the column kernel per 8-column strip.
static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
                                            uint8_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size_,
                                            int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[16];
  const TX_SIZE tx_size = TX_16X4;
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int row_one_loop = 8;
  for (int i = 0; i < buf_size_w_div8; ++i) {
    const int32_t *input_cur = input + i * row_one_loop;
    __m128i *buf_cur = buf + i * row_one_loop;
    load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur,
                               txfm_size_row);
    transpose_16bit_8x4(buf_cur, buf_cur);
  }
  if (row_txfm == iidentity16_ssse3) {
    // NOTE(review): special case — the 2*sqrt(2) identity scale and the
    // row round shift appear to be folded into one madd (scale in the low
    // words, rounding in the high words); confirm against the scalar
    // reference.
    const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1));
    const __m128i ones = _mm_set1_epi16(1);
    for (int j = 0; j < 16; ++j) {
      const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones);
      const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones);
      const __m128i buf_32_lo =
          _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1));
      const __m128i buf_32_hi =
          _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1));
      buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi);
    }
  } else {
    row_txfm(buf, buf);
    round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
  }
  if (lr_flip) {
    __m128i temp[16];
    flip_buf_sse2(buf, temp, 16);
    transpose_16bit_4x8(temp, buf);
    transpose_16bit_4x8(temp + 8, buf + 8);
  } else {
    transpose_16bit_4x8(buf, buf);
    transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
  }
  // Column pass and reconstruction, one 8-wide strip at a time.
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf + i * row_one_loop, buf + i * row_one_loop);
    round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
  }
  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
  lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
}
2868
// Top-level low-bitdepth 2D inverse transform + reconstruction dispatch.
// The five narrow sizes (width or height of 4) have dedicated paths;
// everything else goes through the universe handler.
void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type,
                                    TX_SIZE tx_size, int eob) {
  if (tx_size == TX_4X4) {
    lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
                                   eob);
  } else if (tx_size == TX_4X8) {
    lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
                                   eob);
  } else if (tx_size == TX_8X4) {
    lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
                                   eob);
  } else if (tx_size == TX_4X16) {
    lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
                                    eob);
  } else if (tx_size == TX_16X4) {
    lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
                                    eob);
  } else {
    lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
                                        tx_size, eob);
  }
}
2899
av1_inv_txfm_add_ssse3(const tran_low_t * dqcoeff,uint8_t * dst,int stride,const TxfmParam * txfm_param)2900 void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
2901 const TxfmParam *txfm_param) {
2902 if (!txfm_param->lossless) {
2903 const TX_TYPE tx_type = txfm_param->tx_type;
2904 av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
2905 txfm_param->tx_size, txfm_param->eob);
2906
2907 } else {
2908 av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
2909 }
2910 }
2911