/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
12 #include "config/aom_config.h"
13 #include "config/av1_rtcd.h"
14
15 #include "av1/common/av1_inv_txfm1d_cfg.h"
16 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
17 #include "av1/common/x86/av1_txfm_sse2.h"
18
// TODO(venkatsanampudi@ittiam.com): move this to header file
21 // Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
22 static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
23 4 * 5793 };
24
// TODO(binpengsmail@gmail.com): replace some for loop with do {} while
// 4-point 1-D inverse DCT over vectors of 16-bit coefficients; the 8 lanes
// of each __m128i are transformed independently (8 columns in parallel).
// input/output each hold 4 vectors.
static void idct4_new_sse2(const __m128i *input, __m128i *output,
                           int8_t cos_bit) {
  (void)cos_bit;  // rounding is hard-wired to INV_COS_BIT below
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE(review): __rounding has no visible use in this function; it is
  // presumably referenced by name inside the btf_16_* macros
  // (av1_txfm_sse2.h) -- confirm before removing.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // 16-bit cosine constant pairs for the butterfly macros.
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

  // stage 1: deinterleave inputs into even (0, 2) and odd (1, 3) halves.
  __m128i x[4];
  x[0] = input[0];
  x[1] = input[2];
  x[2] = input[1];
  x[3] = input[3];

  // stage 2: rotation butterflies on each half.
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);

  // stage 3: mirrored add/sub butterflies written straight to output.
  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
}
53
// 4-point 1-D inverse DCT, narrow variant: same flow as idct4_new_sse2 but
// uses the btf_16_4p_sse2 butterflies (presumably the 4-column-wide "4p"
// form -- confirm against av1_txfm_sse2.h).
void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;  // rounding is hard-wired to INV_COS_BIT below
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE(review): __rounding appears unused; presumably consumed by name
  // inside the btf_16_4p_sse2 macro -- confirm before removing.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // 16-bit cosine constant pairs for the butterfly macros.
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);

  // stage 1: deinterleave inputs into even (0, 2) and odd (1, 3) halves.
  __m128i x[4];
  x[0] = input[0];
  x[1] = input[2];
  x[2] = input[1];
  x[3] = input[3];

  // stage 2: rotation butterflies on each half.
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);

  // stage 3: mirrored add/sub butterflies written straight to output.
  btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
  btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
}
79
idct8_low1_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)80 void idct8_low1_new_ssse3(const __m128i *input, __m128i *output,
81 int8_t cos_bit) {
82 (void)cos_bit;
83 const int32_t *cospi = cospi_arr(INV_COS_BIT);
84
85 // stage 1
86 __m128i x[2];
87 x[0] = input[0];
88
89 // stage 2
90 // stage 3
91 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
92
93 // stage 4
94 // stage 5
95 output[0] = x[0];
96 output[7] = x[0];
97 output[1] = x[1];
98 output[6] = x[1];
99 output[2] = x[1];
100 output[5] = x[1];
101 output[3] = x[0];
102 output[4] = x[0];
103 }
104
// 8-point 1-D inverse DCT over vectors of 16-bit coefficients (8 lanes of
// each __m128i transformed in parallel).  input/output each hold 8 vectors.
void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;  // rounding is hard-wired to INV_COS_BIT below
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE(review): __rounding has no visible use here; presumably referenced
  // by name inside the btf_16_sse2 macro (av1_txfm_sse2.h) -- confirm.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // 16-bit cosine constant pairs for the butterfly macros.
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: deinterleave into even (0,4,2,6) and odd (1,5,3,7) halves.
  __m128i x[8];
  x[0] = input[0];
  x[1] = input[4];
  x[2] = input[2];
  x[3] = input[6];
  x[4] = input[1];
  x[5] = input[5];
  x[6] = input[3];
  x[7] = input[7];

  // stage 2: odd-half rotations.
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);

  // stage 3: 4-point even-part rotations plus odd-half add/sub.
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);

  // stage 4: even-part add/sub; rotate the middle odd pair by +/-cospi[32].
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);

  // stage 5: mirrored add/sub butterflies written straight to output.
  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}
152
// 8-point 1-D inverse DCT, narrow variant: identical dataflow to
// idct8_new_sse2 but using the btf_16_4p_sse2 butterflies (presumably the
// 4-column-wide form -- confirm against av1_txfm_sse2.h).
void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;  // rounding is hard-wired to INV_COS_BIT below
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE(review): __rounding appears unused; presumably consumed by name
  // inside the btf_16_4p_sse2 macro -- confirm before removing.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // 16-bit cosine constant pairs for the butterfly macros.
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: deinterleave into even (0,4,2,6) and odd (1,5,3,7) halves.
  __m128i x[8];
  x[0] = input[0];
  x[1] = input[4];
  x[2] = input[2];
  x[3] = input[6];
  x[4] = input[1];
  x[5] = input[5];
  x[6] = input[3];
  x[7] = input[7];

  // stage 2: odd-half rotations.
  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);

  // stage 3: 4-point even-part rotations plus odd-half add/sub.
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);

  // stage 4: even-part add/sub; rotate the middle odd pair by +/-cospi[32].
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);

  // stage 5: mirrored add/sub butterflies written straight to output.
  btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
  btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
  btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
  btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
}
200
// idct16 stage 5: even-part add/sub on x[0..3], +/-cospi[32] rotation of
// x[5]/x[6], and add/sub butterflies across the odd half x[8..15].
// NOTE(review): __rounding and cos_bit have no visible use in the body;
// they are presumably referenced by name inside the btf_16_sse2 macro
// expansion -- confirm before changing this signature.
static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
}
214
// idct16 stage 6: add/sub butterflies on the even half x[0..7] and
// +/-cospi[32] rotations of the x[10]/x[13] and x[11]/x[12] pairs.
// NOTE(review): __rounding and cos_bit have no visible use in the body;
// presumably consumed inside the btf_16_sse2 macro -- confirm.
static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
}
227
idct16_stage7_sse2(__m128i * output,__m128i * x)228 static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
229 btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
230 btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
231 btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
232 btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
233 btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
234 btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
235 btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
236 btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
237 }
238
idct16_low1_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)239 static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output,
240 int8_t cos_bit) {
241 (void)cos_bit;
242 const int32_t *cospi = cospi_arr(INV_COS_BIT);
243
244 // stage 1
245 __m128i x[2];
246 x[0] = input[0];
247
248 // stage 2
249 // stage 3
250 // stage 4
251 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
252
253 // stage 5
254 // stage 6
255 // stage 7
256 output[0] = x[0];
257 output[15] = x[0];
258 output[1] = x[1];
259 output[14] = x[1];
260 output[2] = x[1];
261 output[13] = x[1];
262 output[3] = x[0];
263 output[12] = x[0];
264 output[4] = x[0];
265 output[11] = x[0];
266 output[5] = x[1];
267 output[10] = x[1];
268 output[6] = x[1];
269 output[9] = x[1];
270 output[7] = x[0];
271 output[8] = x[0];
272 }
273
// 16-point inverse DCT specialized for inputs where only the 8 lowest-
// frequency coefficients (input[0..7]) are non-zero; stage-2/3 butterflies
// with a zero partner are replaced by the btf_16_ssse3 half-butterfly form.
static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;  // rounding is hard-wired to INV_COS_BIT below
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // __rounding is forwarded to the shared stage helpers below (and is
  // presumably also consumed inside the btf_16_sse2 macro -- confirm).
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

  // stage 1: scatter the 8 non-zero inputs to their reordered slots; the
  // remaining x[] entries are produced by the stages below.
  __m128i x[16];
  x[0] = input[0];
  x[2] = input[4];
  x[4] = input[2];
  x[6] = input[6];
  x[8] = input[1];
  x[10] = input[5];
  x[12] = input[3];
  x[14] = input[7];

  // stage 2: odd-half half-butterflies (zero partners dropped).
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);

  // stage 3: even-half half-butterflies plus odd-half add/sub.
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4: DC half-butterfly, even add/sub, odd-half rotations.
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stages 5-7: shared with the full idct16 path.
  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage7_sse2(output, x);
}
320
// Full 16-point 1-D inverse DCT over vectors of 16-bit coefficients
// (8 lanes of each __m128i transformed in parallel).  input/output each
// hold 16 vectors.
void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;  // rounding is hard-wired to INV_COS_BIT below
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // __rounding is forwarded to the shared stage helpers below (and is
  // presumably also consumed inside the btf_16_sse2 macro -- confirm).
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // 16-bit cosine constant pairs for the butterfly macros.
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);

  // stage 1: reorder inputs -- even coefficients (0,8,4,12,2,10,6,14) into
  // x[0..7], odd coefficients (1,9,5,13,3,11,7,15) into x[8..15].
  __m128i x[16];
  x[0] = input[0];
  x[1] = input[8];
  x[2] = input[4];
  x[3] = input[12];
  x[4] = input[2];
  x[5] = input[10];
  x[6] = input[6];
  x[7] = input[14];
  x[8] = input[1];
  x[9] = input[9];
  x[10] = input[5];
  x[11] = input[13];
  x[12] = input[3];
  x[13] = input[11];
  x[14] = input[7];
  x[15] = input[15];

  // stage 2: odd-half rotations.
  btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);

  // stage 3: even-half rotations plus odd-half add/sub.
  btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4: 4-point even part, even add/sub, odd-half rotations.
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stage 5~7: shared stage helpers.
  idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
  idct16_stage7_sse2(output, x);
}
392
// 16-point 1-D inverse DCT, narrow variant: same dataflow as
// idct16_new_sse2 but with btf_16_4p_sse2 butterflies (presumably the
// 4-column-wide form -- confirm).  Stages 5 and 6 are inlined here rather
// than calling the shared helpers because those helpers use the full-width
// btf_16_sse2 rotation.
void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;  // rounding is hard-wired to INV_COS_BIT below
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE(review): __rounding appears unused; presumably consumed by name
  // inside the btf_16_4p_sse2 macro -- confirm before removing.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // 16-bit cosine constant pairs for the butterfly macros.
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: reorder inputs -- even coefficients into x[0..7], odd into
  // x[8..15] (same permutation as idct16_new_sse2).
  __m128i x[16];
  x[0] = input[0];
  x[1] = input[8];
  x[2] = input[4];
  x[3] = input[12];
  x[4] = input[2];
  x[5] = input[10];
  x[6] = input[6];
  x[7] = input[14];
  x[8] = input[1];
  x[9] = input[9];
  x[10] = input[5];
  x[11] = input[13];
  x[12] = input[3];
  x[13] = input[11];
  x[14] = input[7];
  x[15] = input[15];

  // stage 2: odd-half rotations.
  btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
  btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
  btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
  btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);

  // stage 3: even-half rotations plus odd-half add/sub.
  btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
  btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);

  // stage 4: 4-point even part, even add/sub, odd-half rotations.
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);

  // stage 5: inlined 4-wide version of idct16_stage5_sse2.
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);

  // stage 6: inlined 4-wide version of idct16_stage6_sse2.
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);

  // stage 7: shared final mirrored add/sub.
  idct16_stage7_sse2(output, x);
}
480
// idct32 stage 3, odd half only: add/sub butterflies on adjacent pairs of
// x[16..31] (the subs_adds form swaps which element keeps the sum).
static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
  btf_16_adds_subs_sse2(x[16], x[17]);
  btf_16_subs_adds_sse2(x[19], x[18]);
  btf_16_adds_subs_sse2(x[20], x[21]);
  btf_16_subs_adds_sse2(x[23], x[22]);
  btf_16_adds_subs_sse2(x[24], x[25]);
  btf_16_subs_adds_sse2(x[27], x[26]);
  btf_16_adds_subs_sse2(x[28], x[29]);
  btf_16_subs_adds_sse2(x[31], x[30]);
}
491
// idct32 stage 4, odd half only: rotations of the mirrored pairs
// x[17]/x[30], x[18]/x[29] (cospi 8/56) and x[21]/x[26], x[22]/x[25]
// (cospi 40/24).
// NOTE(review): __rounding and cos_bit have no visible use in the body;
// presumably consumed inside the btf_16_sse2 macro -- confirm.
static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
}
506
// idct32 stage 5, upper 24 terms: cospi 16/48 rotations of x[9]/x[14] and
// x[10]/x[13], then add/sub butterflies across x[16..31].
// NOTE(review): __rounding and cos_bit have no visible use in the body;
// presumably consumed inside the btf_16_sse2 macro -- confirm.
static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  btf_16_adds_subs_sse2(x[16], x[19]);
  btf_16_adds_subs_sse2(x[17], x[18]);
  btf_16_subs_adds_sse2(x[23], x[20]);
  btf_16_subs_adds_sse2(x[22], x[21]);
  btf_16_adds_subs_sse2(x[24], x[27]);
  btf_16_adds_subs_sse2(x[25], x[26]);
  btf_16_subs_adds_sse2(x[31], x[28]);
  btf_16_subs_adds_sse2(x[30], x[29]);
}
524
// idct32 stage 6, upper 28 terms: +/-cospi[32] rotation of x[5]/x[6],
// add/sub on x[8..15], and cospi 16/48 rotations of the x[18..21] /
// x[26..29] mirrored pairs.
// NOTE(review): __rounding and cos_bit have no visible use in the body;
// presumably consumed inside the btf_16_sse2 macro -- confirm.
static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
}
543
// idct32 stage 7: even-half add/sub on x[0..7], +/-cospi[32] rotations of
// x[10]/x[13] and x[11]/x[12], and add/sub across x[16..31].
// NOTE(review): __rounding and cos_bit have no visible use in the body;
// presumably consumed inside the btf_16_sse2 macro -- confirm.
static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  btf_16_adds_subs_sse2(x[16], x[23]);
  btf_16_adds_subs_sse2(x[17], x[22]);
  btf_16_adds_subs_sse2(x[18], x[21]);
  btf_16_adds_subs_sse2(x[19], x[20]);
  btf_16_subs_adds_sse2(x[31], x[24]);
  btf_16_subs_adds_sse2(x[30], x[25]);
  btf_16_subs_adds_sse2(x[29], x[26]);
  btf_16_subs_adds_sse2(x[28], x[27]);
}
564
// idct32 stage 8: mirrored add/sub across x[0..15] and +/-cospi[32]
// rotations of the four middle odd-half pairs x[20..23] / x[24..27].
// NOTE(review): __rounding and cos_bit have no visible use in the body;
// presumably consumed inside the btf_16_sse2 macro -- confirm.
static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
                                      const __m128i __rounding,
                                      int8_t cos_bit) {
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_sse2(x[0], x[15]);
  btf_16_adds_subs_sse2(x[1], x[14]);
  btf_16_adds_subs_sse2(x[2], x[13]);
  btf_16_adds_subs_sse2(x[3], x[12]);
  btf_16_adds_subs_sse2(x[4], x[11]);
  btf_16_adds_subs_sse2(x[5], x[10]);
  btf_16_adds_subs_sse2(x[6], x[9]);
  btf_16_adds_subs_sse2(x[7], x[8]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
}
583
idct32_stage9_sse2(__m128i * output,__m128i * x)584 static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
585 btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
586 btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
587 btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
588 btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
589 btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
590 btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
591 btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
592 btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
593 btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
594 btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
595 btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
596 btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
597 btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
598 btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
599 btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
600 btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
601 }
602
idct32_low1_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)603 static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output,
604 int8_t cos_bit) {
605 (void)cos_bit;
606 const int32_t *cospi = cospi_arr(INV_COS_BIT);
607
608 // stage 1
609 __m128i x[2];
610 x[0] = input[0];
611
612 // stage 2
613 // stage 3
614 // stage 4
615 // stage 5
616 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
617
618 // stage 6
619 // stage 7
620 // stage 8
621 // stage 9
622 output[0] = x[0];
623 output[31] = x[0];
624 output[1] = x[1];
625 output[30] = x[1];
626 output[2] = x[1];
627 output[29] = x[1];
628 output[3] = x[0];
629 output[28] = x[0];
630 output[4] = x[0];
631 output[27] = x[0];
632 output[5] = x[1];
633 output[26] = x[1];
634 output[6] = x[1];
635 output[25] = x[1];
636 output[7] = x[0];
637 output[24] = x[0];
638 output[8] = x[0];
639 output[23] = x[0];
640 output[9] = x[1];
641 output[22] = x[1];
642 output[10] = x[1];
643 output[21] = x[1];
644 output[11] = x[0];
645 output[20] = x[0];
646 output[12] = x[0];
647 output[19] = x[0];
648 output[13] = x[1];
649 output[18] = x[1];
650 output[14] = x[1];
651 output[17] = x[1];
652 output[15] = x[0];
653 output[16] = x[0];
654 }
655
// 32-point inverse DCT specialized for inputs where only the 8 lowest-
// frequency coefficients (input[0..7]) are non-zero.  Butterflies whose
// partner term is zero degenerate into plain copies (the x[i] = x[j]
// assignments below) or btf_16_ssse3 half-butterflies.
static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;  // rounding is hard-wired to INV_COS_BIT below
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // __rounding is forwarded to the shared stage helpers below.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: scatter the 8 non-zero inputs to their reordered slots.
  __m128i x[32];
  x[0] = input[0];
  x[4] = input[4];
  x[8] = input[2];
  x[12] = input[6];
  x[16] = input[1];
  x[20] = input[5];
  x[24] = input[3];
  x[28] = input[7];

  // stage 2: odd-half half-butterflies (zero partners dropped).
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3: half-butterflies plus degenerate add/sub (copies, since the
  // partner terms are zero at this point).
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];

  // stage 4: even-half half-butterfly, degenerate copies, shared rotations.
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5: DC half-butterfly, degenerate copies, shared upper stage.
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
  // stage 6: degenerate even copies plus shared upper stage.
  x[3] = x[0];
  x[2] = x[1];
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stages 7-9: shared with the full idct32 path.
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}
713
// 32-point inverse DCT specialized for inputs where only the 16 lowest-
// frequency coefficients (input[0..15]) are non-zero; stage-2/3/4/5
// butterflies with a zero partner use the btf_16_ssse3 half-butterfly form.
static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output,
                                   int8_t cos_bit) {
  (void)cos_bit;  // rounding is hard-wired to INV_COS_BIT below
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // __rounding is forwarded to the shared stage helpers below.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: scatter the 16 non-zero inputs to their reordered slots.
  __m128i x[32];
  x[0] = input[0];
  x[2] = input[8];
  x[4] = input[4];
  x[6] = input[12];
  x[8] = input[2];
  x[10] = input[10];
  x[12] = input[6];
  x[14] = input[14];
  x[16] = input[1];
  x[18] = input[9];
  x[20] = input[5];
  x[22] = input[13];
  x[24] = input[3];
  x[26] = input[11];
  x[28] = input[7];
  x[30] = input[15];

  // stage 2: odd-half half-butterflies (zero partners dropped).
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3: half-butterflies plus shared odd-half add/sub.
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  idct32_high16_stage3_sse2(x);

  // stage 4: even-half half-butterflies, add/sub, shared rotations.
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5: DC and cospi 16/48 half-butterflies, add/sub, shared upper.
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stages 7-9: shared with the full idct32 path.
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}
780
idct32_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)781 static void idct32_new_sse2(const __m128i *input, __m128i *output,
782 int8_t cos_bit) {
783 (void)cos_bit;
784 const int32_t *cospi = cospi_arr(INV_COS_BIT);
785 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
786
787 const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
788 const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
789 const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
790 const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
791 const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
792 const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
793 const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
794 const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
795 const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
796 const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
797 const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
798 const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
799 const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
800 const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
801 const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
802 const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
803 const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
804 const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
805 const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
806 const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
807 const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
808 const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
809 const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
810 const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
811 const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
812 const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
813 const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
814 const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
815 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
816 const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
817 const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
818 const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
819
820 // stage 1
821 __m128i x[32];
822 x[0] = input[0];
823 x[1] = input[16];
824 x[2] = input[8];
825 x[3] = input[24];
826 x[4] = input[4];
827 x[5] = input[20];
828 x[6] = input[12];
829 x[7] = input[28];
830 x[8] = input[2];
831 x[9] = input[18];
832 x[10] = input[10];
833 x[11] = input[26];
834 x[12] = input[6];
835 x[13] = input[22];
836 x[14] = input[14];
837 x[15] = input[30];
838 x[16] = input[1];
839 x[17] = input[17];
840 x[18] = input[9];
841 x[19] = input[25];
842 x[20] = input[5];
843 x[21] = input[21];
844 x[22] = input[13];
845 x[23] = input[29];
846 x[24] = input[3];
847 x[25] = input[19];
848 x[26] = input[11];
849 x[27] = input[27];
850 x[28] = input[7];
851 x[29] = input[23];
852 x[30] = input[15];
853 x[31] = input[31];
854
855 // stage 2
856 btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
857 btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
858 btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
859 btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
860 btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
861 btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
862 btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
863 btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
864
865 // stage 3
866 btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
867 btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
868 btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
869 btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
870 idct32_high16_stage3_sse2(x);
871
872 // stage 4
873 btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
874 btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
875 btf_16_adds_subs_sse2(x[8], x[9]);
876 btf_16_subs_adds_sse2(x[11], x[10]);
877 btf_16_adds_subs_sse2(x[12], x[13]);
878 btf_16_subs_adds_sse2(x[15], x[14]);
879 idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
880
881 // stage 5
882 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
883 btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
884 btf_16_adds_subs_sse2(x[4], x[5]);
885 btf_16_adds_subs_sse2(x[7], x[6]);
886 idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
887
888 // stage 6
889 btf_16_adds_subs_sse2(x[0], x[3]);
890 btf_16_adds_subs_sse2(x[1], x[2]);
891 idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
892
893 // stage 7~8
894 idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
895 idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
896 idct32_stage9_sse2(output, x);
897 }
898
// Stage 4 of the 64-point inverse DCT, restricted to rows 32..63 (the
// high half). Performs eight rotation butterflies on symmetric index
// pairs (33,62), (34,61), ..., (46,49).
// NOTE(review): __rounding and cos_bit appear unused here but are
// presumably referenced by the btf_16_sse2 macro expansion — confirm
// against av1_txfm_sse2.h.
static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  // Cosine-pair constants for the four rotation angles (4, 36, 20, 52).
  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
  const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
  const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
  // Two butterflies per angle: a +rotation and a -rotation on adjacent
  // index pairs.
  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
  btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
  btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
}
923
// Stage 5 of the 64-point inverse DCT for rows 16..63: four rotation
// butterflies on the 16..31 group, then saturating add/sub butterflies
// across the 32..63 group in blocks of eight.
static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  // Rotations on the 16..31 group (angles 8 and 40).
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  // Add/sub butterflies on 32..63: each block of 8 pairs (b, b+3),
  // (b+1, b+2) as add/sub and (b+7, b+4), (b+6, b+5) as sub/add.
  btf_16_adds_subs_sse2(x[32], x[35]);
  btf_16_adds_subs_sse2(x[33], x[34]);
  btf_16_subs_adds_sse2(x[39], x[36]);
  btf_16_subs_adds_sse2(x[38], x[37]);
  btf_16_adds_subs_sse2(x[40], x[43]);
  btf_16_adds_subs_sse2(x[41], x[42]);
  btf_16_subs_adds_sse2(x[47], x[44]);
  btf_16_subs_adds_sse2(x[46], x[45]);
  btf_16_adds_subs_sse2(x[48], x[51]);
  btf_16_adds_subs_sse2(x[49], x[50]);
  btf_16_subs_adds_sse2(x[55], x[52]);
  btf_16_subs_adds_sse2(x[54], x[53]);
  btf_16_adds_subs_sse2(x[56], x[59]);
  btf_16_adds_subs_sse2(x[57], x[58]);
  btf_16_subs_adds_sse2(x[63], x[60]);
  btf_16_subs_adds_sse2(x[62], x[61]);
}
954
// Stage 6 of the 64-point inverse DCT, rows 32..63 only: eight rotation
// butterflies using angles 8 and 40, applied to two adjacent index pairs
// per constant pair.
static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
  btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
}
973
idct64_stage6_high48_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)974 static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
975 const __m128i __rounding,
976 int8_t cos_bit) {
977 btf_16_adds_subs_sse2(x[16], x[19]);
978 btf_16_adds_subs_sse2(x[17], x[18]);
979 btf_16_subs_adds_sse2(x[23], x[20]);
980 btf_16_subs_adds_sse2(x[22], x[21]);
981 btf_16_adds_subs_sse2(x[24], x[27]);
982 btf_16_adds_subs_sse2(x[25], x[26]);
983 btf_16_subs_adds_sse2(x[31], x[28]);
984 btf_16_subs_adds_sse2(x[30], x[29]);
985 idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
986 }
987
// Stage 7 of the 64-point inverse DCT for rows 16..63: four rotation
// butterflies (angle 16) on the middle of the 16..31 group, then add/sub
// butterflies across 32..63 in two blocks of sixteen.
static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
  // Block 32..47: pairs (i, 71 - i).
  btf_16_adds_subs_sse2(x[32], x[39]);
  btf_16_adds_subs_sse2(x[33], x[38]);
  btf_16_adds_subs_sse2(x[34], x[37]);
  btf_16_adds_subs_sse2(x[35], x[36]);
  btf_16_subs_adds_sse2(x[47], x[40]);
  btf_16_subs_adds_sse2(x[46], x[41]);
  btf_16_subs_adds_sse2(x[45], x[42]);
  btf_16_subs_adds_sse2(x[44], x[43]);
  // Block 48..63: same pattern shifted by 16.
  btf_16_adds_subs_sse2(x[48], x[55]);
  btf_16_adds_subs_sse2(x[49], x[54]);
  btf_16_adds_subs_sse2(x[50], x[53]);
  btf_16_adds_subs_sse2(x[51], x[52]);
  btf_16_subs_adds_sse2(x[63], x[56]);
  btf_16_subs_adds_sse2(x[62], x[57]);
  btf_16_subs_adds_sse2(x[61], x[58]);
  btf_16_subs_adds_sse2(x[60], x[59]);
}
1015
// Stage 8 of the 64-point inverse DCT for rows 16..63: add/sub butterflies
// on the 16..31 group, then eight angle-16 rotations on the middle of the
// 32..63 group.
static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  btf_16_adds_subs_sse2(x[16], x[23]);
  btf_16_adds_subs_sse2(x[17], x[22]);
  btf_16_adds_subs_sse2(x[18], x[21]);
  btf_16_adds_subs_sse2(x[19], x[20]);
  btf_16_subs_adds_sse2(x[31], x[24]);
  btf_16_subs_adds_sse2(x[30], x[25]);
  btf_16_subs_adds_sse2(x[29], x[26]);
  btf_16_subs_adds_sse2(x[28], x[27]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
}
1039
idct64_stage9_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1040 static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
1041 const __m128i __rounding,
1042 int8_t cos_bit) {
1043 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1044 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1045 btf_16_adds_subs_sse2(x[0], x[15]);
1046 btf_16_adds_subs_sse2(x[1], x[14]);
1047 btf_16_adds_subs_sse2(x[2], x[13]);
1048 btf_16_adds_subs_sse2(x[3], x[12]);
1049 btf_16_adds_subs_sse2(x[4], x[11]);
1050 btf_16_adds_subs_sse2(x[5], x[10]);
1051 btf_16_adds_subs_sse2(x[6], x[9]);
1052 btf_16_adds_subs_sse2(x[7], x[8]);
1053 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
1054 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
1055 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
1056 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
1057 btf_16_adds_subs_sse2(x[32], x[47]);
1058 btf_16_adds_subs_sse2(x[33], x[46]);
1059 btf_16_adds_subs_sse2(x[34], x[45]);
1060 btf_16_adds_subs_sse2(x[35], x[44]);
1061 btf_16_adds_subs_sse2(x[36], x[43]);
1062 btf_16_adds_subs_sse2(x[37], x[42]);
1063 btf_16_adds_subs_sse2(x[38], x[41]);
1064 btf_16_adds_subs_sse2(x[39], x[40]);
1065 btf_16_subs_adds_sse2(x[63], x[48]);
1066 btf_16_subs_adds_sse2(x[62], x[49]);
1067 btf_16_subs_adds_sse2(x[61], x[50]);
1068 btf_16_subs_adds_sse2(x[60], x[51]);
1069 btf_16_subs_adds_sse2(x[59], x[52]);
1070 btf_16_subs_adds_sse2(x[58], x[53]);
1071 btf_16_subs_adds_sse2(x[57], x[54]);
1072 btf_16_subs_adds_sse2(x[56], x[55]);
1073 }
1074
idct64_stage10_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1075 static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
1076 const __m128i __rounding,
1077 int8_t cos_bit) {
1078 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1079 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1080 btf_16_adds_subs_sse2(x[0], x[31]);
1081 btf_16_adds_subs_sse2(x[1], x[30]);
1082 btf_16_adds_subs_sse2(x[2], x[29]);
1083 btf_16_adds_subs_sse2(x[3], x[28]);
1084 btf_16_adds_subs_sse2(x[4], x[27]);
1085 btf_16_adds_subs_sse2(x[5], x[26]);
1086 btf_16_adds_subs_sse2(x[6], x[25]);
1087 btf_16_adds_subs_sse2(x[7], x[24]);
1088 btf_16_adds_subs_sse2(x[8], x[23]);
1089 btf_16_adds_subs_sse2(x[9], x[22]);
1090 btf_16_adds_subs_sse2(x[10], x[21]);
1091 btf_16_adds_subs_sse2(x[11], x[20]);
1092 btf_16_adds_subs_sse2(x[12], x[19]);
1093 btf_16_adds_subs_sse2(x[13], x[18]);
1094 btf_16_adds_subs_sse2(x[14], x[17]);
1095 btf_16_adds_subs_sse2(x[15], x[16]);
1096 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
1097 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
1098 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
1099 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
1100 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
1101 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
1102 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
1103 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
1104 }
1105
idct64_stage11_sse2(__m128i * output,__m128i * x)1106 static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
1107 btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
1108 btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
1109 btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
1110 btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
1111 btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
1112 btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
1113 btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
1114 btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
1115 btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
1116 btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
1117 btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
1118 btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
1119 btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
1120 btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
1121 btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
1122 btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
1123 btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
1124 btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
1125 btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
1126 btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
1127 btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
1128 btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
1129 btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
1130 btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
1131 btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
1132 btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
1133 btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
1134 btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
1135 btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
1136 btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
1137 btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
1138 btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
1139 }
1140
idct64_low1_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)1141 static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output,
1142 int8_t cos_bit) {
1143 (void)cos_bit;
1144 const int32_t *cospi = cospi_arr(INV_COS_BIT);
1145
1146 // stage 1
1147 __m128i x[32];
1148 x[0] = input[0];
1149
1150 // stage 2
1151 // stage 3
1152 // stage 4
1153 // stage 5
1154 // stage 6
1155 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
1156
1157 // stage 7
1158 // stage 8
1159 // stage 9
1160 // stage 10
1161 // stage 11
1162 output[0] = x[0];
1163 output[63] = x[0];
1164 output[1] = x[1];
1165 output[62] = x[1];
1166 output[2] = x[1];
1167 output[61] = x[1];
1168 output[3] = x[0];
1169 output[60] = x[0];
1170 output[4] = x[0];
1171 output[59] = x[0];
1172 output[5] = x[1];
1173 output[58] = x[1];
1174 output[6] = x[1];
1175 output[57] = x[1];
1176 output[7] = x[0];
1177 output[56] = x[0];
1178 output[8] = x[0];
1179 output[55] = x[0];
1180 output[9] = x[1];
1181 output[54] = x[1];
1182 output[10] = x[1];
1183 output[53] = x[1];
1184 output[11] = x[0];
1185 output[52] = x[0];
1186 output[12] = x[0];
1187 output[51] = x[0];
1188 output[13] = x[1];
1189 output[50] = x[1];
1190 output[14] = x[1];
1191 output[49] = x[1];
1192 output[15] = x[0];
1193 output[48] = x[0];
1194 output[16] = x[0];
1195 output[47] = x[0];
1196 output[17] = x[1];
1197 output[46] = x[1];
1198 output[18] = x[1];
1199 output[45] = x[1];
1200 output[19] = x[0];
1201 output[44] = x[0];
1202 output[20] = x[0];
1203 output[43] = x[0];
1204 output[21] = x[1];
1205 output[42] = x[1];
1206 output[22] = x[1];
1207 output[41] = x[1];
1208 output[23] = x[0];
1209 output[40] = x[0];
1210 output[24] = x[0];
1211 output[39] = x[0];
1212 output[25] = x[1];
1213 output[38] = x[1];
1214 output[26] = x[1];
1215 output[37] = x[1];
1216 output[27] = x[0];
1217 output[36] = x[0];
1218 output[28] = x[0];
1219 output[35] = x[0];
1220 output[29] = x[1];
1221 output[34] = x[1];
1222 output[30] = x[1];
1223 output[33] = x[1];
1224 output[31] = x[0];
1225 output[32] = x[0];
1226 }
1227
// 64-point inverse DCT (SSSE3) specialized for inputs where only the
// first 8 coefficient rows can be non-zero ("low8"). Stages whose inputs
// are known zero collapse into copies or single-input btf_16_ssse3
// butterflies; the shared stage helpers finish the network.
static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  // Cosine pairs for the rotation butterflies that survive the pruning.
  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: scatter the 8 live rows into the 64-entry working vector.
  __m128i x[64];
  x[0] = input[0];
  x[8] = input[4];
  x[16] = input[2];
  x[24] = input[6];
  x[32] = input[1];
  x[40] = input[5];
  x[48] = input[3];
  x[56] = input[7];

  // stage 2: single-input butterflies (the pair partners are zero).
  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3: butterflies on the 16/24 rows; the add/sub halves with zero
  // partners reduce to plain copies.
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[38] = x[39];
  x[41] = x[40];
  x[46] = x[47];
  x[49] = x[48];
  x[54] = x[55];
  x[57] = x[56];
  x[62] = x[63];

  // stage 4
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  x[17] = x[16];
  x[22] = x[23];
  x[25] = x[24];
  x[30] = x[31];
  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);

  // stage 5
  x[9] = x[8];
  x[14] = x[15];
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  x[35] = x[32];
  x[34] = x[33];
  x[36] = x[39];
  x[37] = x[38];
  x[43] = x[40];
  x[42] = x[41];
  x[44] = x[47];
  x[45] = x[46];
  x[51] = x[48];
  x[50] = x[49];
  x[52] = x[55];
  x[53] = x[54];
  x[59] = x[56];
  x[58] = x[57];
  x[60] = x[63];
  x[61] = x[62];

  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  x[19] = x[16];
  x[18] = x[17];
  x[20] = x[23];
  x[21] = x[22];
  x[27] = x[24];
  x[26] = x[25];
  x[28] = x[31];
  x[29] = x[30];
  idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  x[11] = x[8];
  x[10] = x[9];
  x[12] = x[15];
  x[13] = x[14];
  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 8
  x[7] = x[0];
  x[6] = x[1];
  x[5] = x[2];
  x[4] = x[3];
  // No-op self-assignment; x[9] is already in place at this stage.
  x[9] = x[9];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

  // stages 9-11: shared tail of the full idct64 network.
  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage11_sse2(output, x);
}
1348
// 64-point inverse DCT (SSSE3) specialized for inputs where only the
// first 16 coefficient rows can be non-zero ("low16"). Zero-partner
// butterflies collapse to single-input btf_16_ssse3 calls or copies; the
// shared stage helpers handle the untouched parts of the network.
static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output,
                                   int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: scatter the 16 live rows into the 64-entry working vector.
  __m128i x[64];
  x[0] = input[0];
  x[4] = input[8];
  x[8] = input[4];
  x[12] = input[12];
  x[16] = input[2];
  x[20] = input[10];
  x[24] = input[6];
  x[28] = input[14];
  x[32] = input[1];
  x[36] = input[9];
  x[40] = input[5];
  x[44] = input[13];
  x[48] = input[3];
  x[52] = input[11];
  x[56] = input[7];
  x[60] = input[15];

  // stage 2: single-input butterflies on the 32..63 half.
  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3: butterflies on 16..31; the zero-partner add/sub halves
  // reduce to plain copies on 32..63.
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[34] = x[35];
  x[37] = x[36];
  x[38] = x[39];
  x[41] = x[40];
  x[42] = x[43];
  x[45] = x[44];
  x[46] = x[47];
  x[49] = x[48];
  x[50] = x[51];
  x[53] = x[52];
  x[54] = x[55];
  x[57] = x[56];
  x[58] = x[59];
  x[61] = x[60];
  x[62] = x[63];

  // stage 4
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];
  idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 8
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

  // stages 9-11: shared tail of the full idct64 network.
  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage11_sse2(output, x);
}
1464
idct64_low32_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)1465 static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output,
1466 int8_t cos_bit) {
1467 (void)cos_bit;
1468 const int32_t *cospi = cospi_arr(INV_COS_BIT);
1469 const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
1470
1471 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1472 const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
1473 const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
1474 const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
1475 const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1476
1477 // stage 1
1478 __m128i x[64];
1479 x[0] = input[0];
1480 x[2] = input[16];
1481 x[4] = input[8];
1482 x[6] = input[24];
1483 x[8] = input[4];
1484 x[10] = input[20];
1485 x[12] = input[12];
1486 x[14] = input[28];
1487 x[16] = input[2];
1488 x[18] = input[18];
1489 x[20] = input[10];
1490 x[22] = input[26];
1491 x[24] = input[6];
1492 x[26] = input[22];
1493 x[28] = input[14];
1494 x[30] = input[30];
1495 x[32] = input[1];
1496 x[34] = input[17];
1497 x[36] = input[9];
1498 x[38] = input[25];
1499 x[40] = input[5];
1500 x[42] = input[21];
1501 x[44] = input[13];
1502 x[46] = input[29];
1503 x[48] = input[3];
1504 x[50] = input[19];
1505 x[52] = input[11];
1506 x[54] = input[27];
1507 x[56] = input[7];
1508 x[58] = input[23];
1509 x[60] = input[15];
1510 x[62] = input[31];
1511
1512 // stage 2
1513 btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
1514 btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
1515 btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
1516 btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
1517 btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
1518 btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
1519 btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
1520 btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
1521 btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
1522 btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
1523 btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
1524 btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
1525 btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
1526 btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
1527 btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
1528 btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
1529
1530 // stage 3
1531 btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
1532 btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
1533 btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
1534 btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
1535 btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
1536 btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
1537 btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
1538 btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
1539 btf_16_adds_subs_sse2(x[32], x[33]);
1540 btf_16_subs_adds_sse2(x[35], x[34]);
1541 btf_16_adds_subs_sse2(x[36], x[37]);
1542 btf_16_subs_adds_sse2(x[39], x[38]);
1543 btf_16_adds_subs_sse2(x[40], x[41]);
1544 btf_16_subs_adds_sse2(x[43], x[42]);
1545 btf_16_adds_subs_sse2(x[44], x[45]);
1546 btf_16_subs_adds_sse2(x[47], x[46]);
1547 btf_16_adds_subs_sse2(x[48], x[49]);
1548 btf_16_subs_adds_sse2(x[51], x[50]);
1549 btf_16_adds_subs_sse2(x[52], x[53]);
1550 btf_16_subs_adds_sse2(x[55], x[54]);
1551 btf_16_adds_subs_sse2(x[56], x[57]);
1552 btf_16_subs_adds_sse2(x[59], x[58]);
1553 btf_16_adds_subs_sse2(x[60], x[61]);
1554 btf_16_subs_adds_sse2(x[63], x[62]);
1555
1556 // stage 4
1557 btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
1558 btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
1559 btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
1560 btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
1561 btf_16_adds_subs_sse2(x[16], x[17]);
1562 btf_16_subs_adds_sse2(x[19], x[18]);
1563 btf_16_adds_subs_sse2(x[20], x[21]);
1564 btf_16_subs_adds_sse2(x[23], x[22]);
1565 btf_16_adds_subs_sse2(x[24], x[25]);
1566 btf_16_subs_adds_sse2(x[27], x[26]);
1567 btf_16_adds_subs_sse2(x[28], x[29]);
1568 btf_16_subs_adds_sse2(x[31], x[30]);
1569 idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
1570
1571 // stage 5
1572 btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
1573 btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
1574 btf_16_adds_subs_sse2(x[8], x[9]);
1575 btf_16_subs_adds_sse2(x[11], x[10]);
1576 btf_16_adds_subs_sse2(x[12], x[13]);
1577 btf_16_subs_adds_sse2(x[15], x[14]);
1578 idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
1579
1580 // stage 6
1581 btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
1582 btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
1583 btf_16_adds_subs_sse2(x[4], x[5]);
1584 btf_16_subs_adds_sse2(x[7], x[6]);
1585 btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
1586 btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
1587 idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
1588
1589 // stage 7
1590 btf_16_adds_subs_sse2(x[0], x[3]);
1591 btf_16_adds_subs_sse2(x[1], x[2]);
1592 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
1593 btf_16_adds_subs_sse2(x[8], x[11]);
1594 btf_16_adds_subs_sse2(x[9], x[10]);
1595 btf_16_subs_adds_sse2(x[15], x[12]);
1596 btf_16_subs_adds_sse2(x[14], x[13]);
1597 idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
1598
1599 // stage 8
1600 btf_16_adds_subs_sse2(x[0], x[7]);
1601 btf_16_adds_subs_sse2(x[1], x[6]);
1602 btf_16_adds_subs_sse2(x[2], x[5]);
1603 btf_16_adds_subs_sse2(x[3], x[4]);
1604 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
1605 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
1606 idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
1607
1608 // stage 9~11
1609 idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
1610 idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
1611 idct64_stage11_sse2(output, x);
1612 }
1613
void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;
  // 4-point inverse ADST on 8 columns of 16-bit coefficients.
  // Instead of butterfly rotations, the 4x4 sine matrix is evaluated
  // directly: _mm_madd_epi16 produces 32-bit sums of sinpi-weighted pairs,
  // which are combined, rounded and packed back to 16 bits at the end.
  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
  __m128i x0[4];
  x0[0] = input[0];
  x0[1] = input[1];
  x0[2] = input[2];
  x0[3] = input[3];

  // Interleave rows (0,2) and (1,3) so each 32-bit madd lane multiplies one
  // coefficient from each row of the pair against a (sinA, sinB) constant.
  __m128i u[4];
  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
  u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
  u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
  u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);

  // In the comments below x0..x3 denote the four input rows.
  __m128i x1[16];
  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
  x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
  x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
  x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
  x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02);  // x1*sin3 + x3*sin2
  x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
  x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04);  // x1*sin3 - x3*sin4
  x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
  x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
  x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
  x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x3*sin3
  x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
  x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
  x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
  x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
  x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);

  __m128i x2[8];
  x2[0] = _mm_add_epi32(x1[0], x1[4]);  // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
  x2[1] = _mm_add_epi32(x1[1], x1[5]);
  x2[2] = _mm_add_epi32(x1[2], x1[6]);  // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
  x2[3] = _mm_add_epi32(x1[3], x1[7]);
  x2[4] = _mm_add_epi32(x1[8], x1[10]);  // x0*sin3 -x2*sin3 +x3*sin3
  x2[5] = _mm_add_epi32(x1[9], x1[11]);
  x2[6] = _mm_add_epi32(x1[12], x1[14]);  // x0*sin4 +x2*sin2 -x1*sin3 -x3*sin1
  x2[7] = _mm_add_epi32(x1[13], x1[15]);

  // Round-shift the 32-bit accumulators back down to 16-bit output rows.
  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  for (int i = 0; i < 4; ++i) {
    __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
    __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
    out1 = _mm_srai_epi32(out1, INV_COS_BIT);
    output[i] = _mm_packs_epi32(out0, out1);
  }
}
1674
// TODO(binpengsmail@gmail.com):
// Explore reusing the VP9 versions of the corresponding SSE2 functions and
// evaluate whether they would provide any further speedup.
void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;
  // 4-point inverse ADST for 4-wide columns: same math as iadst4_new_sse2
  // but only the low 4 lanes of each row carry data, so a single unpacklo
  // per row pair suffices and the final pack duplicates its input.
  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
  __m128i x0[4];
  x0[0] = input[0];
  x0[1] = input[1];
  x0[2] = input[2];
  x0[3] = input[3];

  // Interleave rows (0,2) and (1,3) for the pairwise madd below.
  __m128i u[2];
  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
  u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);

  // In the comments below x0..x3 denote the four input rows.
  __m128i x1[8];
  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
  x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
  x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02);  // x1*sin3 + x3*sin2
  x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04);  // x1*sin3 - x3*sin4
  x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
  x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03);    // x3*sin3
  x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
  x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01);  // -x1*sin3 - x3*sin1

  __m128i x2[4];
  x2[0] = _mm_add_epi32(x1[0], x1[2]);  // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
  x2[1] = _mm_add_epi32(x1[1], x1[3]);  // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
  x2[2] = _mm_add_epi32(x1[4], x1[5]);  // x0*sin3 - x2*sin3 + x3*sin3
  x2[3] = _mm_add_epi32(x1[6], x1[7]);  // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1

  // Round-shift back to 16 bits; packs duplicates the 4 results into both
  // halves, only the low half is meaningful for w4 blocks.
  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  for (int i = 0; i < 4; ++i) {
    __m128i out0 = _mm_add_epi32(x2[i], rounding);
    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
    output[i] = _mm_packs_epi32(out0, out0);
  }
}
1722
static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  // 8-point inverse ADST specialized for inputs where only the first
  // coefficient is nonzero. Butterflies against all-zero lanes degenerate
  // into plain copies, so stages 3 and 5 are simple duplications.
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __zero = _mm_setzero_si128();
  // NOTE(review): __rounding/cos_bit look unused here but are presumably
  // referenced by name inside the btf_16_* macros -- confirm in the header.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1
  __m128i x[8];
  x[1] = input[0];

  // stage 2
  // Single-input rotation by (cospi60, -cospi4).
  btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);

  // stage 3
  // With the remaining inputs zero, the add/sub butterflies reduce to copies.
  x[4] = x[0];
  x[5] = x[1];

  // stage 4
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);

  // stage 5
  x[2] = x[0];
  x[3] = x[1];
  x[6] = x[4];
  x[7] = x[5];

  // stage 6
  // +/-pi/4 rotations.
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

  // stage 7
  // Output permutation with alternating saturating negation (0 - x).
  output[0] = x[0];
  output[1] = _mm_subs_epi16(__zero, x[4]);
  output[2] = x[6];
  output[3] = _mm_subs_epi16(__zero, x[2]);
  output[4] = x[3];
  output[5] = _mm_subs_epi16(__zero, x[7]);
  output[6] = x[5];
  output[7] = _mm_subs_epi16(__zero, x[1]);
}
1769
void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;
  // 8-point inverse ADST on 8 columns of 16-bit coefficients, built from
  // cospi butterfly rotations (btf_16_sse2) and saturating add/sub stages.
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __zero = _mm_setzero_si128();
  // NOTE(review): __rounding/cos_bit are presumably consumed by name inside
  // the btf_16_sse2 macro -- confirm in the header.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1
  // Input permutation: x[2k] = input[7 - 2k], x[2k + 1] = input[2k].
  __m128i x[8];
  x[0] = input[7];
  x[1] = input[0];
  x[2] = input[5];
  x[3] = input[2];
  x[4] = input[3];
  x[5] = input[4];
  x[6] = input[1];
  x[7] = input[6];

  // stage 2
  // Pairwise rotations by the odd cospi angles 4, 20, 36, 52.
  btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);

  // stage 3
  btf_16_adds_subs_sse2(x[0], x[4]);
  btf_16_adds_subs_sse2(x[1], x[5]);
  btf_16_adds_subs_sse2(x[2], x[6]);
  btf_16_adds_subs_sse2(x[3], x[7]);

  // stage 4
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);

  // stage 5
  btf_16_adds_subs_sse2(x[0], x[2]);
  btf_16_adds_subs_sse2(x[1], x[3]);
  btf_16_adds_subs_sse2(x[4], x[6]);
  btf_16_adds_subs_sse2(x[5], x[7]);

  // stage 6
  // Final +/-pi/4 rotations.
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

  // stage 7
  // Output permutation with alternating saturating negation (0 - x).
  output[0] = x[0];
  output[1] = _mm_subs_epi16(__zero, x[4]);
  output[2] = x[6];
  output[3] = _mm_subs_epi16(__zero, x[2]);
  output[4] = x[3];
  output[5] = _mm_subs_epi16(__zero, x[7]);
  output[6] = x[5];
  output[7] = _mm_subs_epi16(__zero, x[1]);
}
1837
void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;
  // 8-point inverse ADST for 4-wide columns. Identical flow-graph to
  // iadst8_new_sse2 but uses the btf_16_4p_sse2 butterfly variant, which
  // operates on rows that carry data in their low 4 lanes only.
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __zero = _mm_setzero_si128();
  // NOTE(review): __rounding/cos_bit are presumably consumed by name inside
  // the btf_16_4p_sse2 macro -- confirm in the header.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1
  // Input permutation: x[2k] = input[7 - 2k], x[2k + 1] = input[2k].
  __m128i x[8];
  x[0] = input[7];
  x[1] = input[0];
  x[2] = input[5];
  x[3] = input[2];
  x[4] = input[3];
  x[5] = input[4];
  x[6] = input[1];
  x[7] = input[6];

  // stage 2
  btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);

  // stage 3
  btf_16_adds_subs_sse2(x[0], x[4]);
  btf_16_adds_subs_sse2(x[1], x[5]);
  btf_16_adds_subs_sse2(x[2], x[6]);
  btf_16_adds_subs_sse2(x[3], x[7]);

  // stage 4
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);

  // stage 5
  btf_16_adds_subs_sse2(x[0], x[2]);
  btf_16_adds_subs_sse2(x[1], x[3]);
  btf_16_adds_subs_sse2(x[4], x[6]);
  btf_16_adds_subs_sse2(x[5], x[7]);

  // stage 6
  // Final +/-pi/4 rotations.
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

  // stage 7
  // Output permutation with alternating saturating negation (0 - x).
  output[0] = x[0];
  output[1] = _mm_subs_epi16(__zero, x[4]);
  output[2] = x[6];
  output[3] = _mm_subs_epi16(__zero, x[2]);
  output[4] = x[3];
  output[5] = _mm_subs_epi16(__zero, x[7]);
  output[6] = x[5];
  output[7] = _mm_subs_epi16(__zero, x[1]);
}
1905
iadst16_stage3_ssse3(__m128i * x)1906 static INLINE void iadst16_stage3_ssse3(__m128i *x) {
1907 btf_16_adds_subs_sse2(x[0], x[8]);
1908 btf_16_adds_subs_sse2(x[1], x[9]);
1909 btf_16_adds_subs_sse2(x[2], x[10]);
1910 btf_16_adds_subs_sse2(x[3], x[11]);
1911 btf_16_adds_subs_sse2(x[4], x[12]);
1912 btf_16_adds_subs_sse2(x[5], x[13]);
1913 btf_16_adds_subs_sse2(x[6], x[14]);
1914 btf_16_adds_subs_sse2(x[7], x[15]);
1915 }
1916
// Stage 4 of the 16-point inverse ADST: cospi rotations on the high-half
// lane pairs (8,9), (10,11), (12,13), (14,15).
// NOTE(review): __rounding/cos_bit are presumably consumed by name inside
// the btf_16_sse2 macro -- confirm in the header.
static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
                                        const __m128i __rounding,
                                        int8_t cos_bit) {
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
  btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
  btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
}
1931
iadst16_stage5_ssse3(__m128i * x)1932 static INLINE void iadst16_stage5_ssse3(__m128i *x) {
1933 btf_16_adds_subs_sse2(x[0], x[4]);
1934 btf_16_adds_subs_sse2(x[1], x[5]);
1935 btf_16_adds_subs_sse2(x[2], x[6]);
1936 btf_16_adds_subs_sse2(x[3], x[7]);
1937 btf_16_adds_subs_sse2(x[8], x[12]);
1938 btf_16_adds_subs_sse2(x[9], x[13]);
1939 btf_16_adds_subs_sse2(x[10], x[14]);
1940 btf_16_adds_subs_sse2(x[11], x[15]);
1941 }
1942
// Stage 6 of the 16-point inverse ADST: 16/48 cospi rotations on lane pairs
// (4,5), (6,7), (12,13), (14,15).
// NOTE(review): __rounding/cos_bit are presumably consumed by name inside
// the btf_16_sse2 macro -- confirm in the header.
static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
                                        const __m128i __rounding,
                                        int8_t cos_bit) {
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
}
1954
iadst16_stage7_ssse3(__m128i * x)1955 static INLINE void iadst16_stage7_ssse3(__m128i *x) {
1956 btf_16_adds_subs_sse2(x[0], x[2]);
1957 btf_16_adds_subs_sse2(x[1], x[3]);
1958 btf_16_adds_subs_sse2(x[4], x[6]);
1959 btf_16_adds_subs_sse2(x[5], x[7]);
1960 btf_16_adds_subs_sse2(x[8], x[10]);
1961 btf_16_adds_subs_sse2(x[9], x[11]);
1962 btf_16_adds_subs_sse2(x[12], x[14]);
1963 btf_16_adds_subs_sse2(x[13], x[15]);
1964 }
1965
iadst16_stage8_ssse3(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1966 static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
1967 const __m128i __rounding,
1968 int8_t cos_bit) {
1969 const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1970 const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
1971 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
1972 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
1973 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
1974 btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
1975 }
1976
// Stage 9 (final) of the 16-point inverse ADST: fixed output permutation in
// which odd-indexed outputs are negated with a saturating subtract from zero.
static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
  const __m128i __zero = _mm_setzero_si128();
  output[0] = x[0];
  output[1] = _mm_subs_epi16(__zero, x[8]);
  output[2] = x[12];
  output[3] = _mm_subs_epi16(__zero, x[4]);
  output[4] = x[6];
  output[5] = _mm_subs_epi16(__zero, x[14]);
  output[6] = x[10];
  output[7] = _mm_subs_epi16(__zero, x[2]);
  output[8] = x[3];
  output[9] = _mm_subs_epi16(__zero, x[11]);
  output[10] = x[15];
  output[11] = _mm_subs_epi16(__zero, x[7]);
  output[12] = x[5];
  output[13] = _mm_subs_epi16(__zero, x[13]);
  output[14] = x[9];
  output[15] = _mm_subs_epi16(__zero, x[1]);
}
1996
static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output,
                                   int8_t cos_bit) {
  (void)cos_bit;
  // 16-point inverse ADST specialized for inputs where only the first
  // coefficient is nonzero. Since all other lanes are zero, the add/sub
  // butterflies of stages 3, 5 and 7 reduce to plain copies; only the
  // rotation stages do real work.
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE(review): __rounding/cos_bit are presumably consumed by name inside
  // the btf_16_* macros -- confirm in the header.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);

  // stage 1
  __m128i x[16];
  x[1] = input[0];

  // stage 2
  // Single-input rotation by (cospi62, -cospi2).
  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);

  // stage 3 (copies, see above)
  x[8] = x[0];
  x[9] = x[1];

  // stage 4
  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);

  // stage 5 (copies)
  x[4] = x[0];
  x[5] = x[1];
  x[12] = x[8];
  x[13] = x[9];

  // stage 6
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);

  // stage 7 (copies)
  x[2] = x[0];
  x[3] = x[1];
  x[6] = x[4];
  x[7] = x[5];
  x[10] = x[8];
  x[11] = x[9];
  x[14] = x[12];
  x[15] = x[13];

  // stages 8-9: shared with the other iadst16 variants.
  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage9_ssse3(output, x);
}
2045
static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output,
                                   int8_t cos_bit) {
  (void)cos_bit;
  // 16-point inverse ADST specialized for inputs where only the first 8
  // coefficients are nonzero: stage 2 uses the single-input btf_16_ssse3
  // rotation (the zero partner is implicit), then stages 3-9 run the full
  // shared flow-graph.
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE(review): __rounding/cos_bit are presumably consumed by name inside
  // the btf_16_* macros -- confirm in the header.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1
  // Only 8 coefficients are loaded; the input permutation matches the full
  // iadst16 ordering restricted to input[0..7].
  __m128i x[16];
  x[1] = input[0];
  x[3] = input[2];
  x[5] = input[4];
  x[7] = input[6];
  x[8] = input[7];
  x[10] = input[5];
  x[12] = input[3];
  x[14] = input[1];

  // stage 2
  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
  btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
  btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
  btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
  btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
  btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
  btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
  btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);

  // stage 3 onwards: shared with the other iadst16 variants.
  iadst16_stage3_ssse3(x);
  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage5_ssse3(x);
  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage7_ssse3(x);
  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage9_ssse3(output, x);
}
void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;
  // 16-point inverse ADST on 8 columns of 16-bit coefficients. Stages 1-2
  // are done inline; stages 3-9 use the helpers shared with the low1/low8
  // variants.
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE(review): __rounding/cos_bit are presumably consumed by name inside
  // the btf_16_sse2 macro -- confirm in the header.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);

  // stage 1
  // Input permutation: x[2k] = input[15 - 2k], x[2k + 1] = input[2k].
  __m128i x[16];
  x[0] = input[15];
  x[1] = input[0];
  x[2] = input[13];
  x[3] = input[2];
  x[4] = input[11];
  x[5] = input[4];
  x[6] = input[9];
  x[7] = input[6];
  x[8] = input[7];
  x[9] = input[8];
  x[10] = input[5];
  x[11] = input[10];
  x[12] = input[3];
  x[13] = input[12];
  x[14] = input[1];
  x[15] = input[14];

  // stage 2
  // Pairwise rotations by the odd cospi angles 2, 10, 18, ..., 58.
  btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
  btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
  btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
  btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
  btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);

  // stage 3~9
  iadst16_stage3_ssse3(x);
  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage5_ssse3(x);
  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage7_ssse3(x);
  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage9_ssse3(output, x);
}
2141
void iadst16_w4_new_sse2(const __m128i *input, __m128i *output,
                         int8_t cos_bit) {
  (void)cos_bit;
  // 16-point inverse ADST for 4-wide columns. Same flow-graph as
  // iadst16_new_sse2 but every rotation uses the btf_16_4p_sse2 variant
  // (rows carry data in their low 4 lanes only), so the rotation stages are
  // expanded inline instead of calling the shared stage helpers.
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // NOTE(review): __rounding/cos_bit are presumably consumed by name inside
  // the btf_16_4p_sse2 macro -- confirm in the header.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1
  // Input permutation: x[2k] = input[15 - 2k], x[2k + 1] = input[2k].
  __m128i x[16];
  x[0] = input[15];
  x[1] = input[0];
  x[2] = input[13];
  x[3] = input[2];
  x[4] = input[11];
  x[5] = input[4];
  x[6] = input[9];
  x[7] = input[6];
  x[8] = input[7];
  x[9] = input[8];
  x[10] = input[5];
  x[11] = input[10];
  x[12] = input[3];
  x[13] = input[12];
  x[14] = input[1];
  x[15] = input[14];

  // stage 2
  btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
  btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);

  // stage 3
  iadst16_stage3_ssse3(x);

  // stage 4 (4-column variant of iadst16_stage4_ssse3)
  btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);

  // stage 5
  iadst16_stage5_ssse3(x);

  // stage 6 (4-column variant of iadst16_stage6_ssse3)
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);

  // stage 7
  iadst16_stage7_ssse3(x);

  // stage 8 (4-column variant of iadst16_stage8_ssse3)
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);

  // stage 9
  iadst16_stage9_ssse3(output, x);
}
2235
// 4-point inverse identity transform: scales each coefficient by sqrt(2).
// Computed as x + x*(sqrt2 - 1): the fractional remainder of NewSqrt2 above
// the fixed-point "1.0" (1 << NewSqrt2Bits) is applied with a Q15 rounded
// multiply (_mm_mulhrs_epi16), then added back to the input with saturation.
static void iidentity4_new_ssse3(const __m128i *input, __m128i *output,
                                 int8_t cos_bit) {
  (void)cos_bit;
  const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
  // Promote the fractional scale from Q(NewSqrt2Bits) to Q15 for mulhrs.
  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
  for (int i = 0; i < 4; ++i) {
    __m128i x = _mm_mulhrs_epi16(input[i], scale);
    output[i] = _mm_adds_epi16(x, input[i]);
  }
}
2246
// 8-point inverse identity transform: every 16-bit coefficient is doubled
// with saturation (the 8-point identity scale factor is exactly 2).
static void iidentity8_new_sse2(const __m128i *input, __m128i *output,
                                int8_t cos_bit) {
  (void)cos_bit;
  int j = 0;
  do {
    output[j] = _mm_adds_epi16(input[j], input[j]);
  } while (++j < 8);
}
2254
// 16-point inverse identity transform: scales each coefficient by
// 2*sqrt(2). Computed as 2*x + x*2*(sqrt2 - 1): the doubled fractional part
// is applied with a Q15 rounded multiply (_mm_mulhrs_epi16) and added to
// the saturating doubled input.
static void iidentity16_new_ssse3(const __m128i *input, __m128i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
  // Promote the fractional scale from Q(NewSqrt2Bits) to Q15 for mulhrs.
  const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
  for (int i = 0; i < 16; ++i) {
    __m128i x = _mm_mulhrs_epi16(input[i], scale);
    __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
    output[i] = _mm_adds_epi16(x, srcx2);
  }
}
2266
lowbd_get_recon_8x8_sse2(const __m128i pred,__m128i res)2267 static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
2268 __m128i res) {
2269 const __m128i zero = _mm_setzero_si128();
2270 __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
2271 return _mm_packus_epi16(x0, x0);
2272 }
2273
lowbd_write_buffer_4xn_sse2(__m128i * in,uint8_t * output,int stride,int flipud,const int height)2274 static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
2275 int stride, int flipud,
2276 const int height) {
2277 int j = flipud ? (height - 1) : 0;
2278 const int step = flipud ? -1 : 1;
2279 const __m128i zero = _mm_setzero_si128();
2280 for (int i = 0; i < height; ++i, j += step) {
2281 const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride)));
2282 __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
2283 u = _mm_packus_epi16(u, zero);
2284 *((uint32_t *)(output + i * stride)) = _mm_cvtsi128_si32(u);
2285 }
2286 }
2287
lowbd_write_buffer_8xn_sse2(__m128i * in,uint8_t * output,int stride,int flipud,const int height)2288 static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
2289 int stride, int flipud,
2290 const int height) {
2291 int j = flipud ? (height - 1) : 0;
2292 const int step = flipud ? -1 : 1;
2293 for (int i = 0; i < height; ++i, j += step) {
2294 const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
2295 const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
2296 _mm_storel_epi64((__m128i *)(output + i * stride), u);
2297 }
2298 }
2299
// 1D functions process 8 pixels at one time.
// Lookup: [tx size index][1D tx type (DCT/ADST/IDTX)] -> 8-pixel-wide 1D
// inverse transform kernel. NULL entries are size/type combinations with
// no kernel (ADST/IDTX are not defined for 32- and 64-point transforms).
static const transform_1d_ssse3
    lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
      { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 },
      { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 },
      { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 },
      { idct32_new_sse2, NULL, NULL },
      { idct64_low32_new_ssse3, NULL, NULL },
    };
2309
// Functions for blocks whose eob lies at DC or within the
// top-left 8x8, 16x16, or 32x32 corner.
// Lookup: [tx size index][1D tx type][eob class] -> 1D inverse transform
// kernel specialized for the number of nonzero input coefficients
// (low1 / low8 / low16 / full, indexed via lowbd_txfm_all_1d_zeros_idx).
// NULL entries are unused combinations.
static const transform_1d_ssse3
    lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { idct4_new_sse2, idct4_new_sse2, NULL, NULL },
          { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL },
          { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL },
      },
      { { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL },
        { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL },
        { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } },
      {
          { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2,
            NULL },
          { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2,
            NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3,
          idct32_new_sse2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3,
          idct64_low32_new_ssse3 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
2338
// 1D functions process 4 pixels at one time.
// Used in 4x4, 4x8, 4x16, 8x4, 16x4 blocks.
// Lookup: [tx size index][1D tx type] -> 4-pixel-wide 1D inverse transform
// kernel. Only sizes up to 16 have 4-wide variants; larger sizes are NULL
// because no block shape needs them.
static const transform_1d_ssse3
    lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
      { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 },
      { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 },
      { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 },
      { NULL, NULL, NULL },
      { NULL, NULL, NULL },
    };
2349
// Identity row transform over 8 columns x 'height' rows of 32-bit input
// coefficients: out[i] = round_shift(in * NewSqrt2list[txw_idx],
// NewSqrt2Bits - shift). For 2:1/1:2 rectangular blocks (rect_type == 1
// or -1) the input is first scaled by 1/Sqrt2 (rect special code).
static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
                                           int stride, int shift, int height,
                                           int txw_idx, int rect_type) {
  const int32_t *input_row = input;
  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
  // Combined rounding term for the NewSqrt2Bits scale shift and the row
  // 'shift' (applied together as a single arithmetic right shift below).
  const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
                                          (1 << (NewSqrt2Bits - shift - 1)));
  const __m128i one = _mm_set1_epi16(1);
  // Interleave so _mm_madd_epi16 computes coeff * scale + 1 * rounding.
  const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
  if (rect_type != 1 && rect_type != -1) {
    for (int i = 0; i < height; ++i) {
      const __m128i src = load_32bit_to_16bit(input_row);
      input_row += stride;
      __m128i lo = _mm_unpacklo_epi16(src, one);
      __m128i hi = _mm_unpackhi_epi16(src, one);
      lo = _mm_madd_epi16(lo, scale_rounding);
      hi = _mm_madd_epi16(hi, scale_rounding);
      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm_packs_epi32(lo, hi);
    }
  } else {
    // Rectangular block: pre-multiply the coefficients by 1/Sqrt2 in Q15.
    const __m128i rect_scale =
        _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
    for (int i = 0; i < height; ++i) {
      __m128i src = load_32bit_to_16bit(input_row);
      src = _mm_mulhrs_epi16(src, rect_scale);
      input_row += stride;
      __m128i lo = _mm_unpacklo_epi16(src, one);
      __m128i hi = _mm_unpackhi_epi16(src, one);
      lo = _mm_madd_epi16(lo, scale_rounding);
      hi = _mm_madd_epi16(hi, scale_rounding);
      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm_packs_epi32(lo, hi);
    }
  }
}
2388
// Identity column transform plus reconstruction over 8 columns x 'height'
// rows: scales buf[h] by NewSqrt2list[txh_idx], applies the final down
// shift ('shift' is negative here, hence the -shift right shifts), adds
// the predictor row from 'output' and stores the clamped uint8_t pixels.
static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
                                           __m128i *buf, int shift, int height,
                                           int txh_idx) {
  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
  const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
  const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
  const __m128i one = _mm_set1_epi16(1);
  // Interleave so _mm_madd_epi16 computes coeff * scale + 1 * rounding.
  const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
  const __m128i zero = _mm_setzero_si128();
  for (int h = 0; h < height; ++h) {
    __m128i lo = _mm_unpacklo_epi16(buf[h], one);
    __m128i hi = _mm_unpackhi_epi16(buf[h], one);
    lo = _mm_madd_epi16(lo, scale_coeff);
    hi = _mm_madd_epi16(hi, scale_coeff);
    lo = _mm_srai_epi32(lo, NewSqrt2Bits);
    hi = _mm_srai_epi32(hi, NewSqrt2Bits);
    lo = _mm_add_epi32(lo, shift_rounding);
    hi = _mm_add_epi32(hi, shift_rounding);
    lo = _mm_srai_epi32(lo, -shift);
    hi = _mm_srai_epi32(hi, -shift);
    __m128i x = _mm_packs_epi32(lo, hi);

    // Add the predictor pixels and clamp the result to [0, 255].
    const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
    x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
    const __m128i u = _mm_packus_epi16(x, x);
    _mm_storel_epi64((__m128i *)(output), u);
    output += stride;
  }
}
2418
lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t * input,uint8_t * output,int stride,TX_SIZE tx_size)2419 static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
2420 uint8_t *output, int stride,
2421 TX_SIZE tx_size) {
2422 const int8_t *shift = inv_txfm_shift_ls[tx_size];
2423 const int txw_idx = get_txw_idx(tx_size);
2424 const int txh_idx = get_txh_idx(tx_size);
2425 const int txfm_size_col = tx_size_wide[tx_size];
2426 const int txfm_size_row = tx_size_high[tx_size];
2427 const int input_stride = AOMMIN(32, txfm_size_col);
2428 const int row_max = AOMMIN(32, txfm_size_row);
2429 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
2430 __m128i buf[32];
2431
2432 for (int i = 0; i < (input_stride >> 3); ++i) {
2433 iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max,
2434 txw_idx, rect_type);
2435 iidentity_col_8xn_ssse3(output + 8 * i, stride, buf, shift[1], row_max,
2436 txh_idx);
2437 }
2438 }
2439
// 2D inverse transform + reconstruction for 4x4 blocks: row transform,
// transpose, column transform, final shift, then add to the predictor.
static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[4];
  const TX_SIZE tx_size = TX_4X4;
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
  // Transpose so the row transform operates along register lanes.
  transpose_16bit_4x4(buf, buf);
  row_txfm(buf, buf, cos_bit_row);
  if (lr_flip) {
    // Mirror the columns for FLIPADST-style horizontal transforms.
    __m128i temp[4];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_4x4(temp, buf);
  } else {
    transpose_16bit_4x4(buf, buf);
  }
  col_txfm(buf, buf, cos_bit_col);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2477
lowbd_get_recon_16x16_sse2(const __m128i pred,__m128i res0,__m128i res1)2478 static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
2479 __m128i res0, __m128i res1) {
2480 const __m128i zero = _mm_setzero_si128();
2481 __m128i x0 = _mm_unpacklo_epi8(pred, zero);
2482 __m128i x1 = _mm_unpackhi_epi8(pred, zero);
2483 x0 = _mm_adds_epi16(res0, x0);
2484 x1 = _mm_adds_epi16(res1, x1);
2485 return _mm_packus_epi16(x0, x1);
2486 }
2487
lowbd_write_buffer_16xn_sse2(__m128i * in,uint8_t * output,int stride,int flipud,int height)2488 static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output,
2489 int stride, int flipud,
2490 int height) {
2491 int j = flipud ? (height - 1) : 0;
2492 const int step = flipud ? -1 : 1;
2493 for (int i = 0; i < height; ++i, j += step) {
2494 __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
2495 __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
2496 _mm_storeu_si128((__m128i *)(output + i * stride), u);
2497 }
2498 }
2499
round_shift_ssse3(const __m128i * input,__m128i * output,int size)2500 static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output,
2501 int size) {
2502 const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
2503 for (int i = 0; i < size; ++i) {
2504 output[i] = _mm_mulhrs_epi16(input[i], scale);
2505 }
2506 }
2507
// Generic two-pass inverse transform + reconstruction for block sizes not
// handled by a dedicated small-size function. Only the 8-wide / 8-high
// strips that can contain nonzero coefficients (derived from eob) are
// processed, using eob-specialized 1D kernels.
static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m128i buf1[64 * 8];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Number of 8-wide / 8-high strips containing nonzero coefficients.
  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // Pass 1: row transforms; results are transposed into buf1 so the
  // column pass can run on contiguous registers.
  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    __m128i buf0[64];
    const int32_t *input_row = input + i * input_stride * 8;
    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
      __m128i *buf0_cur = buf0 + j * 8;
      load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
      transpose_16bit_8x8(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0, cos_bit_row);
    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    __m128i *_buf1 = buf1 + i * 8;
    if (lr_flip) {
      // Mirror the strips horizontally for FLIPADST row transforms.
      for (int j = 0; j < buf_size_w_div8; ++j) {
        __m128i temp[8];
        flip_buf_sse2(buf0 + 8 * j, temp, 8);
        transpose_16bit_8x8(temp,
                            _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
      }
    }
  }
  // Pass 2: column transforms, one 8-wide strip at a time.
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
    round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
  }

  // Add residuals to the predictor and store the reconstructed pixels.
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
                                   output + 16 * i, stride, ud_flip,
                                   txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
  }
}
2580
lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t * input,uint8_t * output,int stride,TX_TYPE tx_type,TX_SIZE tx_size,int eob)2581 static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
2582 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
2583 TX_SIZE tx_size, int eob) {
2584 const int8_t *shift = inv_txfm_shift_ls[tx_size];
2585 int eobx, eoby;
2586 get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
2587 const int txw_idx = get_txw_idx(tx_size);
2588 const int txh_idx = get_txh_idx(tx_size);
2589 const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
2590 const int txfm_size_col = tx_size_wide[tx_size];
2591 const int txfm_size_row = tx_size_high[tx_size];
2592 const int buf_size_w_div8 = (eobx + 8) >> 3;
2593 const int input_stride = AOMMIN(32, txfm_size_col);
2594 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
2595
2596 const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
2597 assert(fun_idx < 5);
2598 const transform_1d_ssse3 col_txfm =
2599 lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
2600
2601 assert(col_txfm != NULL);
2602
2603 int ud_flip, lr_flip;
2604 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2605 for (int i = 0; i < buf_size_w_div8; i++) {
2606 __m128i buf0[64];
2607 iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0],
2608 eoby + 1, txw_idx, rect_type);
2609 col_txfm(buf0, buf0, cos_bit_col);
2610 __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
2611 int k = ud_flip ? (txfm_size_row - 1) : 0;
2612 const int step = ud_flip ? -1 : 1;
2613 uint8_t *out = output + 8 * i;
2614 for (int j = 0; j < txfm_size_row; ++j, k += step) {
2615 const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
2616 __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
2617 const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
2618 _mm_storel_epi64((__m128i *)(out), u);
2619 out += stride;
2620 }
2621 }
2622 }
2623
// Vertical-identity path (H_DCT / H_ADST / H_FLIPADST): a real row
// transform runs per 8-row band, then the identity column transform
// scales the band and adds it to the predictor.
static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m128i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Only 8-high bands containing nonzero coefficients need processing.
  const int buf_size_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];

  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_h_div8; i++) {
    __m128i buf0[64];
    const int32_t *input_row = input + i * input_stride * 8;
    // At most 32 (4 strips of 8) columns of coefficients are nonzero.
    for (int j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) {
      __m128i *buf0_cur = buf0 + j * 8;
      load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
      transpose_16bit_8x8(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0, cos_bit_row);
    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    __m128i *_buf1 = buf1;
    if (lr_flip) {
      // Mirror the strips horizontally for FLIPADST row transforms.
      for (int j = 0; j < buf_size_w_div8; ++j) {
        __m128i temp[8];
        flip_buf_sse2(buf0 + 8 * j, temp, 8);
        transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
      }
    }

    for (int j = 0; j < buf_size_w_div8; ++j) {
      iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
                              buf1 + j * 8, shift[1], 8, txh_idx);
    }
  }
}
2680
2681 // for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
lowbd_inv_txfm2d_add_universe_ssse3(const int32_t * input,uint8_t * output,int stride,TX_TYPE tx_type,TX_SIZE tx_size,int eob)2682 static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
2683 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
2684 TX_SIZE tx_size, int eob) {
2685 switch (tx_type) {
2686 case DCT_DCT:
2687 lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
2688 tx_size, eob);
2689 break;
2690 case IDTX:
2691 lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
2692 break;
2693 case V_DCT:
2694 case V_ADST:
2695 case V_FLIPADST:
2696 lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
2697 tx_size, eob);
2698 break;
2699 case H_DCT:
2700 case H_ADST:
2701 case H_FLIPADST:
2702 lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
2703 tx_size, eob);
2704 break;
2705 default:
2706 lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
2707 tx_size, eob);
2708 break;
2709 }
2710 }
2711
// 2D inverse transform + reconstruction for 4x8 blocks. Rectangular 1:2
// shape, so the coefficients are pre-scaled by 1/Sqrt2.
static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[8];
  const TX_SIZE tx_size = TX_4X8;
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
  transpose_16bit_4x8(buf, buf);
  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
  row_txfm(buf, buf, cos_bit_row);
  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
  if (lr_flip) {
    // Mirror the columns for FLIPADST-style horizontal transforms.
    __m128i temp[4];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_8x4(temp, buf);
  } else {
    transpose_16bit_8x4(buf, buf);
  }
  col_txfm(buf, buf, cos_bit_col);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2751
// 2D inverse transform + reconstruction for 8x4 blocks. Rectangular 2:1
// shape, so the coefficients are pre-scaled by 1/Sqrt2.
static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, TX_SIZE tx_size_,
                                           int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[8];
  const TX_SIZE tx_size = TX_8X4;
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
  transpose_16bit_8x4(buf, buf);
  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
  row_txfm(buf, buf, cos_bit_row);
  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
  if (lr_flip) {
    // Mirror the columns for FLIPADST-style horizontal transforms.
    __m128i temp[8];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_4x8(temp, buf);
  } else {
    transpose_16bit_4x8(buf, buf);
  }
  col_txfm(buf, buf, cos_bit_col);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2791
// 2D inverse transform + reconstruction for 4x16 blocks. The 16 rows are
// processed as two 8-row halves through the row transform, then a single
// 16-point column transform runs over the whole block.
static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input,
                                            uint8_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size_,
                                            int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[16];
  const TX_SIZE tx_size = TX_4X16;
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  const int row_one_loop = 8;
  for (int i = 0; i < 2; ++i) {
    const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
    __m128i *buf_cur = buf + i * row_one_loop;
    load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
                                  row_one_loop);
    transpose_16bit_4x8(buf_cur, buf_cur);
    row_txfm(buf_cur, buf_cur, cos_bit_row);
    round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
    if (lr_flip) {
      // Mirror the columns for FLIPADST-style horizontal transforms.
      __m128i temp[8];
      flip_buf_sse2(buf_cur, temp, txfm_size_col);
      transpose_16bit_8x4(temp, buf_cur);
    } else {
      transpose_16bit_8x4(buf_cur, buf_cur);
    }
  }
  col_txfm(buf, buf, cos_bit_col);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2837
// 2D inverse transform + reconstruction for 16x4 blocks. One 16-point row
// transform runs over the whole block, then the 16 columns are processed
// as two 8-wide halves through the column transform and write-out.
static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input,
                                            uint8_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size_,
                                            int eob) {
  (void)tx_size_;
  (void)eob;
  __m128i buf[16];
  const TX_SIZE tx_size = TX_16X4;
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int row_one_loop = 8;
  for (int i = 0; i < buf_size_w_div8; ++i) {
    const int32_t *input_cur = input + i * row_one_loop;
    __m128i *buf_cur = buf + i * row_one_loop;
    load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur,
                               txfm_size_row);
    transpose_16bit_8x4(buf_cur, buf_cur);
  }
  row_txfm(buf, buf, cos_bit_row);
  round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
  if (lr_flip) {
    // Mirror the columns for FLIPADST-style horizontal transforms.
    __m128i temp[16];
    flip_buf_sse2(buf, temp, 16);
    transpose_16bit_4x8(temp, buf);
    transpose_16bit_4x8(temp + 8, buf + 8);
  } else {
    transpose_16bit_4x8(buf, buf);
    transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
  }
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
    round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
  }
  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
  lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
}
2888
// Entry point for the low-bitdepth 2D inverse transform + reconstruction.
// The five smallest block shapes have dedicated 4-wide implementations;
// everything else goes through the universe dispatcher.
void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type,
                                    TX_SIZE tx_size, int eob) {
  if (tx_size == TX_4X4) {
    lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
                                   eob);
  } else if (tx_size == TX_4X8) {
    lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
                                   eob);
  } else if (tx_size == TX_8X4) {
    lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
                                   eob);
  } else if (tx_size == TX_4X16) {
    lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
                                    eob);
  } else if (tx_size == TX_16X4) {
    lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
                                    eob);
  } else {
    lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
                                        tx_size, eob);
  }
}
av1_inv_txfm_add_ssse3(const tran_low_t * dqcoeff,uint8_t * dst,int stride,const TxfmParam * txfm_param)2919 void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
2920 const TxfmParam *txfm_param) {
2921 const TX_TYPE tx_type = txfm_param->tx_type;
2922 if (!txfm_param->lossless) {
2923 switch (txfm_param->tx_size) {
2924 case TX_4X16:
2925 case TX_16X4:
2926 // TODO(http://crbug.com/aomedia/2350): the ssse3 versions cause test
2927 // vector mismatches.
2928 av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
2929 break;
2930 default:
2931 av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
2932 txfm_param->tx_size, txfm_param->eob);
2933 break;
2934 }
2935 } else {
2936 av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
2937 }
2938 }
2939