1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #include <assert.h>
12 #include <smmintrin.h> /* SSE4.1 */
13
14 #include "aom_dsp/txfm_common.h"
15 #include "aom_dsp/x86/transpose_sse2.h"
16 #include "aom_dsp/x86/txfm_common_sse2.h"
17 #include "aom_ports/mem.h"
18 #include "av1/common/av1_txfm.h"
19 #include "av1/common/x86/highbd_txfm_utility_sse4.h"
20 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
21 #include "av1/encoder/x86/av1_txfm1d_sse4.h"
22 #include "config/aom_config.h"
23 #include "config/av1_rtcd.h"
24
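// 4x4 Walsh-Hadamard transform used by the lossless path. Each of the two
// loop passes below applies the in-place 1D WHT butterfly
//   a1 += b1; d1 -= c1; e1 = (a1 - d1) >> 1;
//   b1 = e1 - b1; c1 = e1 - c1; a1 -= c1; d1 += b1;
// to four 32-bit lanes at once and then transposes the 4x4 block, so the
// second pass works on the transposed data. The final left shift by
// UNIT_QUANT_SHIFT applies the unit-quantizer scaling.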
void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) {
26 __m128i in[4];
27 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
28 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
29 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
30 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
31
32 // Convert to int32_t.
33 __m128i op[4];
34 op[0] = _mm_cvtepi16_epi32(in[0]);
35 op[1] = _mm_cvtepi16_epi32(in[1]);
36 op[2] = _mm_cvtepi16_epi32(in[2]);
37 op[3] = _mm_cvtepi16_epi32(in[3]);
38
39 for (int i = 0; i < 2; ++i) {
40 __m128i a1 = op[0];
41 __m128i b1 = op[1];
42 __m128i c1 = op[2];
43 __m128i d1 = op[3];
44 __m128i e1;
45
46 a1 = _mm_add_epi32(a1, b1); // a1 += b1
47 d1 = _mm_sub_epi32(d1, c1); // d1 = d1 - c1
48 e1 = _mm_sub_epi32(a1, d1); // e1 = (a1 - d1) >> 1
49 e1 = _mm_srai_epi32(e1, 1);
50 b1 = _mm_sub_epi32(e1, b1); // b1 = e1 - b1
51 c1 = _mm_sub_epi32(e1, c1); // c1 = e1 - c1
52 a1 = _mm_sub_epi32(a1, c1); // a1 -= c1
53 d1 = _mm_add_epi32(d1, b1); // d1 += b1
54
55 op[0] = a1;
56 op[1] = c1;
57 op[2] = d1;
58 op[3] = b1;
59
60 transpose_32bit_4x4(op, op);
61 }
62
63 op[0] = _mm_slli_epi32(op[0], UNIT_QUANT_SHIFT);
64 op[1] = _mm_slli_epi32(op[1], UNIT_QUANT_SHIFT);
65 op[2] = _mm_slli_epi32(op[2], UNIT_QUANT_SHIFT);
66 op[3] = _mm_slli_epi32(op[3], UNIT_QUANT_SHIFT);
67
68 _mm_storeu_si128((__m128i *)(output + 0), op[0]);
69 _mm_storeu_si128((__m128i *)(output + 4), op[1]);
70 _mm_storeu_si128((__m128i *)(output + 8), op[2]);
71 _mm_storeu_si128((__m128i *)(output + 12), op[3]);
72 }
73
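// The low bit-depth version above already widens the input to 32-bit lanes
// before transforming, so it can be reused unchanged for high bit depth.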
void av1_highbd_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output,
                               int stride) {
76 av1_fwht4x4_sse4_1(input, output, stride);
77 }
78
static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                   int stride, int flipud, int fliplr,
                                   int shift) {
82 if (!flipud) {
83 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
84 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
85 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
86 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
87 } else {
88 in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
89 in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
90 in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
91 in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
92 }
93
94 if (fliplr) {
95 in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
96 in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
97 in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
98 in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
99 }
100
101 in[0] = _mm_cvtepi16_epi32(in[0]);
102 in[1] = _mm_cvtepi16_epi32(in[1]);
103 in[2] = _mm_cvtepi16_epi32(in[2]);
104 in[3] = _mm_cvtepi16_epi32(in[3]);
105
106 in[0] = _mm_slli_epi32(in[0], shift);
107 in[1] = _mm_slli_epi32(in[1], shift);
108 in[2] = _mm_slli_epi32(in[2], shift);
109 in[3] = _mm_slli_epi32(in[3], shift);
110 }
111
112 // We only use stage-2 bit;
113 // shift[0] is used in load_buffer_4x4()
114 // shift[1] is used in txfm_func_col()
115 // shift[2] is used in txfm_func_row()
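//
// The two butterfly flavors referenced in the comments below, in scalar form
// (with rnd = 1 << (bit - 1)):
//   type0: out0 = (w * in0 + w * in1 + rnd) >> bit
//          out1 = (w * in0 - w * in1 + rnd) >> bit
//   type1: out0 = (w0 * in0 + w1 * in1 + rnd) >> bit
//          out1 = (w0 * in1 - w1 * in0 + rnd) >> bit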
static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit,
                           const int num_col) {
118 const int32_t *cospi = cospi_arr(bit);
119 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
120 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
121 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
122 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
123 __m128i s0, s1, s2, s3;
124 __m128i u0, u1, u2, u3;
125 __m128i v0, v1, v2, v3;
126
127 int endidx = 3 * num_col;
128 s0 = _mm_add_epi32(in[0], in[endidx]);
129 s3 = _mm_sub_epi32(in[0], in[endidx]);
130 endidx -= num_col;
131 s1 = _mm_add_epi32(in[num_col], in[endidx]);
132 s2 = _mm_sub_epi32(in[num_col], in[endidx]);
133
134 // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
135 u0 = _mm_mullo_epi32(s0, cospi32);
136 u1 = _mm_mullo_epi32(s1, cospi32);
137 u2 = _mm_add_epi32(u0, u1);
138 v0 = _mm_sub_epi32(u0, u1);
139
140 u3 = _mm_add_epi32(u2, rnding);
141 v1 = _mm_add_epi32(v0, rnding);
142
143 u0 = _mm_srai_epi32(u3, bit);
144 u2 = _mm_srai_epi32(v1, bit);
145
146 // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
147 v0 = _mm_mullo_epi32(s2, cospi48);
148 v1 = _mm_mullo_epi32(s3, cospi16);
149 v2 = _mm_add_epi32(v0, v1);
150
151 v3 = _mm_add_epi32(v2, rnding);
152 u1 = _mm_srai_epi32(v3, bit);
153
154 v0 = _mm_mullo_epi32(s2, cospi16);
155 v1 = _mm_mullo_epi32(s3, cospi48);
156 v2 = _mm_sub_epi32(v1, v0);
157
158 v3 = _mm_add_epi32(v2, rnding);
159 u3 = _mm_srai_epi32(v3, bit);
160
161 // Note: shift[1] and shift[2] are zeros
162
163 // Transpose 4x4 32-bit
164 v0 = _mm_unpacklo_epi32(u0, u1);
165 v1 = _mm_unpackhi_epi32(u0, u1);
166 v2 = _mm_unpacklo_epi32(u2, u3);
167 v3 = _mm_unpackhi_epi32(u2, u3);
168
169 out[0] = _mm_unpacklo_epi64(v0, v2);
170 out[1] = _mm_unpackhi_epi64(v0, v2);
171 out[2] = _mm_unpacklo_epi64(v1, v3);
172 out[3] = _mm_unpackhi_epi64(v1, v3);
173 }
174
static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
176 _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
177 _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
178 _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
179 _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
180 }
181
static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit,
                            const int num_col) {
184 const int32_t *sinpi = sinpi_arr(bit);
185 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
186 const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
187 const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
188 const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
189 const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
190 __m128i t;
191 __m128i s0, s1, s2, s3, s4, s5, s6, s7;
192 __m128i x0, x1, x2, x3;
193 __m128i u0, u1, u2, u3;
194 __m128i v0, v1, v2, v3;
195
196 int idx = 0 * num_col;
197 s0 = _mm_mullo_epi32(in[idx], sinpi1);
198 s1 = _mm_mullo_epi32(in[idx], sinpi4);
199 t = _mm_add_epi32(in[idx], in[idx + num_col]);
200 idx += num_col;
201 s2 = _mm_mullo_epi32(in[idx], sinpi2);
202 s3 = _mm_mullo_epi32(in[idx], sinpi1);
203 idx += num_col;
204 s4 = _mm_mullo_epi32(in[idx], sinpi3);
205 idx += num_col;
206 s5 = _mm_mullo_epi32(in[idx], sinpi4);
207 s6 = _mm_mullo_epi32(in[idx], sinpi2);
208 s7 = _mm_sub_epi32(t, in[idx]);
209
210 t = _mm_add_epi32(s0, s2);
211 x0 = _mm_add_epi32(t, s5);
212 x1 = _mm_mullo_epi32(s7, sinpi3);
213 t = _mm_sub_epi32(s1, s3);
214 x2 = _mm_add_epi32(t, s6);
215 x3 = s4;
216
217 s0 = _mm_add_epi32(x0, x3);
218 s1 = x1;
219 s2 = _mm_sub_epi32(x2, x3);
220 t = _mm_sub_epi32(x2, x0);
221 s3 = _mm_add_epi32(t, x3);
222
223 u0 = _mm_add_epi32(s0, rnding);
224 u0 = _mm_srai_epi32(u0, bit);
225
226 u1 = _mm_add_epi32(s1, rnding);
227 u1 = _mm_srai_epi32(u1, bit);
228
229 u2 = _mm_add_epi32(s2, rnding);
230 u2 = _mm_srai_epi32(u2, bit);
231
232 u3 = _mm_add_epi32(s3, rnding);
233 u3 = _mm_srai_epi32(u3, bit);
234
235 v0 = _mm_unpacklo_epi32(u0, u1);
236 v1 = _mm_unpackhi_epi32(u0, u1);
237 v2 = _mm_unpacklo_epi32(u2, u3);
238 v3 = _mm_unpackhi_epi32(u2, u3);
239
240 out[0] = _mm_unpacklo_epi64(v0, v2);
241 out[1] = _mm_unpackhi_epi64(v0, v2);
242 out[2] = _mm_unpacklo_epi64(v1, v3);
243 out[3] = _mm_unpackhi_epi64(v1, v3);
244 }
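// 4-point identity transform: scale each input by sqrt(2) in fixed point,
// i.e. out = (in * NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits,
// then transpose the 4x4 block so the result is laid out like the other
// 1D kernels above.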
static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
246 (void)bit;
247 __m128i fact = _mm_set1_epi32(NewSqrt2);
248 __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
249 __m128i a_low;
250 __m128i v[4];
251
252 for (int i = 0; i < 4; i++) {
253 a_low = _mm_mullo_epi32(in[i * col_num], fact);
254 a_low = _mm_add_epi32(a_low, offset);
255 out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
256 }
257
258 // Transpose for 4x4
259 v[0] = _mm_unpacklo_epi32(out[0], out[1]);
260 v[1] = _mm_unpackhi_epi32(out[0], out[1]);
261 v[2] = _mm_unpacklo_epi32(out[2], out[3]);
262 v[3] = _mm_unpackhi_epi32(out[2], out[3]);
263
264 out[0] = _mm_unpacklo_epi64(v[0], v[2]);
265 out[1] = _mm_unpackhi_epi64(v[0], v[2]);
266 out[2] = _mm_unpacklo_epi64(v[1], v[3]);
267 out[3] = _mm_unpackhi_epi64(v[1], v[3]);
268 }
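// 2D 4x4 forward transform: load (applying shift[0] and any vertical or
// horizontal flip for the FLIPADST variants), run the column transform, then
// the row transform, and store 16 32-bit coefficients. Each 1D kernel above
// already transposes its output, so no separate transpose step is needed.
// shift[1] and shift[2] are zero for TX_4X4, so there is no intermediate
// rounding.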
void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
                               int input_stride, TX_TYPE tx_type, int bd) {
271 __m128i in[4];
272 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
273 const int txw_idx = get_txw_idx(TX_4X4);
274 const int txh_idx = get_txh_idx(TX_4X4);
275
276 switch (tx_type) {
277 case DCT_DCT:
278 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
279 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
280 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
281 write_buffer_4x4(in, coeff);
282 break;
283 case ADST_DCT:
284 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
285 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
286 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
287 write_buffer_4x4(in, coeff);
288 break;
289 case DCT_ADST:
290 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
291 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
292 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
293 write_buffer_4x4(in, coeff);
294 break;
295 case ADST_ADST:
296 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
297 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
298 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
299 write_buffer_4x4(in, coeff);
300 break;
301 case FLIPADST_DCT:
302 load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
303 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
304 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
305 write_buffer_4x4(in, coeff);
306 break;
307 case DCT_FLIPADST:
308 load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
309 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
310 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
311 write_buffer_4x4(in, coeff);
312 break;
313 case FLIPADST_FLIPADST:
314 load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]);
315 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
316 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
317 write_buffer_4x4(in, coeff);
318 break;
319 case ADST_FLIPADST:
320 load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
321 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
322 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
323 write_buffer_4x4(in, coeff);
324 break;
325 case FLIPADST_ADST:
326 load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
327 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
328 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
329 write_buffer_4x4(in, coeff);
330 break;
331 case IDTX:
332 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
333 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
334 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
335 write_buffer_4x4(in, coeff);
336 break;
337 case V_DCT:
338 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
339 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
340 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
341 write_buffer_4x4(in, coeff);
342 break;
343 case H_DCT:
344 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
345 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
346 fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
347 write_buffer_4x4(in, coeff);
348 break;
349 case V_ADST:
350 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
351 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
352 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
353 write_buffer_4x4(in, coeff);
354 break;
355 case H_ADST:
356 load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
357 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
358 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
359 write_buffer_4x4(in, coeff);
360 break;
361 case V_FLIPADST:
362 load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
363 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
364 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
365 write_buffer_4x4(in, coeff);
366 break;
367 case H_FLIPADST:
368 load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
369 idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
370 fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
371 write_buffer_4x4(in, coeff);
372 break;
373 default: assert(0);
374 }
375 (void)bd;
376 }
377
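// Load an 8x8 block of int16 residuals into 16 __m128i registers of 32-bit
// lanes: in[2 * r] holds columns 0..3 of row r and in[2 * r + 1] holds
// columns 4..7, optionally flipped vertically/horizontally, then left-shifted
// by 'shift'.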
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                   int stride, int flipud, int fliplr,
                                   int shift) {
381 __m128i u;
382 if (!flipud) {
383 in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
384 in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
385 in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
386 in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
387 in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
388 in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
389 in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
390 in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
391 } else {
392 in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
393 in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
394 in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
395 in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
396 in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
397 in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
398 in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
399 in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
400 }
401
402 if (fliplr) {
403 in[0] = mm_reverse_epi16(in[0]);
404 in[1] = mm_reverse_epi16(in[1]);
405 in[2] = mm_reverse_epi16(in[2]);
406 in[3] = mm_reverse_epi16(in[3]);
407 in[4] = mm_reverse_epi16(in[4]);
408 in[5] = mm_reverse_epi16(in[5]);
409 in[6] = mm_reverse_epi16(in[6]);
410 in[7] = mm_reverse_epi16(in[7]);
411 }
412
413 u = _mm_unpackhi_epi64(in[4], in[4]);
414 in[8] = _mm_cvtepi16_epi32(in[4]);
415 in[9] = _mm_cvtepi16_epi32(u);
416
417 u = _mm_unpackhi_epi64(in[5], in[5]);
418 in[10] = _mm_cvtepi16_epi32(in[5]);
419 in[11] = _mm_cvtepi16_epi32(u);
420
421 u = _mm_unpackhi_epi64(in[6], in[6]);
422 in[12] = _mm_cvtepi16_epi32(in[6]);
423 in[13] = _mm_cvtepi16_epi32(u);
424
425 u = _mm_unpackhi_epi64(in[7], in[7]);
426 in[14] = _mm_cvtepi16_epi32(in[7]);
427 in[15] = _mm_cvtepi16_epi32(u);
428
429 u = _mm_unpackhi_epi64(in[3], in[3]);
430 in[6] = _mm_cvtepi16_epi32(in[3]);
431 in[7] = _mm_cvtepi16_epi32(u);
432
433 u = _mm_unpackhi_epi64(in[2], in[2]);
434 in[4] = _mm_cvtepi16_epi32(in[2]);
435 in[5] = _mm_cvtepi16_epi32(u);
436
437 u = _mm_unpackhi_epi64(in[1], in[1]);
438 in[2] = _mm_cvtepi16_epi32(in[1]);
439 in[3] = _mm_cvtepi16_epi32(u);
440
441 u = _mm_unpackhi_epi64(in[0], in[0]);
442 in[0] = _mm_cvtepi16_epi32(in[0]);
443 in[1] = _mm_cvtepi16_epi32(u);
444
445 in[0] = _mm_slli_epi32(in[0], shift);
446 in[1] = _mm_slli_epi32(in[1], shift);
447 in[2] = _mm_slli_epi32(in[2], shift);
448 in[3] = _mm_slli_epi32(in[3], shift);
449 in[4] = _mm_slli_epi32(in[4], shift);
450 in[5] = _mm_slli_epi32(in[5], shift);
451 in[6] = _mm_slli_epi32(in[6], shift);
452 in[7] = _mm_slli_epi32(in[7], shift);
453
454 in[8] = _mm_slli_epi32(in[8], shift);
455 in[9] = _mm_slli_epi32(in[9], shift);
456 in[10] = _mm_slli_epi32(in[10], shift);
457 in[11] = _mm_slli_epi32(in[11], shift);
458 in[12] = _mm_slli_epi32(in[12], shift);
459 in[13] = _mm_slli_epi32(in[13], shift);
460 in[14] = _mm_slli_epi32(in[14], shift);
461 in[15] = _mm_slli_epi32(in[15], shift);
462 }
463
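// Mid-pass rounding between the column and row transforms:
//   in[i] = (in[i] + (1 << (shift - 1))) >> shift
// applied to all 16 registers of an 8x8 block. Callers pass -shift[1], since
// the stage-1 shift in the forward-transform shift tables is expected to be
// non-positive here.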
static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
465 const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
466
467 in[0] = _mm_add_epi32(in[0], rounding);
468 in[1] = _mm_add_epi32(in[1], rounding);
469 in[2] = _mm_add_epi32(in[2], rounding);
470 in[3] = _mm_add_epi32(in[3], rounding);
471 in[4] = _mm_add_epi32(in[4], rounding);
472 in[5] = _mm_add_epi32(in[5], rounding);
473 in[6] = _mm_add_epi32(in[6], rounding);
474 in[7] = _mm_add_epi32(in[7], rounding);
475 in[8] = _mm_add_epi32(in[8], rounding);
476 in[9] = _mm_add_epi32(in[9], rounding);
477 in[10] = _mm_add_epi32(in[10], rounding);
478 in[11] = _mm_add_epi32(in[11], rounding);
479 in[12] = _mm_add_epi32(in[12], rounding);
480 in[13] = _mm_add_epi32(in[13], rounding);
481 in[14] = _mm_add_epi32(in[14], rounding);
482 in[15] = _mm_add_epi32(in[15], rounding);
483
484 in[0] = _mm_srai_epi32(in[0], shift);
485 in[1] = _mm_srai_epi32(in[1], shift);
486 in[2] = _mm_srai_epi32(in[2], shift);
487 in[3] = _mm_srai_epi32(in[3], shift);
488 in[4] = _mm_srai_epi32(in[4], shift);
489 in[5] = _mm_srai_epi32(in[5], shift);
490 in[6] = _mm_srai_epi32(in[6], shift);
491 in[7] = _mm_srai_epi32(in[7], shift);
492 in[8] = _mm_srai_epi32(in[8], shift);
493 in[9] = _mm_srai_epi32(in[9], shift);
494 in[10] = _mm_srai_epi32(in[10], shift);
495 in[11] = _mm_srai_epi32(in[11], shift);
496 in[12] = _mm_srai_epi32(in[12], shift);
497 in[13] = _mm_srai_epi32(in[13], shift);
498 in[14] = _mm_srai_epi32(in[14], shift);
499 in[15] = _mm_srai_epi32(in[15], shift);
500 }
501
static INLINE void col_txfm_4x8_rounding(__m128i *in, int shift) {
503 const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
504
505 in[0] = _mm_add_epi32(in[0], rounding);
506 in[1] = _mm_add_epi32(in[1], rounding);
507 in[2] = _mm_add_epi32(in[2], rounding);
508 in[3] = _mm_add_epi32(in[3], rounding);
509 in[4] = _mm_add_epi32(in[4], rounding);
510 in[5] = _mm_add_epi32(in[5], rounding);
511 in[6] = _mm_add_epi32(in[6], rounding);
512 in[7] = _mm_add_epi32(in[7], rounding);
513
514 in[0] = _mm_srai_epi32(in[0], shift);
515 in[1] = _mm_srai_epi32(in[1], shift);
516 in[2] = _mm_srai_epi32(in[2], shift);
517 in[3] = _mm_srai_epi32(in[3], shift);
518 in[4] = _mm_srai_epi32(in[4], shift);
519 in[5] = _mm_srai_epi32(in[5], shift);
520 in[6] = _mm_srai_epi32(in[6], shift);
521 in[7] = _mm_srai_epi32(in[7], shift);
522 }
523
static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
525 _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
526 _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
527 _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
528 _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
529
530 _mm_store_si128((__m128i *)(output + 4 * 4), res[4]);
531 _mm_store_si128((__m128i *)(output + 5 * 4), res[5]);
532 _mm_store_si128((__m128i *)(output + 6 * 4), res[6]);
533 _mm_store_si128((__m128i *)(output + 7 * 4), res[7]);
534
535 _mm_store_si128((__m128i *)(output + 8 * 4), res[8]);
536 _mm_store_si128((__m128i *)(output + 9 * 4), res[9]);
537 _mm_store_si128((__m128i *)(output + 10 * 4), res[10]);
538 _mm_store_si128((__m128i *)(output + 11 * 4), res[11]);
539
540 _mm_store_si128((__m128i *)(output + 12 * 4), res[12]);
541 _mm_store_si128((__m128i *)(output + 13 * 4), res[13]);
542 _mm_store_si128((__m128i *)(output + 14 * 4), res[14]);
543 _mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
544 }
545
static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output,
                                     const int stride) {
548 _mm_storeu_si128((__m128i *)(output), res[0]);
549 _mm_storeu_si128((__m128i *)(output + 4), res[1]);
550 _mm_storeu_si128((__m128i *)(output + stride), res[2]);
551 _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]);
552
553 _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]);
554 _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]);
555 _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]);
556 _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]);
557
558 _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]);
559 _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]);
560 _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]);
561 _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]);
562
563 _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]);
564 _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]);
565 _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]);
566 _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]);
567 }
568
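// 8-point forward DCT on a group of 4 columns held in one __m128i per row.
// 'col_num' is the register stride between consecutive rows, so
// fdct8x8_sse4_1() below can run this kernel twice (offsets 0 and 1) to cover
// the left and right halves of an 8x8 block.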
static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit,
                           const int col_num) {
571 const int32_t *cospi = cospi_arr(bit);
572 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
573 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
574 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
575 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
576 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
577 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
578 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
579 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
580 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
581 __m128i u[8], v[8];
582
583 int startidx = 0 * col_num;
584 int endidx = 7 * col_num;
585 // Even 8 points 0, 2, ..., 14
586 // stage 0
587 // stage 1
588 u[0] = _mm_add_epi32(in[startidx], in[endidx]);
589 v[7] = _mm_sub_epi32(in[startidx], in[endidx]); // v[7]
590 startidx += col_num;
591 endidx -= col_num;
592 u[1] = _mm_add_epi32(in[startidx], in[endidx]);
593 u[6] = _mm_sub_epi32(in[startidx], in[endidx]);
594 startidx += col_num;
595 endidx -= col_num;
596 u[2] = _mm_add_epi32(in[startidx], in[endidx]);
597 u[5] = _mm_sub_epi32(in[startidx], in[endidx]);
598 startidx += col_num;
599 endidx -= col_num;
600 u[3] = _mm_add_epi32(in[startidx], in[endidx]);
601 v[4] = _mm_sub_epi32(in[startidx], in[endidx]); // v[4]
602
603 // stage 2
604 v[0] = _mm_add_epi32(u[0], u[3]);
605 v[3] = _mm_sub_epi32(u[0], u[3]);
606 v[1] = _mm_add_epi32(u[1], u[2]);
607 v[2] = _mm_sub_epi32(u[1], u[2]);
608
609 v[5] = _mm_mullo_epi32(u[5], cospim32);
610 v[6] = _mm_mullo_epi32(u[6], cospi32);
611 v[5] = _mm_add_epi32(v[5], v[6]);
612 v[5] = _mm_add_epi32(v[5], rnding);
613 v[5] = _mm_srai_epi32(v[5], bit);
614
615 u[0] = _mm_mullo_epi32(u[5], cospi32);
616 v[6] = _mm_mullo_epi32(u[6], cospim32);
617 v[6] = _mm_sub_epi32(u[0], v[6]);
618 v[6] = _mm_add_epi32(v[6], rnding);
619 v[6] = _mm_srai_epi32(v[6], bit);
620
621 // stage 3
622 // type 0
623 v[0] = _mm_mullo_epi32(v[0], cospi32);
624 v[1] = _mm_mullo_epi32(v[1], cospi32);
625 u[0] = _mm_add_epi32(v[0], v[1]);
626 u[0] = _mm_add_epi32(u[0], rnding);
627 u[0] = _mm_srai_epi32(u[0], bit);
628
629 u[1] = _mm_sub_epi32(v[0], v[1]);
630 u[1] = _mm_add_epi32(u[1], rnding);
631 u[1] = _mm_srai_epi32(u[1], bit);
632
633 // type 1
634 v[0] = _mm_mullo_epi32(v[2], cospi48);
635 v[1] = _mm_mullo_epi32(v[3], cospi16);
636 u[2] = _mm_add_epi32(v[0], v[1]);
637 u[2] = _mm_add_epi32(u[2], rnding);
638 u[2] = _mm_srai_epi32(u[2], bit);
639
640 v[0] = _mm_mullo_epi32(v[2], cospi16);
641 v[1] = _mm_mullo_epi32(v[3], cospi48);
642 u[3] = _mm_sub_epi32(v[1], v[0]);
643 u[3] = _mm_add_epi32(u[3], rnding);
644 u[3] = _mm_srai_epi32(u[3], bit);
645
646 u[4] = _mm_add_epi32(v[4], v[5]);
647 u[5] = _mm_sub_epi32(v[4], v[5]);
648 u[6] = _mm_sub_epi32(v[7], v[6]);
649 u[7] = _mm_add_epi32(v[7], v[6]);
650
651 // stage 4
652 // stage 5
653 v[0] = _mm_mullo_epi32(u[4], cospi56);
654 v[1] = _mm_mullo_epi32(u[7], cospi8);
655 v[0] = _mm_add_epi32(v[0], v[1]);
656 v[0] = _mm_add_epi32(v[0], rnding);
657 out[1 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[4]
658
659 v[0] = _mm_mullo_epi32(u[4], cospi8);
660 v[1] = _mm_mullo_epi32(u[7], cospi56);
661 v[0] = _mm_sub_epi32(v[1], v[0]);
662 v[0] = _mm_add_epi32(v[0], rnding);
663 out[7 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[7]
664
665 v[0] = _mm_mullo_epi32(u[5], cospi24);
666 v[1] = _mm_mullo_epi32(u[6], cospi40);
667 v[0] = _mm_add_epi32(v[0], v[1]);
668 v[0] = _mm_add_epi32(v[0], rnding);
669 out[5 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[5]
670
671 v[0] = _mm_mullo_epi32(u[5], cospi40);
672 v[1] = _mm_mullo_epi32(u[6], cospi24);
673 v[0] = _mm_sub_epi32(v[1], v[0]);
674 v[0] = _mm_add_epi32(v[0], rnding);
675 out[3 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[6]
676
677 out[0 * col_num] = u[0]; // buf0[0]
678 out[4 * col_num] = u[1]; // buf0[1]
679 out[2 * col_num] = u[2]; // buf0[2]
680 out[6 * col_num] = u[3]; // buf0[3]
681 }
682
static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit,
                           const int col_num) {
685 fdct4x8_sse4_1(in, out, bit, col_num);
686 fdct4x8_sse4_1(in + 1, out + 1, bit, col_num);
687 }
688
static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
                            const int col_num) {
691 const int32_t *cospi = cospi_arr(bit);
692 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
693 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
694 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
695 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
696 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
697 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
698 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
699 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
700 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
701 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
702 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
703 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
704 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
705 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
706 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
707 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
708 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
709 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
710 const __m128i zero = _mm_setzero_si128();
711 __m128i u0, u1, u2, u3, u4, u5, u6, u7;
712 __m128i v0, v1, v2, v3, v4, v5, v6, v7;
713 __m128i x, y;
714 int col;
715
  // Note:
  //  Even columns: 0, 2, ..., 14
  //  Odd columns: 1, 3, ..., 15
  //  One even column plus one odd column makes up one row (8 coefficients);
  //  in total there are 8 rows (8x8).
721 for (col = 0; col < col_num; ++col) {
722 // stage 0
723 // stage 1
724 u0 = in[col_num * 0 + col];
725 u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]);
726 u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]);
727 u3 = in[col_num * 4 + col];
728 u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]);
729 u5 = in[col_num * 6 + col];
730 u6 = in[col_num * 2 + col];
731 u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]);
732
733 // stage 2
734 v0 = u0;
735 v1 = u1;
736
737 x = _mm_mullo_epi32(u2, cospi32);
738 y = _mm_mullo_epi32(u3, cospi32);
739 v2 = _mm_add_epi32(x, y);
740 v2 = _mm_add_epi32(v2, rnding);
741 v2 = _mm_srai_epi32(v2, bit);
742
743 v3 = _mm_sub_epi32(x, y);
744 v3 = _mm_add_epi32(v3, rnding);
745 v3 = _mm_srai_epi32(v3, bit);
746
747 v4 = u4;
748 v5 = u5;
749
750 x = _mm_mullo_epi32(u6, cospi32);
751 y = _mm_mullo_epi32(u7, cospi32);
752 v6 = _mm_add_epi32(x, y);
753 v6 = _mm_add_epi32(v6, rnding);
754 v6 = _mm_srai_epi32(v6, bit);
755
756 v7 = _mm_sub_epi32(x, y);
757 v7 = _mm_add_epi32(v7, rnding);
758 v7 = _mm_srai_epi32(v7, bit);
759
760 // stage 3
761 u0 = _mm_add_epi32(v0, v2);
762 u1 = _mm_add_epi32(v1, v3);
763 u2 = _mm_sub_epi32(v0, v2);
764 u3 = _mm_sub_epi32(v1, v3);
765 u4 = _mm_add_epi32(v4, v6);
766 u5 = _mm_add_epi32(v5, v7);
767 u6 = _mm_sub_epi32(v4, v6);
768 u7 = _mm_sub_epi32(v5, v7);
769
770 // stage 4
771 v0 = u0;
772 v1 = u1;
773 v2 = u2;
774 v3 = u3;
775
776 x = _mm_mullo_epi32(u4, cospi16);
777 y = _mm_mullo_epi32(u5, cospi48);
778 v4 = _mm_add_epi32(x, y);
779 v4 = _mm_add_epi32(v4, rnding);
780 v4 = _mm_srai_epi32(v4, bit);
781
782 x = _mm_mullo_epi32(u4, cospi48);
783 y = _mm_mullo_epi32(u5, cospim16);
784 v5 = _mm_add_epi32(x, y);
785 v5 = _mm_add_epi32(v5, rnding);
786 v5 = _mm_srai_epi32(v5, bit);
787
788 x = _mm_mullo_epi32(u6, cospim48);
789 y = _mm_mullo_epi32(u7, cospi16);
790 v6 = _mm_add_epi32(x, y);
791 v6 = _mm_add_epi32(v6, rnding);
792 v6 = _mm_srai_epi32(v6, bit);
793
794 x = _mm_mullo_epi32(u6, cospi16);
795 y = _mm_mullo_epi32(u7, cospi48);
796 v7 = _mm_add_epi32(x, y);
797 v7 = _mm_add_epi32(v7, rnding);
798 v7 = _mm_srai_epi32(v7, bit);
799
800 // stage 5
801 u0 = _mm_add_epi32(v0, v4);
802 u1 = _mm_add_epi32(v1, v5);
803 u2 = _mm_add_epi32(v2, v6);
804 u3 = _mm_add_epi32(v3, v7);
805 u4 = _mm_sub_epi32(v0, v4);
806 u5 = _mm_sub_epi32(v1, v5);
807 u6 = _mm_sub_epi32(v2, v6);
808 u7 = _mm_sub_epi32(v3, v7);
809
810 // stage 6
811 x = _mm_mullo_epi32(u0, cospi4);
812 y = _mm_mullo_epi32(u1, cospi60);
813 v0 = _mm_add_epi32(x, y);
814 v0 = _mm_add_epi32(v0, rnding);
815 v0 = _mm_srai_epi32(v0, bit);
816
817 x = _mm_mullo_epi32(u0, cospi60);
818 y = _mm_mullo_epi32(u1, cospim4);
819 v1 = _mm_add_epi32(x, y);
820 v1 = _mm_add_epi32(v1, rnding);
821 v1 = _mm_srai_epi32(v1, bit);
822
823 x = _mm_mullo_epi32(u2, cospi20);
824 y = _mm_mullo_epi32(u3, cospi44);
825 v2 = _mm_add_epi32(x, y);
826 v2 = _mm_add_epi32(v2, rnding);
827 v2 = _mm_srai_epi32(v2, bit);
828
829 x = _mm_mullo_epi32(u2, cospi44);
830 y = _mm_mullo_epi32(u3, cospim20);
831 v3 = _mm_add_epi32(x, y);
832 v3 = _mm_add_epi32(v3, rnding);
833 v3 = _mm_srai_epi32(v3, bit);
834
835 x = _mm_mullo_epi32(u4, cospi36);
836 y = _mm_mullo_epi32(u5, cospi28);
837 v4 = _mm_add_epi32(x, y);
838 v4 = _mm_add_epi32(v4, rnding);
839 v4 = _mm_srai_epi32(v4, bit);
840
841 x = _mm_mullo_epi32(u4, cospi28);
842 y = _mm_mullo_epi32(u5, cospim36);
843 v5 = _mm_add_epi32(x, y);
844 v5 = _mm_add_epi32(v5, rnding);
845 v5 = _mm_srai_epi32(v5, bit);
846
847 x = _mm_mullo_epi32(u6, cospi52);
848 y = _mm_mullo_epi32(u7, cospi12);
849 v6 = _mm_add_epi32(x, y);
850 v6 = _mm_add_epi32(v6, rnding);
851 v6 = _mm_srai_epi32(v6, bit);
852
853 x = _mm_mullo_epi32(u6, cospi12);
854 y = _mm_mullo_epi32(u7, cospim52);
855 v7 = _mm_add_epi32(x, y);
856 v7 = _mm_add_epi32(v7, rnding);
857 v7 = _mm_srai_epi32(v7, bit);
858
859 // stage 7
860 out[col_num * 0 + col] = v1;
861 out[col_num * 1 + col] = v6;
862 out[col_num * 2 + col] = v3;
863 out[col_num * 3 + col] = v4;
864 out[col_num * 4 + col] = v5;
865 out[col_num * 5 + col] = v2;
866 out[col_num * 6 + col] = v7;
867 out[col_num * 7 + col] = v0;
868 }
869 }
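// 8-point identity transform: the scale factor is exactly 2, so each lane is
// simply doubled (in + in); 'bit' is unused.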
static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
871 (void)bit;
872
873 for (int i = 0; i < col_num; i += 1) {
874 out[0 + 8 * i] = _mm_add_epi32(in[0 + 8 * i], in[0 + 8 * i]);
875 out[1 + 8 * i] = _mm_add_epi32(in[1 + 8 * i], in[1 + 8 * i]);
876 out[2 + 8 * i] = _mm_add_epi32(in[2 + 8 * i], in[2 + 8 * i]);
877 out[3 + 8 * i] = _mm_add_epi32(in[3 + 8 * i], in[3 + 8 * i]);
878 out[4 + 8 * i] = _mm_add_epi32(in[4 + 8 * i], in[4 + 8 * i]);
879 out[5 + 8 * i] = _mm_add_epi32(in[5 + 8 * i], in[5 + 8 * i]);
880 out[6 + 8 * i] = _mm_add_epi32(in[6 + 8 * i], in[6 + 8 * i]);
881 out[7 + 8 * i] = _mm_add_epi32(in[7 + 8 * i], in[7 + 8 * i]);
882 }
883 }
884 #if !CONFIG_REALTIME_ONLY
static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
886 (void)bit;
887 (void)col_num;
888 for (int j = 0; j < 2; j++) {
889 out[j + 8 * 0] = _mm_add_epi32(in[j + 8 * 0], in[j + 8 * 0]);
890 out[j + 8 * 1] = _mm_add_epi32(in[j + 8 * 1], in[j + 8 * 1]);
891 out[j + 8 * 2] = _mm_add_epi32(in[j + 8 * 2], in[j + 8 * 2]);
892 out[j + 8 * 3] = _mm_add_epi32(in[j + 8 * 3], in[j + 8 * 3]);
893 out[j + 8 * 4] = _mm_add_epi32(in[j + 8 * 4], in[j + 8 * 4]);
894 out[j + 8 * 5] = _mm_add_epi32(in[j + 8 * 5], in[j + 8 * 5]);
895 out[j + 8 * 6] = _mm_add_epi32(in[j + 8 * 6], in[j + 8 * 6]);
896 out[j + 8 * 7] = _mm_add_epi32(in[j + 8 * 7], in[j + 8 * 7]);
897 }
898 }
899 #endif
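// 2D 8x8 forward transform. Every case follows the same pattern:
//   load_buffer_8x8()   -> input shifted left by shift[0]
//   column transform    -> col_txfm_8x8_rounding() with -shift[1]
//   transpose_8x8()
//   row transform       -> transpose_8x8() back to row order
//   write_buffer_8x8()  -> 64 32-bit coefficients
// FLIPADST variants are handled by flipping the block at load time.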
void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
902 __m128i in[16], out[16];
903 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
904 const int txw_idx = get_txw_idx(TX_8X8);
905 const int txh_idx = get_txh_idx(TX_8X8);
906
907 switch (tx_type) {
908 case DCT_DCT:
909 load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
910 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
911 col_txfm_8x8_rounding(out, -shift[1]);
912 transpose_8x8(out, in);
913 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
914 transpose_8x8(out, in);
915 write_buffer_8x8(in, coeff);
916 break;
917 case ADST_DCT:
918 load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
919 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
920 col_txfm_8x8_rounding(out, -shift[1]);
921 transpose_8x8(out, in);
922 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
923 transpose_8x8(out, in);
924 write_buffer_8x8(in, coeff);
925 break;
926 case DCT_ADST:
927 load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
928 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
929 col_txfm_8x8_rounding(out, -shift[1]);
930 transpose_8x8(out, in);
931 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
932 transpose_8x8(out, in);
933 write_buffer_8x8(in, coeff);
934 break;
935 case ADST_ADST:
936 load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
937 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
938 col_txfm_8x8_rounding(out, -shift[1]);
939 transpose_8x8(out, in);
940 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
941 transpose_8x8(out, in);
942 write_buffer_8x8(in, coeff);
943 break;
944 case FLIPADST_DCT:
945 load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
946 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
947 col_txfm_8x8_rounding(out, -shift[1]);
948 transpose_8x8(out, in);
949 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
950 transpose_8x8(out, in);
951 write_buffer_8x8(in, coeff);
952 break;
953 case DCT_FLIPADST:
954 load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
955 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
956 col_txfm_8x8_rounding(out, -shift[1]);
957 transpose_8x8(out, in);
958 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
959 transpose_8x8(out, in);
960 write_buffer_8x8(in, coeff);
961 break;
962 case FLIPADST_FLIPADST:
963 load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
964 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
965 col_txfm_8x8_rounding(out, -shift[1]);
966 transpose_8x8(out, in);
967 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
968 transpose_8x8(out, in);
969 write_buffer_8x8(in, coeff);
970 break;
971 case ADST_FLIPADST:
972 load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
973 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
974 col_txfm_8x8_rounding(out, -shift[1]);
975 transpose_8x8(out, in);
976 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
977 transpose_8x8(out, in);
978 write_buffer_8x8(in, coeff);
979 break;
980 case FLIPADST_ADST:
981 load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
982 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
983 col_txfm_8x8_rounding(out, -shift[1]);
984 transpose_8x8(out, in);
985 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
986 transpose_8x8(out, in);
987 write_buffer_8x8(in, coeff);
988 break;
989 case IDTX:
990 load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
991 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
992 col_txfm_8x8_rounding(out, -shift[1]);
993 transpose_8x8(out, in);
994 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
995 transpose_8x8(out, in);
996 write_buffer_8x8(in, coeff);
997 break;
998 case V_DCT:
999 load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
1000 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
1001 col_txfm_8x8_rounding(out, -shift[1]);
1002 transpose_8x8(out, in);
1003 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
1004 transpose_8x8(out, in);
1005 write_buffer_8x8(in, coeff);
1006 break;
1007 case H_DCT:
1008 load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
1009 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
1010 col_txfm_8x8_rounding(out, -shift[1]);
1011 transpose_8x8(out, in);
1012 fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
1013 transpose_8x8(out, in);
1014 write_buffer_8x8(in, coeff);
1015 break;
1016 case V_ADST:
1017 load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
1018 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
1019 col_txfm_8x8_rounding(out, -shift[1]);
1020 transpose_8x8(out, in);
1021 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
1022 transpose_8x8(out, in);
1023 write_buffer_8x8(in, coeff);
1024 break;
1025 case H_ADST:
1026 load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
1027 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
1028 col_txfm_8x8_rounding(out, -shift[1]);
1029 transpose_8x8(out, in);
1030 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
1031 transpose_8x8(out, in);
1032 write_buffer_8x8(in, coeff);
1033 break;
1034 case V_FLIPADST:
1035 load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
1036 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
1037 col_txfm_8x8_rounding(out, -shift[1]);
1038 transpose_8x8(out, in);
1039 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
1040 transpose_8x8(out, in);
1041 write_buffer_8x8(in, coeff);
1042 break;
1043 case H_FLIPADST:
1044 load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
1045 idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
1046 col_txfm_8x8_rounding(out, -shift[1]);
1047 transpose_8x8(out, in);
1048 fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
1049 transpose_8x8(out, in);
1050 write_buffer_8x8(in, coeff);
1051 break;
1052 default: assert(0);
1053 }
1054 (void)bd;
1055 }
1056
1057 // Hybrid Transform 16x16
1058
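// Repack four 8x8 blocks (top-left, top-right, bottom-left, bottom-right,
// each stored as 16 registers by load_buffer_8x8()) into a row-major 16x16
// layout with four __m128i registers per row.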
static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
1060 int row_index = 0;
1061 int dst_index = 0;
1062 int src_index = 0;
1063
1064 // row 0, 1, .., 7
1065 do {
1066 out[dst_index] = in[src_index];
1067 out[dst_index + 1] = in[src_index + 1];
1068 out[dst_index + 2] = in[src_index + 16];
1069 out[dst_index + 3] = in[src_index + 17];
1070 dst_index += 4;
1071 src_index += 2;
1072 row_index += 1;
1073 } while (row_index < 8);
1074
1075 // row 8, 9, ..., 15
1076 src_index += 16;
1077 do {
1078 out[dst_index] = in[src_index];
1079 out[dst_index + 1] = in[src_index + 1];
1080 out[dst_index + 2] = in[src_index + 16];
1081 out[dst_index + 3] = in[src_index + 17];
1082 dst_index += 4;
1083 src_index += 2;
1084 row_index += 1;
1085 } while (row_index < 16);
1086 }
1087
static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
                                     int stride, int flipud, int fliplr,
                                     int shift) {
1091 __m128i in[64];
1092 // Load 4 8x8 blocks
1093 const int16_t *topL = input;
1094 const int16_t *topR = input + 8;
1095 const int16_t *botL = input + 8 * stride;
1096 const int16_t *botR = input + 8 * stride + 8;
1097
1098 const int16_t *tmp;
1099
1100 if (flipud) {
1101 // Swap left columns
1102 tmp = topL;
1103 topL = botL;
1104 botL = tmp;
1105 // Swap right columns
1106 tmp = topR;
1107 topR = botR;
1108 botR = tmp;
1109 }
1110
1111 if (fliplr) {
1112 // Swap top rows
1113 tmp = topL;
1114 topL = topR;
1115 topR = tmp;
1116 // Swap bottom rows
1117 tmp = botL;
1118 botL = botR;
1119 botR = tmp;
1120 }
1121
1122 // load first 8 columns
1123 load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
1124 load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
1125
1126 // load second 8 columns
1127 load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
1128 load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
1129
1130 convert_8x8_to_16x16(in, out);
1131 }
1132
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out,
                                    int stride, int flipud, int fliplr,
                                    int shift) {
1136 const int16_t *topL = input;
1137 const int16_t *botL = input + 8 * stride;
1138
1139 const int16_t *tmp;
1140
1141 if (flipud) {
1142 tmp = topL;
1143 topL = botL;
1144 botL = tmp;
1145 }
1146
1147 load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
1148 load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift);
1149 }
1150
static INLINE void load_buffer_8x4(const int16_t *input, __m128i *out,
                                   int stride, int flipud, int fliplr,
                                   int shift) {
1154 const int16_t *topL = input;
1155 const int16_t *topR = input + 4;
1156
1157 const int16_t *tmp;
1158
1159 if (fliplr) {
1160 tmp = topL;
1161 topL = topR;
1162 topR = tmp;
1163 }
1164
1165 load_buffer_4x4(topL, out, stride, flipud, fliplr, shift);
1166 load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, shift);
1167 }
1168
static INLINE void load_buffer_16x4(const int16_t *input, __m128i *out,
                                    int stride, int flipud, int fliplr,
                                    int shift) {
1172 const int16_t *topL = input;
1173 const int16_t *topR = input + 8;
1174
1175 const int16_t *tmp;
1176
1177 if (fliplr) {
1178 tmp = topL;
1179 topL = topR;
1180 topR = tmp;
1181 }
1182
1183 load_buffer_8x4(topL, out, stride, flipud, fliplr, shift);
1184 load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, shift);
1185 }
1186
static INLINE void load_buffer_4x8(const int16_t *input, __m128i *out,
                                   int stride, int flipud, int fliplr,
                                   int shift) {
1190 const int16_t *topL = input;
1191 const int16_t *botL = input + 4 * stride;
1192
1193 const int16_t *tmp;
1194
1195 if (flipud) {
1196 tmp = topL;
1197 topL = botL;
1198 botL = tmp;
1199 }
1200
1201 load_buffer_4x4(topL, out, stride, flipud, fliplr, shift);
1202 load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, shift);
1203 }
1204
1205 #if !CONFIG_REALTIME_ONLY
static INLINE void load_buffer_4x16(const int16_t *input, __m128i *out,
                                    const int stride, const int flipud,
                                    const int fliplr, const int shift) {
1209 const int16_t *topL = input;
1210 const int16_t *botL = input + 8 * stride;
1211
1212 const int16_t *tmp;
1213
1214 if (flipud) {
1215 tmp = topL;
1216 topL = botL;
1217 botL = tmp;
1218 }
1219 load_buffer_4x8(topL, out, stride, flipud, fliplr, shift);
1220 load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, shift);
1221 }
1222 #endif
1223
static INLINE void load_buffer_32x8n(const int16_t *input, __m128i *out,
                                     int stride, int flipud, int fliplr,
                                     int shift, const int height) {
1227 const int16_t *in = input;
1228 __m128i *output = out;
1229 for (int col = 0; col < height; col++) {
1230 in = input + col * stride;
1231 output = out + col * 8;
1232 load_buffer_4x4(in, output, 4, flipud, fliplr, shift);
1233 load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, shift);
1234 }
1235 }
1236
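// 16-point forward DCT over 'col_num' groups of 4 columns. Each loop
// iteration runs the full butterfly network on one group; every cospi
// rotation is rounded with 1 << (bit - 1) and right-shifted by 'bit', and the
// final stage reorders the internal results (v[0], v[8], v[4], v[12], ...)
// into natural frequency order.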
static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit,
                             const int col_num) {
1239 const int32_t *cospi = cospi_arr(bit);
1240 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1241 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
1242 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1243 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1244 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1245 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1246 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1247 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1248 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1249 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
1250 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1251 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1252 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
1253 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
1254 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
1255 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
1256 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
1257 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
1258 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1259 __m128i u[16], v[16], x;
1260 int col;
1261
1262 // Calculate the column 0, 1, 2, 3
1263 for (col = 0; col < col_num; ++col) {
1264 // stage 0
1265 // stage 1
1266 u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
1267 u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
1268 u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
1269 u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
1270 u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
1271 u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
1272 u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
1273 u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
1274 u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
1275 u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
1276 u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
1277 u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
1278 u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
1279 u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
1280 u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
1281 u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
1282
1283 // stage 2
1284 v[0] = _mm_add_epi32(u[0], u[7]);
1285 v[7] = _mm_sub_epi32(u[0], u[7]);
1286 v[1] = _mm_add_epi32(u[1], u[6]);
1287 v[6] = _mm_sub_epi32(u[1], u[6]);
1288 v[2] = _mm_add_epi32(u[2], u[5]);
1289 v[5] = _mm_sub_epi32(u[2], u[5]);
1290 v[3] = _mm_add_epi32(u[3], u[4]);
1291 v[4] = _mm_sub_epi32(u[3], u[4]);
1292 v[8] = u[8];
1293 v[9] = u[9];
1294
1295 v[10] = _mm_mullo_epi32(u[10], cospim32);
1296 x = _mm_mullo_epi32(u[13], cospi32);
1297 v[10] = _mm_add_epi32(v[10], x);
1298 v[10] = _mm_add_epi32(v[10], rnding);
1299 v[10] = _mm_srai_epi32(v[10], bit);
1300
1301 v[13] = _mm_mullo_epi32(u[10], cospi32);
1302 x = _mm_mullo_epi32(u[13], cospim32);
1303 v[13] = _mm_sub_epi32(v[13], x);
1304 v[13] = _mm_add_epi32(v[13], rnding);
1305 v[13] = _mm_srai_epi32(v[13], bit);
1306
1307 v[11] = _mm_mullo_epi32(u[11], cospim32);
1308 x = _mm_mullo_epi32(u[12], cospi32);
1309 v[11] = _mm_add_epi32(v[11], x);
1310 v[11] = _mm_add_epi32(v[11], rnding);
1311 v[11] = _mm_srai_epi32(v[11], bit);
1312
1313 v[12] = _mm_mullo_epi32(u[11], cospi32);
1314 x = _mm_mullo_epi32(u[12], cospim32);
1315 v[12] = _mm_sub_epi32(v[12], x);
1316 v[12] = _mm_add_epi32(v[12], rnding);
1317 v[12] = _mm_srai_epi32(v[12], bit);
1318 v[14] = u[14];
1319 v[15] = u[15];
1320
1321 // stage 3
1322 u[0] = _mm_add_epi32(v[0], v[3]);
1323 u[3] = _mm_sub_epi32(v[0], v[3]);
1324 u[1] = _mm_add_epi32(v[1], v[2]);
1325 u[2] = _mm_sub_epi32(v[1], v[2]);
1326 u[4] = v[4];
1327
1328 u[5] = _mm_mullo_epi32(v[5], cospim32);
1329 x = _mm_mullo_epi32(v[6], cospi32);
1330 u[5] = _mm_add_epi32(u[5], x);
1331 u[5] = _mm_add_epi32(u[5], rnding);
1332 u[5] = _mm_srai_epi32(u[5], bit);
1333
1334 u[6] = _mm_mullo_epi32(v[5], cospi32);
1335 x = _mm_mullo_epi32(v[6], cospim32);
1336 u[6] = _mm_sub_epi32(u[6], x);
1337 u[6] = _mm_add_epi32(u[6], rnding);
1338 u[6] = _mm_srai_epi32(u[6], bit);
1339
1340 u[7] = v[7];
1341 u[8] = _mm_add_epi32(v[8], v[11]);
1342 u[11] = _mm_sub_epi32(v[8], v[11]);
1343 u[9] = _mm_add_epi32(v[9], v[10]);
1344 u[10] = _mm_sub_epi32(v[9], v[10]);
1345 u[12] = _mm_sub_epi32(v[15], v[12]);
1346 u[15] = _mm_add_epi32(v[15], v[12]);
1347 u[13] = _mm_sub_epi32(v[14], v[13]);
1348 u[14] = _mm_add_epi32(v[14], v[13]);
1349
1350 // stage 4
1351 u[0] = _mm_mullo_epi32(u[0], cospi32);
1352 u[1] = _mm_mullo_epi32(u[1], cospi32);
1353 v[0] = _mm_add_epi32(u[0], u[1]);
1354 v[0] = _mm_add_epi32(v[0], rnding);
1355 v[0] = _mm_srai_epi32(v[0], bit);
1356
1357 v[1] = _mm_sub_epi32(u[0], u[1]);
1358 v[1] = _mm_add_epi32(v[1], rnding);
1359 v[1] = _mm_srai_epi32(v[1], bit);
1360
1361 v[2] = _mm_mullo_epi32(u[2], cospi48);
1362 x = _mm_mullo_epi32(u[3], cospi16);
1363 v[2] = _mm_add_epi32(v[2], x);
1364 v[2] = _mm_add_epi32(v[2], rnding);
1365 v[2] = _mm_srai_epi32(v[2], bit);
1366
1367 v[3] = _mm_mullo_epi32(u[2], cospi16);
1368 x = _mm_mullo_epi32(u[3], cospi48);
1369 v[3] = _mm_sub_epi32(x, v[3]);
1370 v[3] = _mm_add_epi32(v[3], rnding);
1371 v[3] = _mm_srai_epi32(v[3], bit);
1372
1373 v[4] = _mm_add_epi32(u[4], u[5]);
1374 v[5] = _mm_sub_epi32(u[4], u[5]);
1375 v[6] = _mm_sub_epi32(u[7], u[6]);
1376 v[7] = _mm_add_epi32(u[7], u[6]);
1377 v[8] = u[8];
1378
1379 v[9] = _mm_mullo_epi32(u[9], cospim16);
1380 x = _mm_mullo_epi32(u[14], cospi48);
1381 v[9] = _mm_add_epi32(v[9], x);
1382 v[9] = _mm_add_epi32(v[9], rnding);
1383 v[9] = _mm_srai_epi32(v[9], bit);
1384
1385 v[14] = _mm_mullo_epi32(u[9], cospi48);
1386 x = _mm_mullo_epi32(u[14], cospim16);
1387 v[14] = _mm_sub_epi32(v[14], x);
1388 v[14] = _mm_add_epi32(v[14], rnding);
1389 v[14] = _mm_srai_epi32(v[14], bit);
1390
1391 v[10] = _mm_mullo_epi32(u[10], cospim48);
1392 x = _mm_mullo_epi32(u[13], cospim16);
1393 v[10] = _mm_add_epi32(v[10], x);
1394 v[10] = _mm_add_epi32(v[10], rnding);
1395 v[10] = _mm_srai_epi32(v[10], bit);
1396
1397 v[13] = _mm_mullo_epi32(u[10], cospim16);
1398 x = _mm_mullo_epi32(u[13], cospim48);
1399 v[13] = _mm_sub_epi32(v[13], x);
1400 v[13] = _mm_add_epi32(v[13], rnding);
1401 v[13] = _mm_srai_epi32(v[13], bit);
1402
1403 v[11] = u[11];
1404 v[12] = u[12];
1405 v[15] = u[15];
1406
1407 // stage 5
1408 u[0] = v[0];
1409 u[1] = v[1];
1410 u[2] = v[2];
1411 u[3] = v[3];
1412
1413 u[4] = _mm_mullo_epi32(v[4], cospi56);
1414 x = _mm_mullo_epi32(v[7], cospi8);
1415 u[4] = _mm_add_epi32(u[4], x);
1416 u[4] = _mm_add_epi32(u[4], rnding);
1417 u[4] = _mm_srai_epi32(u[4], bit);
1418
1419 u[7] = _mm_mullo_epi32(v[4], cospi8);
1420 x = _mm_mullo_epi32(v[7], cospi56);
1421 u[7] = _mm_sub_epi32(x, u[7]);
1422 u[7] = _mm_add_epi32(u[7], rnding);
1423 u[7] = _mm_srai_epi32(u[7], bit);
1424
1425 u[5] = _mm_mullo_epi32(v[5], cospi24);
1426 x = _mm_mullo_epi32(v[6], cospi40);
1427 u[5] = _mm_add_epi32(u[5], x);
1428 u[5] = _mm_add_epi32(u[5], rnding);
1429 u[5] = _mm_srai_epi32(u[5], bit);
1430
1431 u[6] = _mm_mullo_epi32(v[5], cospi40);
1432 x = _mm_mullo_epi32(v[6], cospi24);
1433 u[6] = _mm_sub_epi32(x, u[6]);
1434 u[6] = _mm_add_epi32(u[6], rnding);
1435 u[6] = _mm_srai_epi32(u[6], bit);
1436
1437 u[8] = _mm_add_epi32(v[8], v[9]);
1438 u[9] = _mm_sub_epi32(v[8], v[9]);
1439 u[10] = _mm_sub_epi32(v[11], v[10]);
1440 u[11] = _mm_add_epi32(v[11], v[10]);
1441 u[12] = _mm_add_epi32(v[12], v[13]);
1442 u[13] = _mm_sub_epi32(v[12], v[13]);
1443 u[14] = _mm_sub_epi32(v[15], v[14]);
1444 u[15] = _mm_add_epi32(v[15], v[14]);
1445
1446 // stage 6
1447 v[0] = u[0];
1448 v[1] = u[1];
1449 v[2] = u[2];
1450 v[3] = u[3];
1451 v[4] = u[4];
1452 v[5] = u[5];
1453 v[6] = u[6];
1454 v[7] = u[7];
1455
1456 v[8] = _mm_mullo_epi32(u[8], cospi60);
1457 x = _mm_mullo_epi32(u[15], cospi4);
1458 v[8] = _mm_add_epi32(v[8], x);
1459 v[8] = _mm_add_epi32(v[8], rnding);
1460 v[8] = _mm_srai_epi32(v[8], bit);
1461
1462 v[15] = _mm_mullo_epi32(u[8], cospi4);
1463 x = _mm_mullo_epi32(u[15], cospi60);
1464 v[15] = _mm_sub_epi32(x, v[15]);
1465 v[15] = _mm_add_epi32(v[15], rnding);
1466 v[15] = _mm_srai_epi32(v[15], bit);
1467
1468 v[9] = _mm_mullo_epi32(u[9], cospi28);
1469 x = _mm_mullo_epi32(u[14], cospi36);
1470 v[9] = _mm_add_epi32(v[9], x);
1471 v[9] = _mm_add_epi32(v[9], rnding);
1472 v[9] = _mm_srai_epi32(v[9], bit);
1473
1474 v[14] = _mm_mullo_epi32(u[9], cospi36);
1475 x = _mm_mullo_epi32(u[14], cospi28);
1476 v[14] = _mm_sub_epi32(x, v[14]);
1477 v[14] = _mm_add_epi32(v[14], rnding);
1478 v[14] = _mm_srai_epi32(v[14], bit);
1479
1480 v[10] = _mm_mullo_epi32(u[10], cospi44);
1481 x = _mm_mullo_epi32(u[13], cospi20);
1482 v[10] = _mm_add_epi32(v[10], x);
1483 v[10] = _mm_add_epi32(v[10], rnding);
1484 v[10] = _mm_srai_epi32(v[10], bit);
1485
1486 v[13] = _mm_mullo_epi32(u[10], cospi20);
1487 x = _mm_mullo_epi32(u[13], cospi44);
1488 v[13] = _mm_sub_epi32(x, v[13]);
1489 v[13] = _mm_add_epi32(v[13], rnding);
1490 v[13] = _mm_srai_epi32(v[13], bit);
1491
1492 v[11] = _mm_mullo_epi32(u[11], cospi12);
1493 x = _mm_mullo_epi32(u[12], cospi52);
1494 v[11] = _mm_add_epi32(v[11], x);
1495 v[11] = _mm_add_epi32(v[11], rnding);
1496 v[11] = _mm_srai_epi32(v[11], bit);
1497
1498 v[12] = _mm_mullo_epi32(u[11], cospi52);
1499 x = _mm_mullo_epi32(u[12], cospi12);
1500 v[12] = _mm_sub_epi32(x, v[12]);
1501 v[12] = _mm_add_epi32(v[12], rnding);
1502 v[12] = _mm_srai_epi32(v[12], bit);
1503
1504 out[0 * col_num + col] = v[0];
1505 out[1 * col_num + col] = v[8];
1506 out[2 * col_num + col] = v[4];
1507 out[3 * col_num + col] = v[12];
1508 out[4 * col_num + col] = v[2];
1509 out[5 * col_num + col] = v[10];
1510 out[6 * col_num + col] = v[6];
1511 out[7 * col_num + col] = v[14];
1512 out[8 * col_num + col] = v[1];
1513 out[9 * col_num + col] = v[9];
1514 out[10 * col_num + col] = v[5];
1515 out[11 * col_num + col] = v[13];
1516 out[12 * col_num + col] = v[3];
1517 out[13 * col_num + col] = v[11];
1518 out[14 * col_num + col] = v[7];
1519 out[15 * col_num + col] = v[15];
1520 }
1521 }
1522
static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
                              const int num_cols) {
1525 const int32_t *cospi = cospi_arr(bit);
1526 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1527 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1528 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1529 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1530 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1531 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1532 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1533 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
1534 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
1535 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1536 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
1537 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
1538 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
1539 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
1540 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
1541 const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
1542 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
1543 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
1544 const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
1545 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
1546 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
1547 const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
1548 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
1549 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
1550 const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
1551 const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
1552 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
1553 const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
1554 const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
1555 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
1556 const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
1557 const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
1558 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
1559 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
1560 const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
1561 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
1562 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
1563 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1564 const __m128i zero = _mm_setzero_si128();
1565
1566 __m128i u[16], v[16], x, y;
1567 int col;
1568
1569 for (col = 0; col < num_cols; ++col) {
1570 // stage 0
1571 // stage 1
1572 u[0] = in[0 * num_cols + col];
1573 u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]);
1574 u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]);
1575 u[3] = in[8 * num_cols + col];
1576 u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]);
1577 u[5] = in[12 * num_cols + col];
1578 u[6] = in[4 * num_cols + col];
1579 u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]);
1580 u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]);
1581 u[9] = in[14 * num_cols + col];
1582 u[10] = in[6 * num_cols + col];
1583 u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]);
1584 u[12] = in[2 * num_cols + col];
1585 u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]);
1586 u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]);
1587 u[15] = in[10 * num_cols + col];
1588
1589 // stage 2
1590 v[0] = u[0];
1591 v[1] = u[1];
1592
1593 x = _mm_mullo_epi32(u[2], cospi32);
1594 y = _mm_mullo_epi32(u[3], cospi32);
1595 v[2] = _mm_add_epi32(x, y);
1596 v[2] = _mm_add_epi32(v[2], rnding);
1597 v[2] = _mm_srai_epi32(v[2], bit);
1598
1599 v[3] = _mm_sub_epi32(x, y);
1600 v[3] = _mm_add_epi32(v[3], rnding);
1601 v[3] = _mm_srai_epi32(v[3], bit);
1602
1603 v[4] = u[4];
1604 v[5] = u[5];
1605
1606 x = _mm_mullo_epi32(u[6], cospi32);
1607 y = _mm_mullo_epi32(u[7], cospi32);
1608 v[6] = _mm_add_epi32(x, y);
1609 v[6] = _mm_add_epi32(v[6], rnding);
1610 v[6] = _mm_srai_epi32(v[6], bit);
1611
1612 v[7] = _mm_sub_epi32(x, y);
1613 v[7] = _mm_add_epi32(v[7], rnding);
1614 v[7] = _mm_srai_epi32(v[7], bit);
1615
1616 v[8] = u[8];
1617 v[9] = u[9];
1618
1619 x = _mm_mullo_epi32(u[10], cospi32);
1620 y = _mm_mullo_epi32(u[11], cospi32);
1621 v[10] = _mm_add_epi32(x, y);
1622 v[10] = _mm_add_epi32(v[10], rnding);
1623 v[10] = _mm_srai_epi32(v[10], bit);
1624
1625 v[11] = _mm_sub_epi32(x, y);
1626 v[11] = _mm_add_epi32(v[11], rnding);
1627 v[11] = _mm_srai_epi32(v[11], bit);
1628
1629 v[12] = u[12];
1630 v[13] = u[13];
1631
1632 x = _mm_mullo_epi32(u[14], cospi32);
1633 y = _mm_mullo_epi32(u[15], cospi32);
1634 v[14] = _mm_add_epi32(x, y);
1635 v[14] = _mm_add_epi32(v[14], rnding);
1636 v[14] = _mm_srai_epi32(v[14], bit);
1637
1638 v[15] = _mm_sub_epi32(x, y);
1639 v[15] = _mm_add_epi32(v[15], rnding);
1640 v[15] = _mm_srai_epi32(v[15], bit);
1641
1642 // stage 3
1643 u[0] = _mm_add_epi32(v[0], v[2]);
1644 u[1] = _mm_add_epi32(v[1], v[3]);
1645 u[2] = _mm_sub_epi32(v[0], v[2]);
1646 u[3] = _mm_sub_epi32(v[1], v[3]);
1647 u[4] = _mm_add_epi32(v[4], v[6]);
1648 u[5] = _mm_add_epi32(v[5], v[7]);
1649 u[6] = _mm_sub_epi32(v[4], v[6]);
1650 u[7] = _mm_sub_epi32(v[5], v[7]);
1651 u[8] = _mm_add_epi32(v[8], v[10]);
1652 u[9] = _mm_add_epi32(v[9], v[11]);
1653 u[10] = _mm_sub_epi32(v[8], v[10]);
1654 u[11] = _mm_sub_epi32(v[9], v[11]);
1655 u[12] = _mm_add_epi32(v[12], v[14]);
1656 u[13] = _mm_add_epi32(v[13], v[15]);
1657 u[14] = _mm_sub_epi32(v[12], v[14]);
1658 u[15] = _mm_sub_epi32(v[13], v[15]);
1659
1660 // stage 4
1661 v[0] = u[0];
1662 v[1] = u[1];
1663 v[2] = u[2];
1664 v[3] = u[3];
1665 v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
1666 v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
1667 v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
1668 v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
1669 v[8] = u[8];
1670 v[9] = u[9];
1671 v[10] = u[10];
1672 v[11] = u[11];
1673 v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
1674 v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
1675 v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
1676 v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
1677
1678 // stage 5
1679 u[0] = _mm_add_epi32(v[0], v[4]);
1680 u[1] = _mm_add_epi32(v[1], v[5]);
1681 u[2] = _mm_add_epi32(v[2], v[6]);
1682 u[3] = _mm_add_epi32(v[3], v[7]);
1683 u[4] = _mm_sub_epi32(v[0], v[4]);
1684 u[5] = _mm_sub_epi32(v[1], v[5]);
1685 u[6] = _mm_sub_epi32(v[2], v[6]);
1686 u[7] = _mm_sub_epi32(v[3], v[7]);
1687 u[8] = _mm_add_epi32(v[8], v[12]);
1688 u[9] = _mm_add_epi32(v[9], v[13]);
1689 u[10] = _mm_add_epi32(v[10], v[14]);
1690 u[11] = _mm_add_epi32(v[11], v[15]);
1691 u[12] = _mm_sub_epi32(v[8], v[12]);
1692 u[13] = _mm_sub_epi32(v[9], v[13]);
1693 u[14] = _mm_sub_epi32(v[10], v[14]);
1694 u[15] = _mm_sub_epi32(v[11], v[15]);
1695
1696 // stage 6
1697 v[0] = u[0];
1698 v[1] = u[1];
1699 v[2] = u[2];
1700 v[3] = u[3];
1701 v[4] = u[4];
1702 v[5] = u[5];
1703 v[6] = u[6];
1704 v[7] = u[7];
1705 v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
1706 v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
1707 v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
1708 v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
1709 v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
1710 v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
1711 v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
1712 v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
1713
1714 // stage 7
1715 u[0] = _mm_add_epi32(v[0], v[8]);
1716 u[1] = _mm_add_epi32(v[1], v[9]);
1717 u[2] = _mm_add_epi32(v[2], v[10]);
1718 u[3] = _mm_add_epi32(v[3], v[11]);
1719 u[4] = _mm_add_epi32(v[4], v[12]);
1720 u[5] = _mm_add_epi32(v[5], v[13]);
1721 u[6] = _mm_add_epi32(v[6], v[14]);
1722 u[7] = _mm_add_epi32(v[7], v[15]);
1723 u[8] = _mm_sub_epi32(v[0], v[8]);
1724 u[9] = _mm_sub_epi32(v[1], v[9]);
1725 u[10] = _mm_sub_epi32(v[2], v[10]);
1726 u[11] = _mm_sub_epi32(v[3], v[11]);
1727 u[12] = _mm_sub_epi32(v[4], v[12]);
1728 u[13] = _mm_sub_epi32(v[5], v[13]);
1729 u[14] = _mm_sub_epi32(v[6], v[14]);
1730 u[15] = _mm_sub_epi32(v[7], v[15]);
1731
1732 // stage 8
1733 v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
1734 v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
1735 v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
1736 v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
1737 v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
1738 v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
1739 v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
1740 v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
1741 v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
1742 v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
1743 v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
1744 v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
1745 v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
1746 v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
1747 v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
1748 v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
1749
1750 // stage 9
1751 out[0 * num_cols + col] = v[1];
1752 out[1 * num_cols + col] = v[14];
1753 out[2 * num_cols + col] = v[3];
1754 out[3 * num_cols + col] = v[12];
1755 out[4 * num_cols + col] = v[5];
1756 out[5 * num_cols + col] = v[10];
1757 out[6 * num_cols + col] = v[7];
1758 out[7 * num_cols + col] = v[8];
1759 out[8 * num_cols + col] = v[9];
1760 out[9 * num_cols + col] = v[6];
1761 out[10 * num_cols + col] = v[11];
1762 out[11 * num_cols + col] = v[4];
1763 out[12 * num_cols + col] = v[13];
1764 out[13 * num_cols + col] = v[2];
1765 out[14 * num_cols + col] = v[15];
1766 out[15 * num_cols + col] = v[0];
1767 }
1768 }
1769
1770 static void col_txfm_16x16_rounding(__m128i *in, int shift) {
1771 // Note:
1772 // We split the 16x16 rounding into four 8x8 rounding sections
1773 // instead of four column-by-column passes.
1774 col_txfm_8x8_rounding(&in[0], shift);
1775 col_txfm_8x8_rounding(&in[16], shift);
1776 col_txfm_8x8_rounding(&in[32], shift);
1777 col_txfm_8x8_rounding(&in[48], shift);
1778 }
1779
1780 static void col_txfm_8x16_rounding(__m128i *in, int shift) {
1781 col_txfm_8x8_rounding(&in[0], shift);
1782 col_txfm_8x8_rounding(&in[16], shift);
1783 }
1784
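// Stores a 16x16 block held in 64 __m128i registers as four consecutive
// 8x8 tiles of 32-bit coefficients (size_8x8 = 64 values per tile).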
1785 static void write_buffer_16x16(const __m128i *in, int32_t *output) {
1786 const int size_8x8 = 16 * 4;
1787 write_buffer_8x8(&in[0], output);
1788 output += size_8x8;
1789 write_buffer_8x8(&in[16], output);
1790 output += size_8x8;
1791 write_buffer_8x8(&in[32], output);
1792 output += size_8x8;
1793 write_buffer_8x8(&in[48], output);
1794 }
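// Identity (IDTX) 16-point transform: each coefficient is scaled by 2*sqrt(2)
// in fixed point, i.e.
//   out = (in * 2 * NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits
// The cos-bit argument is unused.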
1795 static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
1796 (void)bit;
1797 __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
1798 __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
1799 __m128i a_low;
1800
1801 int num_iters = 16 * col_num;
1802 for (int i = 0; i < num_iters; i++) {
1803 a_low = _mm_mullo_epi32(in[i], fact);
1804 a_low = _mm_add_epi32(a_low, offset);
1805 out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
1806 }
1807 }
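// 2-D 16x16 forward transform. For every tx_type the flow is the same:
// load the residual (with optional up/down and left/right flips) and
// pre-scale by shift[0], run the column 1-D kernel at the column cos-bit
// precision, round by shift[1], transpose, run the row 1-D kernel at the
// row cos-bit precision, then transpose back and store. col_num is 4
// because the 16 columns are processed as four groups of four lanes.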
1808 void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
1809 int stride, TX_TYPE tx_type, int bd) {
1810 __m128i in[64], out[64];
1811 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
1812 const int txw_idx = get_txw_idx(TX_16X16);
1813 const int txh_idx = get_txh_idx(TX_16X16);
1814 const int col_num = 4;
1815 switch (tx_type) {
1816 case DCT_DCT:
1817 load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1818 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1819 col_txfm_16x16_rounding(out, -shift[1]);
1820 transpose_16x16(out, in);
1821 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1822 transpose_16x16(out, in);
1823 write_buffer_16x16(in, coeff);
1824 break;
1825 case ADST_DCT:
1826 load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1827 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1828 col_num);
1829 col_txfm_16x16_rounding(out, -shift[1]);
1830 transpose_16x16(out, in);
1831 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1832 transpose_16x16(out, in);
1833 write_buffer_16x16(in, coeff);
1834 break;
1835 case DCT_ADST:
1836 load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1837 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1838 col_txfm_16x16_rounding(out, -shift[1]);
1839 transpose_16x16(out, in);
1840 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1841 col_num);
1842 transpose_16x16(out, in);
1843 write_buffer_16x16(in, coeff);
1844 break;
1845 case ADST_ADST:
1846 load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1847 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1848 col_num);
1849 col_txfm_16x16_rounding(out, -shift[1]);
1850 transpose_16x16(out, in);
1851 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1852 col_num);
1853 transpose_16x16(out, in);
1854 write_buffer_16x16(in, coeff);
1855 break;
1856 case FLIPADST_DCT:
1857 load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
1858 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1859 col_num);
1860 col_txfm_16x16_rounding(out, -shift[1]);
1861 transpose_16x16(out, in);
1862 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1863 transpose_16x16(out, in);
1864 write_buffer_16x16(in, coeff);
1865 break;
1866 case DCT_FLIPADST:
1867 load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
1868 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1869 col_txfm_16x16_rounding(out, -shift[1]);
1870 transpose_16x16(out, in);
1871 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1872 col_num);
1873 transpose_16x16(out, in);
1874 write_buffer_16x16(in, coeff);
1875 break;
1876 case FLIPADST_FLIPADST:
1877 load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
1878 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1879 col_num);
1880 col_txfm_16x16_rounding(out, -shift[1]);
1881 transpose_16x16(out, in);
1882 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1883 col_num);
1884 transpose_16x16(out, in);
1885 write_buffer_16x16(in, coeff);
1886 break;
1887 case ADST_FLIPADST:
1888 load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
1889 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1890 col_num);
1891 col_txfm_16x16_rounding(out, -shift[1]);
1892 transpose_16x16(out, in);
1893 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1894 col_num);
1895 transpose_16x16(out, in);
1896 write_buffer_16x16(in, coeff);
1897 break;
1898 case FLIPADST_ADST:
1899 load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
1900 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1901 col_num);
1902 col_txfm_16x16_rounding(out, -shift[1]);
1903 transpose_16x16(out, in);
1904 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1905 col_num);
1906 transpose_16x16(out, in);
1907 write_buffer_16x16(in, coeff);
1908 break;
1909 case IDTX:
1910 load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1911 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1912 col_txfm_16x16_rounding(out, -shift[1]);
1913 transpose_16x16(out, in);
1914 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1915 transpose_16x16(out, in);
1916 write_buffer_16x16(in, coeff);
1917 break;
1918 case V_DCT:
1919 load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1920 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1921 col_txfm_16x16_rounding(out, -shift[1]);
1922 transpose_16x16(out, in);
1923 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1924 transpose_16x16(out, in);
1925 write_buffer_16x16(in, coeff);
1926 break;
1927 case H_DCT:
1928 load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1929 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1930 col_txfm_16x16_rounding(out, -shift[1]);
1931 transpose_16x16(out, in);
1932 fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1933 transpose_16x16(out, in);
1934 write_buffer_16x16(in, coeff);
1935 break;
1936 case V_ADST:
1937 load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1938 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1939 col_num);
1940 col_txfm_16x16_rounding(out, -shift[1]);
1941 transpose_16x16(out, in);
1942 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1943 transpose_16x16(out, in);
1944 write_buffer_16x16(in, coeff);
1945 break;
1946 case H_ADST:
1947 load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1948 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1949 col_txfm_16x16_rounding(out, -shift[1]);
1950 transpose_16x16(out, in);
1951 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1952 col_num);
1953 transpose_16x16(out, in);
1954 write_buffer_16x16(in, coeff);
1955 break;
1956 case V_FLIPADST:
1957 load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
1958 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1959 col_num);
1960 col_txfm_16x16_rounding(out, -shift[1]);
1961 transpose_16x16(out, in);
1962 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1963 transpose_16x16(out, in);
1964 write_buffer_16x16(in, coeff);
1965 break;
1966 case H_FLIPADST:
1967 load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
1968 idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1969 col_txfm_16x16_rounding(out, -shift[1]);
1970 transpose_16x16(out, in);
1971 fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1972 col_num);
1973 transpose_16x16(out, in);
1974 write_buffer_16x16(in, coeff);
1975 break;
1976 default: assert(0);
1977 }
1978 (void)bd;
1979 }
1980
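// Copies 'out' back into 'in' with the sixteen two-register rows taken in
// reverse order, realizing the left/right flip needed by the *FLIPADST row
// transforms; only used with size == 32 from av1_fwd_txfm2d_16x8_sse4_1().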
1981 static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) {
1982 for (int i = 0; i < size; i += 2) in[30 - i] = out[i];
1983 for (int i = 1; i < size; i += 2) in[size - i] = out[i];
1984 }
1985
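// Per-TX_TYPE dispatch tables selecting the 1-D column and row kernels used
// by the rectangular 2-D transforms below. NULL entries mark tx_type/size
// combinations that are never selected for that block size.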
1986 static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = {
1987 fdct8x8_sse4_1, // DCT_DCT
1988 fadst8x8_sse4_1, // ADST_DCT
1989 fdct8x8_sse4_1, // DCT_ADST
1990 fadst8x8_sse4_1, // ADST_ADST
1991 fadst8x8_sse4_1, // FLIPADST_DCT
1992 fdct8x8_sse4_1, // DCT_FLIPADST
1993 fadst8x8_sse4_1, // FLIPADST_FLIPADST
1994 fadst8x8_sse4_1, // ADST_FLIPADST
1995 fadst8x8_sse4_1, // FLIPADST_ADST
1996 idtx8x8_sse4_1, // IDTX
1997 fdct8x8_sse4_1, // V_DCT
1998 idtx8x8_sse4_1, // H_DCT
1999 fadst8x8_sse4_1, // V_ADST
2000 idtx8x8_sse4_1, // H_ADST
2001 fadst8x8_sse4_1, // V_FLIPADST
2002 idtx8x8_sse4_1 // H_FLIPADST
2003 };
2004 #if !CONFIG_REALTIME_ONLY
2005 static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = {
2006 fdct8x8_sse4_1, // DCT_DCT
2007 NULL, // ADST_DCT
2008 NULL, // DCT_ADST
2009 NULL, // ADST_ADST
2010 NULL, // FLIPADST_DCT
2011 NULL, // DCT_FLIPADST
2012 NULL, // FLIPADST_FLIPADST
2013 NULL, // ADST_FLIPADST
2014 NULL, // FLIPADST_ADST
2015 idtx32x8_sse4_1, // IDTX
2016 NULL, // V_DCT
2017 NULL, // H_DCT
2018 NULL, // V_ADST
2019 NULL, // H_ADST
2020 NULL, // V_FLIPADST
2021 NULL, // H_FLIPADST
2022 };
2023 #endif
2024 static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = {
2025 fdct4x8_sse4_1, // DCT_DCT
2026 fadst8x8_sse4_1, // ADST_DCT
2027 fdct4x8_sse4_1, // DCT_ADST
2028 fadst8x8_sse4_1, // ADST_ADST
2029 fadst8x8_sse4_1, // FLIPADST_DCT
2030 fdct4x8_sse4_1, // DCT_FLIPADST
2031 fadst8x8_sse4_1, // FLIPADST_FLIPADST
2032 fadst8x8_sse4_1, // ADST_FLIPADST
2033 fadst8x8_sse4_1, // FLIPADST_ADST
2034 idtx8x8_sse4_1, // IDTX
2035 fdct4x8_sse4_1, // V_DCT
2036 idtx8x8_sse4_1, // H_DCT
2037 fadst8x8_sse4_1, // V_ADST
2038 idtx8x8_sse4_1, // H_ADST
2039 fadst8x8_sse4_1, // V_FLIPADST
2040 idtx8x8_sse4_1 // H_FLIPADST
2041 };
2042
2043 static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = {
2044 fdct16x16_sse4_1, // DCT_DCT
2045 fdct16x16_sse4_1, // ADST_DCT
2046 fadst16x16_sse4_1, // DCT_ADST
2047 fadst16x16_sse4_1, // ADST_ADST
2048 fdct16x16_sse4_1, // FLIPADST_DCT
2049 fadst16x16_sse4_1, // DCT_FLIPADST
2050 fadst16x16_sse4_1, // FLIPADST_FLIPADST
2051 fadst16x16_sse4_1, // ADST_FLIPADST
2052 fadst16x16_sse4_1, // FLIPADST_ADST
2053 idtx16x16_sse4_1, // IDTX
2054 idtx16x16_sse4_1, // V_DCT
2055 fdct16x16_sse4_1, // H_DCT
2056 idtx16x16_sse4_1, // V_ADST
2057 fadst16x16_sse4_1, // H_ADST
2058 idtx16x16_sse4_1, // V_FLIPADST
2059 fadst16x16_sse4_1 // H_FLIPADST
2060 };
2061
2062 static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = {
2063 fdct16x16_sse4_1, // DCT_DCT
2064 fadst16x16_sse4_1, // ADST_DCT
2065 fdct16x16_sse4_1, // DCT_ADST
2066 fadst16x16_sse4_1, // ADST_ADST
2067 fadst16x16_sse4_1, // FLIPADST_DCT
2068 fdct16x16_sse4_1, // DCT_FLIPADST
2069 fadst16x16_sse4_1, // FLIPADST_FLIPADST
2070 fadst16x16_sse4_1, // ADST_FLIPADST
2071 fadst16x16_sse4_1, // FLIPADST_ADST
2072 idtx16x16_sse4_1, // IDTX
2073 fdct16x16_sse4_1, // V_DCT
2074 idtx16x16_sse4_1, // H_DCT
2075 fadst16x16_sse4_1, // V_ADST
2076 idtx16x16_sse4_1, // H_ADST
2077 fadst16x16_sse4_1, // V_FLIPADST
2078 idtx16x16_sse4_1 // H_FLIPADST
2079 };
2080 static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = {
2081 fdct8x8_sse4_1, // DCT_DCT
2082 fdct8x8_sse4_1, // ADST_DCT
2083 fadst8x8_sse4_1, // DCT_ADST
2084 fadst8x8_sse4_1, // ADST_ADST
2085 fdct8x8_sse4_1, // FLIPADST_DCT
2086 fadst8x8_sse4_1, // DCT_FLIPADST
2087 fadst8x8_sse4_1, // FLIPADST_FLIPADST
2088 fadst8x8_sse4_1, // ADST_FLIPADST
2089 fadst8x8_sse4_1, // FLIPADST_ADST
2090 idtx8x8_sse4_1, // IDTX
2091 idtx8x8_sse4_1, // V_DCT
2092 fdct8x8_sse4_1, // H_DCT
2093 idtx8x8_sse4_1, // V_ADST
2094 fadst8x8_sse4_1, // H_ADST
2095 idtx8x8_sse4_1, // V_FLIPADST
2096 fadst8x8_sse4_1 // H_FLIPADST
2097 };
2098
2099 static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = {
2100 fdct4x8_sse4_1, // DCT_DCT
2101 fdct4x8_sse4_1, // ADST_DCT
2102 fadst8x8_sse4_1, // DCT_ADST
2103 fadst8x8_sse4_1, // ADST_ADST
2104 fdct4x8_sse4_1, // FLIPADST_DCT
2105 fadst8x8_sse4_1, // DCT_FLIPADST
2106 fadst8x8_sse4_1, // FLIPADST_FLIPADST
2107 fadst8x8_sse4_1, // ADST_FLIPADST
2108 fadst8x8_sse4_1, // FLIPADST_ADST
2109 idtx8x8_sse4_1, // IDTX
2110 idtx8x8_sse4_1, // V_DCT
2111 fdct4x8_sse4_1, // H_DCT
2112 idtx8x8_sse4_1, // V_ADST
2113 fadst8x8_sse4_1, // H_ADST
2114 idtx8x8_sse4_1, // V_FLIPADST
2115 fadst8x8_sse4_1 // H_FLIPADST
2116 };
2117
2118 static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = {
2119 fdct4x4_sse4_1, // DCT_DCT
2120 fdct4x4_sse4_1, // ADST_DCT
2121 fadst4x4_sse4_1, // DCT_ADST
2122 fadst4x4_sse4_1, // ADST_ADST
2123 fdct4x4_sse4_1, // FLIPADST_DCT
2124 fadst4x4_sse4_1, // DCT_FLIPADST
2125 fadst4x4_sse4_1, // FLIPADST_FLIPADST
2126 fadst4x4_sse4_1, // ADST_FLIPADST
2127 fadst4x4_sse4_1, // FLIPADST_ADST
2128 idtx4x4_sse4_1, // IDTX
2129 idtx4x4_sse4_1, // V_DCT
2130 fdct4x4_sse4_1, // H_DCT
2131 idtx4x4_sse4_1, // V_ADST
2132 fadst4x4_sse4_1, // H_ADST
2133 idtx4x4_sse4_1, // V_FLIPADST
2134 fadst4x4_sse4_1 // H_FLIPADST
2135 };
2136
2137 static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = {
2138 fdct4x4_sse4_1, // DCT_DCT
2139 fadst4x4_sse4_1, // ADST_DCT
2140 fdct4x4_sse4_1, // DCT_ADST
2141 fadst4x4_sse4_1, // ADST_ADST
2142 fadst4x4_sse4_1, // FLIPADST_DCT
2143 fdct4x4_sse4_1, // DCT_FLIPADST
2144 fadst4x4_sse4_1, // FLIPADST_FLIPADST
2145 fadst4x4_sse4_1, // ADST_FLIPADST
2146 fadst4x4_sse4_1, // FLIPADST_ADST
2147 idtx4x4_sse4_1, // IDTX
2148 fdct4x4_sse4_1, // V_DCT
2149 idtx4x4_sse4_1, // H_DCT
2150 fadst4x4_sse4_1, // V_ADST
2151 idtx4x4_sse4_1, // H_ADST
2152 fadst4x4_sse4_1, // V_FLIPADST
2153 idtx4x4_sse4_1 // H_FLIPADST
2154 };
2155
2156 static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = {
2157 av1_fdct32_sse4_1, // DCT_DCT
2158 NULL, // ADST_DCT
2159 NULL, // DCT_ADST
2160 NULL, // ADST_ADST
2161 NULL, // FLIPADST_DCT
2162 NULL, // DCT_FLIPADST
2163 NULL, // FLIPADST_FLIPADST
2164 NULL, // ADST_FLIPADST
2165 NULL, // FLIPADST_ADST
2166 av1_idtx32_sse4_1, // IDTX
2167 NULL, // V_DCT
2168 NULL, // H_DCT
2169 NULL, // V_ADST
2170 NULL, // H_ADST
2171 NULL, // V_FLIPADST
2172 NULL // H_FLIPADST
2173 };
2174
2175 static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = {
2176 fdct16x16_sse4_1, // DCT_DCT
2177 NULL, // ADST_DCT
2178 NULL, // DCT_ADST
2179 NULL, // ADST_ADST
2180 NULL, // FLIPADST_DCT
2181 NULL, // DCT_FLIPADST
2182 NULL, // FLIPADST_FLIPADST
2183 NULL, // ADST_FLIPADST
2184 NULL, // FLIPADST_ADST
2185 idtx16x16_sse4_1, // IDTX
2186 NULL, // V_DCT
2187 NULL, // H_DCT
2188 NULL, // V_ADST
2189 NULL, // H_ADST
2190 NULL, // V_FLIPADST
2191 NULL // H_FLIPADST
2192 };
2193
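// 16x8: each 8x8 half is column-transformed and transposed separately, the
// 16-point row kernel then runs across both halves, and the result is
// rescaled with NewSqrt2 (~sqrt(2) in fixed point), the extra scaling AV1
// applies to 2:1 rectangular transform sizes.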
2194 void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
2195 int stride, TX_TYPE tx_type, int bd) {
2196 __m128i in[32], out[32];
2197 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
2198 const int txw_idx = get_txw_idx(TX_16X8);
2199 const int txh_idx = get_txh_idx(TX_16X8);
2200 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type];
2201 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
2202 int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2203 int ud_flip, lr_flip;
2204 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2205
2206 for (int i = 0; i < 2; i++) {
2207 load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
2208 col_txfm(in, in, bit, 2);
2209 col_txfm_8x8_rounding(in, -shift[1]);
2210 transpose_8x8(in, out + i * 16);
2211 }
2212
2213 if (lr_flip) {
2214 flip_buf_sse4_1(in, out, 32);
2215 row_txfm(in, out, bit, 2);
2216 } else {
2217 row_txfm(out, out, bit, 2);
2218 }
2219
2220 for (int i = 0; i < 2; i++) {
2221 transpose_8x8(out + i * 16, in);
2222 av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
2223 write_buffer_16x8(in, coeff + i * 8, 16);
2224 }
2225
2226 (void)bd;
2227 }
2228
2229 void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
2230 int stride, TX_TYPE tx_type, int bd) {
2231 __m128i in[32], out[32];
2232 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
2233 const int txw_idx = get_txw_idx(TX_8X16);
2234 const int txh_idx = get_txh_idx(TX_8X16);
2235 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
2236 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type];
2237 int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2238 int ud_flip, lr_flip;
2239 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2240
2241 load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
2242 col_txfm(in, in, bit, 2);
2243 col_txfm_8x16_rounding(in, -shift[1]);
2244 transpose_8x8(in, out);
2245 transpose_8x8(in + 16, out + 16);
2246
2247 for (int i = 0; i < 2; i++) {
2248 row_txfm(out + i * 16, out, bit, 2);
2249 transpose_8x8(out, in);
2250 av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
2251 write_buffer_8x8(in, coeff + i * 64);
2252 }
2253
2254 (void)bd;
2255 }
2256
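// 4x16 and 16x4 pair a 16-point kernel on the long side with a 4-point
// kernel on the short side; no NewSqrt2 rescale is applied here (the 4:1
// aspect ratio does not require it).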
2257 #if !CONFIG_REALTIME_ONLY
2258 void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff,
2259 int stride, TX_TYPE tx_type, int bd) {
2260 __m128i in[16];
2261 __m128i *outcoeff128 = (__m128i *)coeff;
2262 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
2263 const int txw_idx = get_txw_idx(TX_4X16);
2264 const int txh_idx = get_txh_idx(TX_4X16);
2265 const int txfm_size_col = tx_size_wide[TX_4X16];
2266 const int txfm_size_row = tx_size_high[TX_4X16];
2267 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2268 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2269 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
2270 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];
2271
2272 int ud_flip, lr_flip;
2273 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2274 // col transform
2275 load_buffer_4x16(input, in, stride, ud_flip, lr_flip, shift[0]);
2276 col_txfm(in, outcoeff128, bitcol, 1);
2277 col_txfm_8x8_rounding(outcoeff128, -shift[1]);
2278 transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
2279
2280 // row transform
2281 for (int i = 0; i < txfm_size_col; i++) {
2282 row_txfm(in + i, outcoeff128 + i * txfm_size_col, bitrow, txfm_size_col);
2283 }
2284 (void)bd;
2285 }
2286 #endif
2287
2288 void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff,
2289 int stride, TX_TYPE tx_type, int bd) {
2290 __m128i in[16];
2291 __m128i *outcoeff128 = (__m128i *)coeff;
2292 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
2293 const int txw_idx = get_txw_idx(TX_16X4);
2294 const int txh_idx = get_txh_idx(TX_16X4);
2295 const int txfm_size_col = tx_size_wide[TX_16X4];
2296 const int txfm_size_row = tx_size_high[TX_16X4];
2297 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2298 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2299 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
2300 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
2301 int ud_flip, lr_flip;
2302 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2303
2304 // col transform
2305 load_buffer_16x4(input, in, stride, ud_flip, lr_flip, shift[0]);
2306
2307 for (int i = 0; i < txfm_size_row; i++) {
2308 col_txfm(in + i * txfm_size_row, outcoeff128 + i * txfm_size_row, bitcol,
2309 1);
2310 }
2311 col_txfm_8x8_rounding(outcoeff128, -shift[1]);
2312
2313 // row transform
2314 row_txfm(outcoeff128, in, bitrow, 1);
2315 transpose_8nx8n(in, outcoeff128, txfm_size_row, txfm_size_col);
2316 (void)bd;
2317 }
2318
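// 16x32 and 32x16 combine the 16-point and 32-point DCT/IDTX kernels from
// the tables above (whichever matches each dimension), followed by the
// NewSqrt2 rescale for the 2:1 aspect ratio.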
2319 void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff,
2320 int stride, TX_TYPE tx_type, int bd) {
2321 __m128i in[128];
2322 __m128i *outcoef128 = (__m128i *)coeff;
2323 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
2324 const int txw_idx = get_txw_idx(TX_16X32);
2325 const int txh_idx = get_txh_idx(TX_16X32);
2326 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
2327 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x32_arr[tx_type];
2328 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2329 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2330
2331 // column transform
2332 load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2333 load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]);
2334
2335 for (int i = 0; i < 4; i++) {
2336 col_txfm((in + i), (in + i), bitcol, 4);
2337 }
2338 col_txfm_16x16_rounding(&in[0], -shift[1]);
2339 col_txfm_16x16_rounding(&in[64], -shift[1]);
2340 transpose_8nx8n(in, outcoef128, 16, 32);
2341
2342 // row transform
2343 row_txfm(outcoef128, in, bitrow, 8);
2344 transpose_8nx8n(in, outcoef128, 32, 16);
2345 av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 128, -shift[2],
2346 NewSqrt2);
2347 (void)bd;
2348 }
2349
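// The 32x64 and 64x32 transforms are DCT-only: tx_type is ignored and
// av1_fdct64/av1_fdct32 are called directly. The NewSqrt2 rescale again
// compensates for the 2:1 aspect ratio; for 64x32 only half of the 64-point
// output is written back (note the txfm_size_col >> 1 and 512 >> 1 below),
// matching AV1's rule of keeping only the lowest 32 frequencies of a
// 64-point dimension.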
2350 void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff,
2351 int stride, TX_TYPE tx_type, int bd) {
2352 (void)tx_type;
2353 __m128i in[512];
2354 __m128i *outcoef128 = (__m128i *)coeff;
2355 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64];
2356 const int txw_idx = get_txw_idx(TX_32X64);
2357 const int txh_idx = get_txh_idx(TX_32X64);
2358 const int txfm_size_col = tx_size_wide[TX_32X64];
2359 const int txfm_size_row = tx_size_high[TX_32X64];
2360 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2361 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2362 const int num_row = txfm_size_row >> 2;
2363 const int num_col = txfm_size_col >> 2;
2364
2365 // column transform
2366 load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
2367 for (int i = 0; i < num_col; i++) {
2368 av1_fdct64_sse4_1((in + i), (in + i), bitcol, num_col, num_col);
2369 }
2370 for (int i = 0; i < num_col; i++) {
2371 col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]);
2372 }
2373 transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
2374
2375 // row transform
2376 for (int i = 0; i < num_row; i++) {
2377 av1_fdct32_sse4_1((outcoef128 + i), (in + i), bitrow, num_row);
2378 }
2379 transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
2380 av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 512, -shift[2],
2381 NewSqrt2);
2382 (void)bd;
2383 }
2384
2385 void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff,
2386 int stride, TX_TYPE tx_type, int bd) {
2387 (void)tx_type;
2388 __m128i in[512];
2389 __m128i *outcoef128 = (__m128i *)coeff;
2390 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32];
2391 const int txw_idx = get_txw_idx(TX_64X32);
2392 const int txh_idx = get_txh_idx(TX_64X32);
2393 const int txfm_size_col = tx_size_wide[TX_64X32];
2394 const int txfm_size_row = tx_size_high[TX_64X32];
2395 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2396 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2397 const int num_row = txfm_size_row >> 2;
2398 const int num_col = txfm_size_col >> 2;
2399
2400 // column transform
2401 for (int i = 0; i < 32; i++) {
2402 load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0, shift[0]);
2403 load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0,
2404 shift[0]);
2405 load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0,
2406 shift[0]);
2407 load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0,
2408 shift[0]);
2409 }
2410
2411 for (int i = 0; i < num_col; i++) {
2412 av1_fdct32_sse4_1((in + i), (in + i), bitcol, num_col);
2413 }
2414
2415 for (int i = 0; i < num_row; i++) {
2416 col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]);
2417 }
2418 transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
2419
2420 // row transform
2421 for (int i = 0; i < num_row; i++) {
2422 av1_fdct64_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row);
2423 }
2424 transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col >> 1);
2425 av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 512 >> 1,
2426 -shift[2], NewSqrt2);
2427 (void)bd;
2428 }
2429
2430 void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff,
2431 int stride, TX_TYPE tx_type, int bd) {
2432 __m128i in[128];
2433 __m128i *outcoef128 = (__m128i *)coeff;
2434 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
2435 const int txw_idx = get_txw_idx(TX_32X16);
2436 const int txh_idx = get_txh_idx(TX_32X16);
2437 const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm8x32_arr[tx_type];
2438 const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
2439 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2440 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2441
2442 // column transform
2443 load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16);
2444 col_txfm(in, in, bitcol, 8);
2445 col_txfm_16x16_rounding(&in[0], -shift[1]);
2446 col_txfm_16x16_rounding(&in[64], -shift[1]);
2447 transpose_8nx8n(in, outcoef128, 32, 16);
2448
2449 // row transform
2450 for (int i = 0; i < 4; i++) {
2451 row_txfm((outcoef128 + i), (in + i), bitrow, 4);
2452 }
2453 transpose_8nx8n(in, outcoef128, 16, 32);
2454 av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 128, -shift[2],
2455 NewSqrt2);
2456 (void)bd;
2457 }
2458
2459 #if !CONFIG_REALTIME_ONLY
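// 8x32 and 32x8: the 32-point dimension supports only DCT and IDTX (see the
// NULL entries in the dispatch tables above); no rectangular rescale is
// needed for the 4:1 aspect ratio.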
2460 void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff,
2461 int stride, TX_TYPE tx_type, int bd) {
2462 __m128i in[64];
2463 __m128i *outcoef128 = (__m128i *)coeff;
2464 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
2465 const int txw_idx = get_txw_idx(TX_8X32);
2466 const int txh_idx = get_txh_idx(TX_8X32);
2467 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
2468 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type];
2469 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2470 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2471
2472 const int txfm_size_col = tx_size_wide[TX_8X32];
2473 const int txfm_size_row = tx_size_high[TX_8X32];
2474 const int num_col = txfm_size_col >> 2;
2475
2476 // column transform
2477 load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
2478 load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row,
2479 stride, 0, 0, shift[0]);
2480
2481 for (int i = 0; i < num_col; i++) {
2482 col_txfm((in + i), (in + i), bitcol, num_col);
2483 }
2484 col_txfm_16x16_rounding(in, -shift[1]);
2485 transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
2486
2487 // row transform
2488 for (int i = 0; i < txfm_size_col; i += 2) {
2489 row_txfm((outcoef128 + i), (in + i), bitrow, txfm_size_col);
2490 }
2491 transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
2492 (void)bd;
2493 }
2494
2495 void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff,
2496 int stride, TX_TYPE tx_type, int bd) {
2497 __m128i in[64];
2498 __m128i *outcoef128 = (__m128i *)coeff;
2499 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
2500 const int txw_idx = get_txw_idx(TX_32X8);
2501 const int txh_idx = get_txh_idx(TX_32X8);
2502 const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type];
2503 const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
2504 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2505 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2506
2507 const int txfm_size_col = tx_size_wide[TX_32X8];
2508 const int txfm_size_row = tx_size_high[TX_32X8];
2509 const int num_col = txfm_size_row >> 2;
2510
2511 // column transform
2512 load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8);
2513 for (int i = 0; i < txfm_size_row; i += 2) {
2514 col_txfm((in + i), (in + i), bitcol, txfm_size_row);
2515 }
2516
2517 col_txfm_16x16_rounding(&in[0], -shift[1]);
2518 transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
2519
2520 // row transform
2521 for (int i = 0; i < num_col; i++) {
2522 row_txfm((outcoef128 + i), (in + i), bitrow, num_col);
2523 }
2524 transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
2525 (void)bd;
2526 }
2527 #endif
2528
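// 4x8 and 8x4: a 4-point kernel covers the narrow dimension and an 8-point
// kernel the wide one, with the NewSqrt2 rescale for the 2:1 aspect ratio.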
2529 void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
2530 TX_TYPE tx_type, int bd) {
2531 __m128i in[8];
2532 __m128i *outcoeff128 = (__m128i *)coeff;
2533 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
2534 const int txw_idx = get_txw_idx(TX_4X8);
2535 const int txh_idx = get_txh_idx(TX_4X8);
2536 const int txfm_size_col = tx_size_wide[TX_4X8];
2537 const int txfm_size_row = tx_size_high[TX_4X8];
2538 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2539 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2540 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x8_arr[tx_type];
2541 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];
2542
2543 int ud_flip, lr_flip;
2544 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2545
2546 load_buffer_4x8(input, in, stride, ud_flip, lr_flip, shift[0]);
2547 col_txfm(in, in, bitcol, 1);
2548 col_txfm_4x8_rounding(in, -shift[1]);
2549 transpose_8nx8n(in, outcoeff128, txfm_size_col, txfm_size_row);
2550
2551 for (int i = 0; i < 2; i++) {
2552 row_txfm(outcoeff128 + i, in + i * txfm_size_col, bitrow, 2);
2553 }
2554 av1_round_shift_rect_array_32_sse4_1(in, outcoeff128, txfm_size_row,
2555 -shift[2], NewSqrt2);
2556 (void)bd;
2557 }
2558
2559 void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride,
2560 TX_TYPE tx_type, int bd) {
2561 __m128i in[8];
2562 __m128i *outcoeff128 = (__m128i *)coeff;
2563 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
2564 const int txw_idx = get_txw_idx(TX_8X4);
2565 const int txh_idx = get_txh_idx(TX_8X4);
2566 const int txfm_size_col = tx_size_wide[TX_8X4];
2567 const int txfm_size_row = tx_size_high[TX_8X4];
2568 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2569 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2570 const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
2571 const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x8_arr[tx_type];
2572 int ud_flip, lr_flip;
2573 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2574 // col transform
2575 load_buffer_8x4(input, in, stride, ud_flip, lr_flip, shift[0]);
2576 for (int i = 0; i < 2; i++) {
2577 col_txfm(in + i * txfm_size_row, in + i * txfm_size_row, bitcol, 1);
2578 }
2579 col_txfm_4x8_rounding(in, -shift[1]);
2580
2581 // row transform
2582 row_txfm(in, outcoeff128, bitrow, 1);
2583 av1_round_shift_rect_array_32_sse4_1(outcoeff128, in, txfm_size_col,
2584 -shift[2], NewSqrt2);
2585 transpose_8nx8n(in, outcoeff128, txfm_size_row, txfm_size_col);
2586 (void)bd;
2587 }
2588
2589 #if !CONFIG_REALTIME_ONLY
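// 16x64 and 64x16 use av1_fdct64 for the 64-point dimension. For 16x64 the
// final memset clears everything past row 31, so only the lowest 32 of the
// 64 frequencies are kept, as required for 64-point transforms.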
2590 void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff,
2591 int stride, TX_TYPE tx_type, int bd) {
2592 __m128i in[256];
2593 __m128i *outcoeff128 = (__m128i *)coeff;
2594 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
2595 const int txw_idx = get_txw_idx(TX_16X64);
2596 const int txh_idx = get_txh_idx(TX_16X64);
2597 const int txfm_size_col = tx_size_wide[TX_16X64];
2598 const int txfm_size_row = tx_size_high[TX_16X64];
2599 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2600 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2601 int ud_flip, lr_flip;
2602 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2603 const int num_col = txfm_size_col >> 2;
2604 // col transform
2605 for (int i = 0; i < txfm_size_row; i += num_col) {
2606 load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col,
2607 ud_flip, lr_flip, shift[0]);
2608 load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col,
2609 ud_flip, lr_flip, shift[0]);
2610 load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col,
2611 ud_flip, lr_flip, shift[0]);
2612 load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col,
2613 ud_flip, lr_flip, shift[0]);
2614 }
2615
2616 for (int i = 0; i < num_col; i++) {
2617 av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col);
2618 }
2619
2620 col_txfm_16x16_rounding(outcoeff128, -shift[1]);
2621 col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
2622 col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
2623 col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
2624
2625 transpose_8nx8n(outcoeff128, in, txfm_size_col, 32);
2626 fdct16x16_sse4_1(in, in, bitrow, 8);
2627 transpose_8nx8n(in, outcoeff128, 32, txfm_size_col);
2628 memset(coeff + txfm_size_col * 32, 0, txfm_size_col * 32 * sizeof(*coeff));
2629 (void)bd;
2630 }
2631
2632 void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff,
2633 int stride, TX_TYPE tx_type, int bd) {
2634 __m128i in[256];
2635 __m128i *outcoeff128 = (__m128i *)coeff;
2636 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
2637 const int txw_idx = get_txw_idx(TX_64X16);
2638 const int txh_idx = get_txh_idx(TX_64X16);
2639 const int txfm_size_col = tx_size_wide[TX_64X16];
2640 const int txfm_size_row = tx_size_high[TX_64X16];
2641 int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2642 int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2643 int ud_flip, lr_flip;
2644 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2645 // col transform
2646 for (int i = 0; i < txfm_size_row; i++) {
2647 load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4,
2648 ud_flip, lr_flip, shift[0]);
2649 load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4,
2650 ud_flip, lr_flip, shift[0]);
2651 load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4,
2652 ud_flip, lr_flip, shift[0]);
2653 load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4,
2654 ud_flip, lr_flip, shift[0]);
2655 }
2656
2657 fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row);
2658 col_txfm_16x16_rounding(outcoeff128, -shift[1]);
2659 col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
2660 col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
2661 col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
2662
2663 transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
2664 for (int i = 0; i < 4; i++) {
2665 av1_fdct64_sse4_1(in + i, in + i, bitrow, 4, 4);
2666 }
2667 transpose_8nx8n(in, outcoeff128, txfm_size_row, 32);
2668 (void)bd;
2669 }
2670 #endif
2671