/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */

#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "av1/encoder/x86/av1_txfm1d_sse4.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"

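// Forward 4x4 Walsh-Hadamard transform used on the lossless path. The int16
// input is widened to 32 bits, two passes of the WHT butterfly are applied
// (with a 4x4 transpose after each pass), and the result is scaled up by
// UNIT_QUANT_SHIFT before being stored.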
void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in[4];
  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  // Convert to int32_t.
  __m128i op[4];
  op[0] = _mm_cvtepi16_epi32(in[0]);
  op[1] = _mm_cvtepi16_epi32(in[1]);
  op[2] = _mm_cvtepi16_epi32(in[2]);
  op[3] = _mm_cvtepi16_epi32(in[3]);

  for (int i = 0; i < 2; ++i) {
    __m128i a1 = op[0];
    __m128i b1 = op[1];
    __m128i c1 = op[2];
    __m128i d1 = op[3];
    __m128i e1;

    a1 = _mm_add_epi32(a1, b1);  // a1 += b1
    d1 = _mm_sub_epi32(d1, c1);  // d1 = d1 - c1
    e1 = _mm_sub_epi32(a1, d1);  // e1 = (a1 - d1) >> 1
    e1 = _mm_srai_epi32(e1, 1);
    b1 = _mm_sub_epi32(e1, b1);  // b1 = e1 - b1
    c1 = _mm_sub_epi32(e1, c1);  // c1 = e1 - c1
    a1 = _mm_sub_epi32(a1, c1);  // a1 -= c1
    d1 = _mm_add_epi32(d1, b1);  // d1 += b1

    op[0] = a1;
    op[1] = c1;
    op[2] = d1;
    op[3] = b1;

    transpose_32bit_4x4(op, op);
  }

  op[0] = _mm_slli_epi32(op[0], UNIT_QUANT_SHIFT);
  op[1] = _mm_slli_epi32(op[1], UNIT_QUANT_SHIFT);
  op[2] = _mm_slli_epi32(op[2], UNIT_QUANT_SHIFT);
  op[3] = _mm_slli_epi32(op[3], UNIT_QUANT_SHIFT);

  _mm_storeu_si128((__m128i *)(output + 0), op[0]);
  _mm_storeu_si128((__m128i *)(output + 4), op[1]);
  _mm_storeu_si128((__m128i *)(output + 8), op[2]);
  _mm_storeu_si128((__m128i *)(output + 12), op[3]);
}

void av1_highbd_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output,
                               int stride) {
  av1_fwht4x4_sse4_1(input, output, stride);
}

static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                   int stride, int flipud, int fliplr,
                                   int shift) {
  if (!flipud) {
    in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
    in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
    in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
    in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
  } else {
    in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
    in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
    in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
    in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  }

  if (fliplr) {
    in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
    in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
    in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
    in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
  }

  in[0] = _mm_cvtepi16_epi32(in[0]);
  in[1] = _mm_cvtepi16_epi32(in[1]);
  in[2] = _mm_cvtepi16_epi32(in[2]);
  in[3] = _mm_cvtepi16_epi32(in[3]);

  in[0] = _mm_slli_epi32(in[0], shift);
  in[1] = _mm_slli_epi32(in[1], shift);
  in[2] = _mm_slli_epi32(in[2], shift);
  in[3] = _mm_slli_epi32(in[3], shift);
}

// We only use stage-2 bit;
// shift[0] is used in load_buffer_4x4()
// shift[1] is used in txfm_func_col()
// shift[2] is used in txfm_func_row()
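//
// For reference, a scalar sketch of the 4-point DCT computed below (each
// SIMD register carries four such lanes), where
// round_shift(x, bit) = (x + (1 << (bit - 1))) >> bit:
//   s0 = in[0] + in[3];  s3 = in[0] - in[3];
//   s1 = in[1] + in[2];  s2 = in[1] - in[2];
//   out[0] = round_shift((s0 + s1) * cospi[32], bit);
//   out[1] = round_shift(s2 * cospi[48] + s3 * cospi[16], bit);
//   out[2] = round_shift((s0 - s1) * cospi[32], bit);
//   out[3] = round_shift(s3 * cospi[48] - s2 * cospi[16], bit);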
static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit,
                           const int num_col) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  __m128i s0, s1, s2, s3;
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  int endidx = 3 * num_col;
  s0 = _mm_add_epi32(in[0], in[endidx]);
  s3 = _mm_sub_epi32(in[0], in[endidx]);
  endidx -= num_col;
  s1 = _mm_add_epi32(in[num_col], in[endidx]);
  s2 = _mm_sub_epi32(in[num_col], in[endidx]);

  // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
  u0 = _mm_mullo_epi32(s0, cospi32);
  u1 = _mm_mullo_epi32(s1, cospi32);
  u2 = _mm_add_epi32(u0, u1);
  v0 = _mm_sub_epi32(u0, u1);

  u3 = _mm_add_epi32(u2, rnding);
  v1 = _mm_add_epi32(v0, rnding);

  u0 = _mm_srai_epi32(u3, bit);
  u2 = _mm_srai_epi32(v1, bit);

  // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
  v0 = _mm_mullo_epi32(s2, cospi48);
  v1 = _mm_mullo_epi32(s3, cospi16);
  v2 = _mm_add_epi32(v0, v1);

  v3 = _mm_add_epi32(v2, rnding);
  u1 = _mm_srai_epi32(v3, bit);

  v0 = _mm_mullo_epi32(s2, cospi16);
  v1 = _mm_mullo_epi32(s3, cospi48);
  v2 = _mm_sub_epi32(v1, v0);

  v3 = _mm_add_epi32(v2, rnding);
  u3 = _mm_srai_epi32(v3, bit);

  // Note: shift[1] and shift[2] are zeros

  // Transpose 4x4 32-bit
  v0 = _mm_unpacklo_epi32(u0, u1);
  v1 = _mm_unpackhi_epi32(u0, u1);
  v2 = _mm_unpacklo_epi32(u2, u3);
  v3 = _mm_unpackhi_epi32(u2, u3);

  out[0] = _mm_unpacklo_epi64(v0, v2);
  out[1] = _mm_unpackhi_epi64(v0, v2);
  out[2] = _mm_unpacklo_epi64(v1, v3);
  out[3] = _mm_unpackhi_epi64(v1, v3);
}

static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
  _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
  _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
  _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
  _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
}

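// 4-point forward ADST on four 32-bit lanes, built from the sinpi[1..4]
// constants returned by sinpi_arr(bit). Each output is rounded by
// 1 << (bit - 1) before the right shift, and the result is transposed,
// mirroring fdct4x4_sse4_1() above.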
static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit,
                            const int num_col) {
  const int32_t *sinpi = sinpi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
  __m128i t;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i x0, x1, x2, x3;
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  int idx = 0 * num_col;
  s0 = _mm_mullo_epi32(in[idx], sinpi1);
  s1 = _mm_mullo_epi32(in[idx], sinpi4);
  t = _mm_add_epi32(in[idx], in[idx + num_col]);
  idx += num_col;
  s2 = _mm_mullo_epi32(in[idx], sinpi2);
  s3 = _mm_mullo_epi32(in[idx], sinpi1);
  idx += num_col;
  s4 = _mm_mullo_epi32(in[idx], sinpi3);
  idx += num_col;
  s5 = _mm_mullo_epi32(in[idx], sinpi4);
  s6 = _mm_mullo_epi32(in[idx], sinpi2);
  s7 = _mm_sub_epi32(t, in[idx]);

  t = _mm_add_epi32(s0, s2);
  x0 = _mm_add_epi32(t, s5);
  x1 = _mm_mullo_epi32(s7, sinpi3);
  t = _mm_sub_epi32(s1, s3);
  x2 = _mm_add_epi32(t, s6);
  x3 = s4;

  s0 = _mm_add_epi32(x0, x3);
  s1 = x1;
  s2 = _mm_sub_epi32(x2, x3);
  t = _mm_sub_epi32(x2, x0);
  s3 = _mm_add_epi32(t, x3);

  u0 = _mm_add_epi32(s0, rnding);
  u0 = _mm_srai_epi32(u0, bit);

  u1 = _mm_add_epi32(s1, rnding);
  u1 = _mm_srai_epi32(u1, bit);

  u2 = _mm_add_epi32(s2, rnding);
  u2 = _mm_srai_epi32(u2, bit);

  u3 = _mm_add_epi32(s3, rnding);
  u3 = _mm_srai_epi32(u3, bit);

  v0 = _mm_unpacklo_epi32(u0, u1);
  v1 = _mm_unpackhi_epi32(u0, u1);
  v2 = _mm_unpacklo_epi32(u2, u3);
  v3 = _mm_unpackhi_epi32(u2, u3);

  out[0] = _mm_unpacklo_epi64(v0, v2);
  out[1] = _mm_unpackhi_epi64(v0, v2);
  out[2] = _mm_unpacklo_epi64(v1, v3);
  out[3] = _mm_unpackhi_epi64(v1, v3);
}
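
// Identity "transform" for the 4-point dimension: each coefficient is scaled
// by sqrt(2) in NewSqrt2Bits fixed-point precision (the identity scale for
// size 4), and the 4x4 tile is then transposed like the other kernels.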
static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
  (void)bit;
  __m128i fact = _mm_set1_epi32(NewSqrt2);
  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
  __m128i a_low;
  __m128i v[4];

  for (int i = 0; i < 4; i++) {
    a_low = _mm_mullo_epi32(in[i * col_num], fact);
    a_low = _mm_add_epi32(a_low, offset);
    out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
  }

  // Transpose for 4x4
  v[0] = _mm_unpacklo_epi32(out[0], out[1]);
  v[1] = _mm_unpackhi_epi32(out[0], out[1]);
  v[2] = _mm_unpacklo_epi32(out[2], out[3]);
  v[3] = _mm_unpackhi_epi32(out[2], out[3]);

  out[0] = _mm_unpacklo_epi64(v[0], v[2]);
  out[1] = _mm_unpackhi_epi64(v[0], v[2]);
  out[2] = _mm_unpacklo_epi64(v[1], v[3]);
  out[3] = _mm_unpackhi_epi64(v[1], v[3]);
}
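
// 2D forward transform of a 4x4 block: load the residual (with vertical
// and/or horizontal flips for the FLIPADST variants), apply the two 1-D
// kernels selected by tx_type, and store 16 int32 coefficients.
//
// Illustrative call (a sketch only; `src_diff`, `diff_stride` and `bd` stand
// in for the caller's residual buffer, its stride, and the bit depth):
//   DECLARE_ALIGNED(16, int32_t, coeff[4 * 4]);
//   av1_fwd_txfm2d_4x4_sse4_1(src_diff, coeff, diff_stride, DCT_DCT, bd);
// The coefficient buffer must be 16-byte aligned since write_buffer_4x4()
// uses aligned stores.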
void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
                               int input_stride, TX_TYPE tx_type, int bd) {
  __m128i in[4];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
  const int txw_idx = get_txw_idx(TX_4X4);
  const int txh_idx = get_txh_idx(TX_4X4);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case ADST_DCT:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case ADST_ADST:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case FLIPADST_DCT:
      load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case DCT_FLIPADST:
      load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case IDTX:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case V_DCT:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case H_DCT:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case V_ADST:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case H_ADST:
      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case V_FLIPADST:
      load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    case H_FLIPADST:
      load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
      idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
      write_buffer_4x4(in, coeff);
      break;
    default: assert(0);
  }
  (void)bd;
}

static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                   int stride, int flipud, int fliplr,
                                   int shift) {
  __m128i u;
  if (!flipud) {
    in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
    in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
    in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
    in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
    in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
    in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
    in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
    in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
  } else {
    in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
    in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
    in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
    in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
    in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
    in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
    in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
    in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
  }

  if (fliplr) {
    in[0] = mm_reverse_epi16(in[0]);
    in[1] = mm_reverse_epi16(in[1]);
    in[2] = mm_reverse_epi16(in[2]);
    in[3] = mm_reverse_epi16(in[3]);
    in[4] = mm_reverse_epi16(in[4]);
    in[5] = mm_reverse_epi16(in[5]);
    in[6] = mm_reverse_epi16(in[6]);
    in[7] = mm_reverse_epi16(in[7]);
  }

  u = _mm_unpackhi_epi64(in[4], in[4]);
  in[8] = _mm_cvtepi16_epi32(in[4]);
  in[9] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[5], in[5]);
  in[10] = _mm_cvtepi16_epi32(in[5]);
  in[11] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[6], in[6]);
  in[12] = _mm_cvtepi16_epi32(in[6]);
  in[13] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[7], in[7]);
  in[14] = _mm_cvtepi16_epi32(in[7]);
  in[15] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[3], in[3]);
  in[6] = _mm_cvtepi16_epi32(in[3]);
  in[7] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[2], in[2]);
  in[4] = _mm_cvtepi16_epi32(in[2]);
  in[5] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[1], in[1]);
  in[2] = _mm_cvtepi16_epi32(in[1]);
  in[3] = _mm_cvtepi16_epi32(u);

  u = _mm_unpackhi_epi64(in[0], in[0]);
  in[0] = _mm_cvtepi16_epi32(in[0]);
  in[1] = _mm_cvtepi16_epi32(u);

  in[0] = _mm_slli_epi32(in[0], shift);
  in[1] = _mm_slli_epi32(in[1], shift);
  in[2] = _mm_slli_epi32(in[2], shift);
  in[3] = _mm_slli_epi32(in[3], shift);
  in[4] = _mm_slli_epi32(in[4], shift);
  in[5] = _mm_slli_epi32(in[5], shift);
  in[6] = _mm_slli_epi32(in[6], shift);
  in[7] = _mm_slli_epi32(in[7], shift);

  in[8] = _mm_slli_epi32(in[8], shift);
  in[9] = _mm_slli_epi32(in[9], shift);
  in[10] = _mm_slli_epi32(in[10], shift);
  in[11] = _mm_slli_epi32(in[11], shift);
  in[12] = _mm_slli_epi32(in[12], shift);
  in[13] = _mm_slli_epi32(in[13], shift);
  in[14] = _mm_slli_epi32(in[14], shift);
  in[15] = _mm_slli_epi32(in[15], shift);
}

static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
  const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));

  in[0] = _mm_add_epi32(in[0], rounding);
  in[1] = _mm_add_epi32(in[1], rounding);
  in[2] = _mm_add_epi32(in[2], rounding);
  in[3] = _mm_add_epi32(in[3], rounding);
  in[4] = _mm_add_epi32(in[4], rounding);
  in[5] = _mm_add_epi32(in[5], rounding);
  in[6] = _mm_add_epi32(in[6], rounding);
  in[7] = _mm_add_epi32(in[7], rounding);
  in[8] = _mm_add_epi32(in[8], rounding);
  in[9] = _mm_add_epi32(in[9], rounding);
  in[10] = _mm_add_epi32(in[10], rounding);
  in[11] = _mm_add_epi32(in[11], rounding);
  in[12] = _mm_add_epi32(in[12], rounding);
  in[13] = _mm_add_epi32(in[13], rounding);
  in[14] = _mm_add_epi32(in[14], rounding);
  in[15] = _mm_add_epi32(in[15], rounding);

  in[0] = _mm_srai_epi32(in[0], shift);
  in[1] = _mm_srai_epi32(in[1], shift);
  in[2] = _mm_srai_epi32(in[2], shift);
  in[3] = _mm_srai_epi32(in[3], shift);
  in[4] = _mm_srai_epi32(in[4], shift);
  in[5] = _mm_srai_epi32(in[5], shift);
  in[6] = _mm_srai_epi32(in[6], shift);
  in[7] = _mm_srai_epi32(in[7], shift);
  in[8] = _mm_srai_epi32(in[8], shift);
  in[9] = _mm_srai_epi32(in[9], shift);
  in[10] = _mm_srai_epi32(in[10], shift);
  in[11] = _mm_srai_epi32(in[11], shift);
  in[12] = _mm_srai_epi32(in[12], shift);
  in[13] = _mm_srai_epi32(in[13], shift);
  in[14] = _mm_srai_epi32(in[14], shift);
  in[15] = _mm_srai_epi32(in[15], shift);
}

static INLINE void col_txfm_4x8_rounding(__m128i *in, int shift) {
  const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));

  in[0] = _mm_add_epi32(in[0], rounding);
  in[1] = _mm_add_epi32(in[1], rounding);
  in[2] = _mm_add_epi32(in[2], rounding);
  in[3] = _mm_add_epi32(in[3], rounding);
  in[4] = _mm_add_epi32(in[4], rounding);
  in[5] = _mm_add_epi32(in[5], rounding);
  in[6] = _mm_add_epi32(in[6], rounding);
  in[7] = _mm_add_epi32(in[7], rounding);

  in[0] = _mm_srai_epi32(in[0], shift);
  in[1] = _mm_srai_epi32(in[1], shift);
  in[2] = _mm_srai_epi32(in[2], shift);
  in[3] = _mm_srai_epi32(in[3], shift);
  in[4] = _mm_srai_epi32(in[4], shift);
  in[5] = _mm_srai_epi32(in[5], shift);
  in[6] = _mm_srai_epi32(in[6], shift);
  in[7] = _mm_srai_epi32(in[7], shift);
}

static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
  _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
  _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
  _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
  _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);

  _mm_store_si128((__m128i *)(output + 4 * 4), res[4]);
  _mm_store_si128((__m128i *)(output + 5 * 4), res[5]);
  _mm_store_si128((__m128i *)(output + 6 * 4), res[6]);
  _mm_store_si128((__m128i *)(output + 7 * 4), res[7]);

  _mm_store_si128((__m128i *)(output + 8 * 4), res[8]);
  _mm_store_si128((__m128i *)(output + 9 * 4), res[9]);
  _mm_store_si128((__m128i *)(output + 10 * 4), res[10]);
  _mm_store_si128((__m128i *)(output + 11 * 4), res[11]);

  _mm_store_si128((__m128i *)(output + 12 * 4), res[12]);
  _mm_store_si128((__m128i *)(output + 13 * 4), res[13]);
  _mm_store_si128((__m128i *)(output + 14 * 4), res[14]);
  _mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
}

static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output,
                                     const int stride) {
  _mm_storeu_si128((__m128i *)(output), res[0]);
  _mm_storeu_si128((__m128i *)(output + 4), res[1]);
  _mm_storeu_si128((__m128i *)(output + stride), res[2]);
  _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]);

  _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]);
  _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]);
  _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]);
  _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]);

  _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]);
  _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]);
  _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]);
  _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]);

  _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]);
  _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]);
  _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]);
  _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]);
}

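// 8-point forward DCT on one group of four 32-bit lanes. Inputs and outputs
// are strided by col_num registers, so the same kernel can be applied to the
// low and high halves of an 8x8 tile (see fdct8x8_sse4_1() below) or to
// rectangular sizes. The results are written to their final frequency
// positions rather than the butterfly's natural order, as the buf0[]
// comments indicate.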
static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit,
                           const int col_num) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  __m128i u[8], v[8];

  int startidx = 0 * col_num;
  int endidx = 7 * col_num;
  // Even 8 points 0, 2, ..., 14
  // stage 0
  // stage 1
  u[0] = _mm_add_epi32(in[startidx], in[endidx]);
  v[7] = _mm_sub_epi32(in[startidx], in[endidx]);  // v[7]
  startidx += col_num;
  endidx -= col_num;
  u[1] = _mm_add_epi32(in[startidx], in[endidx]);
  u[6] = _mm_sub_epi32(in[startidx], in[endidx]);
  startidx += col_num;
  endidx -= col_num;
  u[2] = _mm_add_epi32(in[startidx], in[endidx]);
  u[5] = _mm_sub_epi32(in[startidx], in[endidx]);
  startidx += col_num;
  endidx -= col_num;
  u[3] = _mm_add_epi32(in[startidx], in[endidx]);
  v[4] = _mm_sub_epi32(in[startidx], in[endidx]);  // v[4]

  // stage 2
  v[0] = _mm_add_epi32(u[0], u[3]);
  v[3] = _mm_sub_epi32(u[0], u[3]);
  v[1] = _mm_add_epi32(u[1], u[2]);
  v[2] = _mm_sub_epi32(u[1], u[2]);

  v[5] = _mm_mullo_epi32(u[5], cospim32);
  v[6] = _mm_mullo_epi32(u[6], cospi32);
  v[5] = _mm_add_epi32(v[5], v[6]);
  v[5] = _mm_add_epi32(v[5], rnding);
  v[5] = _mm_srai_epi32(v[5], bit);

  u[0] = _mm_mullo_epi32(u[5], cospi32);
  v[6] = _mm_mullo_epi32(u[6], cospim32);
  v[6] = _mm_sub_epi32(u[0], v[6]);
  v[6] = _mm_add_epi32(v[6], rnding);
  v[6] = _mm_srai_epi32(v[6], bit);

  // stage 3
  // type 0
  v[0] = _mm_mullo_epi32(v[0], cospi32);
  v[1] = _mm_mullo_epi32(v[1], cospi32);
  u[0] = _mm_add_epi32(v[0], v[1]);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_sub_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // type 1
  v[0] = _mm_mullo_epi32(v[2], cospi48);
  v[1] = _mm_mullo_epi32(v[3], cospi16);
  u[2] = _mm_add_epi32(v[0], v[1]);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  v[0] = _mm_mullo_epi32(v[2], cospi16);
  v[1] = _mm_mullo_epi32(v[3], cospi48);
  u[3] = _mm_sub_epi32(v[1], v[0]);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  u[4] = _mm_add_epi32(v[4], v[5]);
  u[5] = _mm_sub_epi32(v[4], v[5]);
  u[6] = _mm_sub_epi32(v[7], v[6]);
  u[7] = _mm_add_epi32(v[7], v[6]);

  // stage 4
  // stage 5
  v[0] = _mm_mullo_epi32(u[4], cospi56);
  v[1] = _mm_mullo_epi32(u[7], cospi8);
  v[0] = _mm_add_epi32(v[0], v[1]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[1 * col_num] = _mm_srai_epi32(v[0], bit);  // buf0[4]

  v[0] = _mm_mullo_epi32(u[4], cospi8);
  v[1] = _mm_mullo_epi32(u[7], cospi56);
  v[0] = _mm_sub_epi32(v[1], v[0]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[7 * col_num] = _mm_srai_epi32(v[0], bit);  // buf0[7]

  v[0] = _mm_mullo_epi32(u[5], cospi24);
  v[1] = _mm_mullo_epi32(u[6], cospi40);
  v[0] = _mm_add_epi32(v[0], v[1]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[5 * col_num] = _mm_srai_epi32(v[0], bit);  // buf0[5]

  v[0] = _mm_mullo_epi32(u[5], cospi40);
  v[1] = _mm_mullo_epi32(u[6], cospi24);
  v[0] = _mm_sub_epi32(v[1], v[0]);
  v[0] = _mm_add_epi32(v[0], rnding);
  out[3 * col_num] = _mm_srai_epi32(v[0], bit);  // buf0[6]

  out[0 * col_num] = u[0];  // buf0[0]
  out[4 * col_num] = u[1];  // buf0[1]
  out[2 * col_num] = u[2];  // buf0[2]
  out[6 * col_num] = u[3];  // buf0[3]
}

static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit,
                           const int col_num) {
  fdct4x8_sse4_1(in, out, bit, col_num);
  fdct4x8_sse4_1(in + 1, out + 1, bit, col_num);
}

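// 8-point forward ADST. Each loop iteration handles one group of four
// 32-bit lanes (col_num groups per call); the cospi constants come from
// cospi_arr(bit), and every butterfly output is rounded by 1 << (bit - 1)
// before the arithmetic right shift by `bit`.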
static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
                            const int col_num) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i zero = _mm_setzero_si128();
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x, y;
  int col;

  // Note:
  //  Even column: 0, 2, ..., 14
  //  Odd column: 1, 3, ..., 15
  //  One even column plus one odd column constructs one row (8 coeffs);
  //  in total we have 8 rows (8x8).
  for (col = 0; col < col_num; ++col) {
    // stage 0
    // stage 1
    u0 = in[col_num * 0 + col];
    u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]);
    u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]);
    u3 = in[col_num * 4 + col];
    u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]);
    u5 = in[col_num * 6 + col];
    u6 = in[col_num * 2 + col];
    u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]);

    // stage 2
    v0 = u0;
    v1 = u1;

    x = _mm_mullo_epi32(u2, cospi32);
    y = _mm_mullo_epi32(u3, cospi32);
    v2 = _mm_add_epi32(x, y);
    v2 = _mm_add_epi32(v2, rnding);
    v2 = _mm_srai_epi32(v2, bit);

    v3 = _mm_sub_epi32(x, y);
    v3 = _mm_add_epi32(v3, rnding);
    v3 = _mm_srai_epi32(v3, bit);

    v4 = u4;
    v5 = u5;

    x = _mm_mullo_epi32(u6, cospi32);
    y = _mm_mullo_epi32(u7, cospi32);
    v6 = _mm_add_epi32(x, y);
    v6 = _mm_add_epi32(v6, rnding);
    v6 = _mm_srai_epi32(v6, bit);

    v7 = _mm_sub_epi32(x, y);
    v7 = _mm_add_epi32(v7, rnding);
    v7 = _mm_srai_epi32(v7, bit);

    // stage 3
    u0 = _mm_add_epi32(v0, v2);
    u1 = _mm_add_epi32(v1, v3);
    u2 = _mm_sub_epi32(v0, v2);
    u3 = _mm_sub_epi32(v1, v3);
    u4 = _mm_add_epi32(v4, v6);
    u5 = _mm_add_epi32(v5, v7);
    u6 = _mm_sub_epi32(v4, v6);
    u7 = _mm_sub_epi32(v5, v7);

    // stage 4
    v0 = u0;
    v1 = u1;
    v2 = u2;
    v3 = u3;

    x = _mm_mullo_epi32(u4, cospi16);
    y = _mm_mullo_epi32(u5, cospi48);
    v4 = _mm_add_epi32(x, y);
    v4 = _mm_add_epi32(v4, rnding);
    v4 = _mm_srai_epi32(v4, bit);

    x = _mm_mullo_epi32(u4, cospi48);
    y = _mm_mullo_epi32(u5, cospim16);
    v5 = _mm_add_epi32(x, y);
    v5 = _mm_add_epi32(v5, rnding);
    v5 = _mm_srai_epi32(v5, bit);

    x = _mm_mullo_epi32(u6, cospim48);
    y = _mm_mullo_epi32(u7, cospi16);
    v6 = _mm_add_epi32(x, y);
    v6 = _mm_add_epi32(v6, rnding);
    v6 = _mm_srai_epi32(v6, bit);

    x = _mm_mullo_epi32(u6, cospi16);
    y = _mm_mullo_epi32(u7, cospi48);
    v7 = _mm_add_epi32(x, y);
    v7 = _mm_add_epi32(v7, rnding);
    v7 = _mm_srai_epi32(v7, bit);

    // stage 5
    u0 = _mm_add_epi32(v0, v4);
    u1 = _mm_add_epi32(v1, v5);
    u2 = _mm_add_epi32(v2, v6);
    u3 = _mm_add_epi32(v3, v7);
    u4 = _mm_sub_epi32(v0, v4);
    u5 = _mm_sub_epi32(v1, v5);
    u6 = _mm_sub_epi32(v2, v6);
    u7 = _mm_sub_epi32(v3, v7);

    // stage 6
    x = _mm_mullo_epi32(u0, cospi4);
    y = _mm_mullo_epi32(u1, cospi60);
    v0 = _mm_add_epi32(x, y);
    v0 = _mm_add_epi32(v0, rnding);
    v0 = _mm_srai_epi32(v0, bit);

    x = _mm_mullo_epi32(u0, cospi60);
    y = _mm_mullo_epi32(u1, cospim4);
    v1 = _mm_add_epi32(x, y);
    v1 = _mm_add_epi32(v1, rnding);
    v1 = _mm_srai_epi32(v1, bit);

    x = _mm_mullo_epi32(u2, cospi20);
    y = _mm_mullo_epi32(u3, cospi44);
    v2 = _mm_add_epi32(x, y);
    v2 = _mm_add_epi32(v2, rnding);
    v2 = _mm_srai_epi32(v2, bit);

    x = _mm_mullo_epi32(u2, cospi44);
    y = _mm_mullo_epi32(u3, cospim20);
    v3 = _mm_add_epi32(x, y);
    v3 = _mm_add_epi32(v3, rnding);
    v3 = _mm_srai_epi32(v3, bit);

    x = _mm_mullo_epi32(u4, cospi36);
    y = _mm_mullo_epi32(u5, cospi28);
    v4 = _mm_add_epi32(x, y);
    v4 = _mm_add_epi32(v4, rnding);
    v4 = _mm_srai_epi32(v4, bit);

    x = _mm_mullo_epi32(u4, cospi28);
    y = _mm_mullo_epi32(u5, cospim36);
    v5 = _mm_add_epi32(x, y);
    v5 = _mm_add_epi32(v5, rnding);
    v5 = _mm_srai_epi32(v5, bit);

    x = _mm_mullo_epi32(u6, cospi52);
    y = _mm_mullo_epi32(u7, cospi12);
    v6 = _mm_add_epi32(x, y);
    v6 = _mm_add_epi32(v6, rnding);
    v6 = _mm_srai_epi32(v6, bit);

    x = _mm_mullo_epi32(u6, cospi12);
    y = _mm_mullo_epi32(u7, cospim52);
    v7 = _mm_add_epi32(x, y);
    v7 = _mm_add_epi32(v7, rnding);
    v7 = _mm_srai_epi32(v7, bit);

    // stage 7
    out[col_num * 0 + col] = v1;
    out[col_num * 1 + col] = v6;
    out[col_num * 2 + col] = v3;
    out[col_num * 3 + col] = v4;
    out[col_num * 4 + col] = v5;
    out[col_num * 5 + col] = v2;
    out[col_num * 6 + col] = v7;
    out[col_num * 7 + col] = v0;
  }
}
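
// Identity transform for the 8-point dimension: the identity scale for size 8
// is 2, so each coefficient is simply added to itself.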
static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
  (void)bit;

  for (int i = 0; i < col_num; i += 1) {
    out[0 + 8 * i] = _mm_add_epi32(in[0 + 8 * i], in[0 + 8 * i]);
    out[1 + 8 * i] = _mm_add_epi32(in[1 + 8 * i], in[1 + 8 * i]);
    out[2 + 8 * i] = _mm_add_epi32(in[2 + 8 * i], in[2 + 8 * i]);
    out[3 + 8 * i] = _mm_add_epi32(in[3 + 8 * i], in[3 + 8 * i]);
    out[4 + 8 * i] = _mm_add_epi32(in[4 + 8 * i], in[4 + 8 * i]);
    out[5 + 8 * i] = _mm_add_epi32(in[5 + 8 * i], in[5 + 8 * i]);
    out[6 + 8 * i] = _mm_add_epi32(in[6 + 8 * i], in[6 + 8 * i]);
    out[7 + 8 * i] = _mm_add_epi32(in[7 + 8 * i], in[7 + 8 * i]);
  }
}
#if !CONFIG_REALTIME_ONLY
static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
  (void)bit;
  (void)col_num;
  for (int j = 0; j < 2; j++) {
    out[j + 8 * 0] = _mm_add_epi32(in[j + 8 * 0], in[j + 8 * 0]);
    out[j + 8 * 1] = _mm_add_epi32(in[j + 8 * 1], in[j + 8 * 1]);
    out[j + 8 * 2] = _mm_add_epi32(in[j + 8 * 2], in[j + 8 * 2]);
    out[j + 8 * 3] = _mm_add_epi32(in[j + 8 * 3], in[j + 8 * 3]);
    out[j + 8 * 4] = _mm_add_epi32(in[j + 8 * 4], in[j + 8 * 4]);
    out[j + 8 * 5] = _mm_add_epi32(in[j + 8 * 5], in[j + 8 * 5]);
    out[j + 8 * 6] = _mm_add_epi32(in[j + 8 * 6], in[j + 8 * 6]);
    out[j + 8 * 7] = _mm_add_epi32(in[j + 8 * 7], in[j + 8 * 7]);
  }
}
#endif
void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  __m128i in[16], out[16];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
  const int txw_idx = get_txw_idx(TX_8X8);
  const int txh_idx = get_txh_idx(TX_8X8);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case ADST_DCT:
      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case DCT_ADST:
      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case ADST_ADST:
      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case FLIPADST_DCT:
      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case DCT_FLIPADST:
      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case ADST_FLIPADST:
      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case FLIPADST_ADST:
      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case IDTX:
      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case V_DCT:
      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case H_DCT:
      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case V_ADST:
      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case H_ADST:
      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case V_FLIPADST:
      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    case H_FLIPADST:
      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
      idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      col_txfm_8x8_rounding(out, -shift[1]);
      transpose_8x8(out, in);
      fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
      transpose_8x8(out, in);
      write_buffer_8x8(in, coeff);
      break;
    default: assert(0);
  }
  (void)bd;
}

// Hybrid Transform 16x16

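// Reassemble four 8x8 tiles (each loaded as 16 registers of four 32-bit
// lanes) into the row-major register layout of a 16x16 block: rows 0..7 are
// interleaved from the top-left and top-right tiles, rows 8..15 from the
// bottom-left and bottom-right tiles.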
static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
  int row_index = 0;
  int dst_index = 0;
  int src_index = 0;

  // row 0, 1, .., 7
  do {
    out[dst_index] = in[src_index];
    out[dst_index + 1] = in[src_index + 1];
    out[dst_index + 2] = in[src_index + 16];
    out[dst_index + 3] = in[src_index + 17];
    dst_index += 4;
    src_index += 2;
    row_index += 1;
  } while (row_index < 8);

  // row 8, 9, ..., 15
  src_index += 16;
  do {
    out[dst_index] = in[src_index];
    out[dst_index + 1] = in[src_index + 1];
    out[dst_index + 2] = in[src_index + 16];
    out[dst_index + 3] = in[src_index + 17];
    dst_index += 4;
    src_index += 2;
    row_index += 1;
  } while (row_index < 16);
}

static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
                                     int stride, int flipud, int fliplr,
                                     int shift) {
  __m128i in[64];
  // Load 4 8x8 blocks
  const int16_t *topL = input;
  const int16_t *topR = input + 8;
  const int16_t *botL = input + 8 * stride;
  const int16_t *botR = input + 8 * stride + 8;

  const int16_t *tmp;

  if (flipud) {
    // Swap left columns
    tmp = topL;
    topL = botL;
    botL = tmp;
    // Swap right columns
    tmp = topR;
    topR = botR;
    botR = tmp;
  }

  if (fliplr) {
    // Swap top rows
    tmp = topL;
    topL = topR;
    topR = tmp;
    // Swap bottom rows
    tmp = botL;
    botL = botR;
    botR = tmp;
  }

  // load first 8 columns
  load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
  load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);

  // load second 8 columns
  load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
  load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);

  convert_8x8_to_16x16(in, out);
}

static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out,
                                    int stride, int flipud, int fliplr,
                                    int shift) {
  const int16_t *topL = input;
  const int16_t *botL = input + 8 * stride;

  const int16_t *tmp;

  if (flipud) {
    tmp = topL;
    topL = botL;
    botL = tmp;
  }

  load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
  load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift);
}

static INLINE void load_buffer_8x4(const int16_t *input, __m128i *out,
                                   int stride, int flipud, int fliplr,
                                   int shift) {
  const int16_t *topL = input;
  const int16_t *topR = input + 4;

  const int16_t *tmp;

  if (fliplr) {
    tmp = topL;
    topL = topR;
    topR = tmp;
  }

  load_buffer_4x4(topL, out, stride, flipud, fliplr, shift);
  load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, shift);
}

static INLINE void load_buffer_16x4(const int16_t *input, __m128i *out,
                                    int stride, int flipud, int fliplr,
                                    int shift) {
  const int16_t *topL = input;
  const int16_t *topR = input + 8;

  const int16_t *tmp;

  if (fliplr) {
    tmp = topL;
    topL = topR;
    topR = tmp;
  }

  load_buffer_8x4(topL, out, stride, flipud, fliplr, shift);
  load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, shift);
}

static INLINE void load_buffer_4x8(const int16_t *input, __m128i *out,
                                   int stride, int flipud, int fliplr,
                                   int shift) {
  const int16_t *topL = input;
  const int16_t *botL = input + 4 * stride;

  const int16_t *tmp;

  if (flipud) {
    tmp = topL;
    topL = botL;
    botL = tmp;
  }

  load_buffer_4x4(topL, out, stride, flipud, fliplr, shift);
  load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, shift);
}

#if !CONFIG_REALTIME_ONLY
static INLINE void load_buffer_4x16(const int16_t *input, __m128i *out,
                                    const int stride, const int flipud,
                                    const int fliplr, const int shift) {
  const int16_t *topL = input;
  const int16_t *botL = input + 8 * stride;

  const int16_t *tmp;

  if (flipud) {
    tmp = topL;
    topL = botL;
    botL = tmp;
  }
  load_buffer_4x8(topL, out, stride, flipud, fliplr, shift);
  load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, shift);
}
#endif

static INLINE void load_buffer_32x8n(const int16_t *input, __m128i *out,
                                     int stride, int flipud, int fliplr,
                                     int shift, const int height) {
  const int16_t *in = input;
  __m128i *output = out;
  for (int col = 0; col < height; col++) {
    in = input + col * stride;
    output = out + col * 8;
    load_buffer_4x4(in, output, 4, flipud, fliplr, shift);
    load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, shift);
  }
}

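// 16-point forward DCT over col_num groups of four 32-bit lanes. The code
// follows the usual multi-stage butterfly structure, rounding each rotated
// pair by 1 << (bit - 1) before the right shift by `bit`.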
static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit,
                             const int col_num) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  __m128i u[16], v[16], x;
  int col;

  // Calculate the column 0, 1, 2, 3
  for (col = 0; col < col_num; ++col) {
    // stage 0
    // stage 1
    u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
    u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
    u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
    u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
    u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
    u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
    u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
    u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
    u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
    u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
    u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
    u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
    u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
    u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
    u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
    u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);

    // stage 2
    v[0] = _mm_add_epi32(u[0], u[7]);
    v[7] = _mm_sub_epi32(u[0], u[7]);
    v[1] = _mm_add_epi32(u[1], u[6]);
    v[6] = _mm_sub_epi32(u[1], u[6]);
    v[2] = _mm_add_epi32(u[2], u[5]);
    v[5] = _mm_sub_epi32(u[2], u[5]);
    v[3] = _mm_add_epi32(u[3], u[4]);
    v[4] = _mm_sub_epi32(u[3], u[4]);
    v[8] = u[8];
    v[9] = u[9];

    v[10] = _mm_mullo_epi32(u[10], cospim32);
    x = _mm_mullo_epi32(u[13], cospi32);
    v[10] = _mm_add_epi32(v[10], x);
    v[10] = _mm_add_epi32(v[10], rnding);
    v[10] = _mm_srai_epi32(v[10], bit);

    v[13] = _mm_mullo_epi32(u[10], cospi32);
    x = _mm_mullo_epi32(u[13], cospim32);
    v[13] = _mm_sub_epi32(v[13], x);
    v[13] = _mm_add_epi32(v[13], rnding);
    v[13] = _mm_srai_epi32(v[13], bit);

    v[11] = _mm_mullo_epi32(u[11], cospim32);
    x = _mm_mullo_epi32(u[12], cospi32);
    v[11] = _mm_add_epi32(v[11], x);
    v[11] = _mm_add_epi32(v[11], rnding);
    v[11] = _mm_srai_epi32(v[11], bit);

    v[12] = _mm_mullo_epi32(u[11], cospi32);
    x = _mm_mullo_epi32(u[12], cospim32);
    v[12] = _mm_sub_epi32(v[12], x);
    v[12] = _mm_add_epi32(v[12], rnding);
    v[12] = _mm_srai_epi32(v[12], bit);
    v[14] = u[14];
    v[15] = u[15];

    // stage 3
    u[0] = _mm_add_epi32(v[0], v[3]);
    u[3] = _mm_sub_epi32(v[0], v[3]);
    u[1] = _mm_add_epi32(v[1], v[2]);
    u[2] = _mm_sub_epi32(v[1], v[2]);
    u[4] = v[4];

    u[5] = _mm_mullo_epi32(v[5], cospim32);
    x = _mm_mullo_epi32(v[6], cospi32);
    u[5] = _mm_add_epi32(u[5], x);
    u[5] = _mm_add_epi32(u[5], rnding);
    u[5] = _mm_srai_epi32(u[5], bit);

    u[6] = _mm_mullo_epi32(v[5], cospi32);
    x = _mm_mullo_epi32(v[6], cospim32);
    u[6] = _mm_sub_epi32(u[6], x);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    u[7] = v[7];
    u[8] = _mm_add_epi32(v[8], v[11]);
    u[11] = _mm_sub_epi32(v[8], v[11]);
    u[9] = _mm_add_epi32(v[9], v[10]);
    u[10] = _mm_sub_epi32(v[9], v[10]);
    u[12] = _mm_sub_epi32(v[15], v[12]);
    u[15] = _mm_add_epi32(v[15], v[12]);
    u[13] = _mm_sub_epi32(v[14], v[13]);
    u[14] = _mm_add_epi32(v[14], v[13]);

    // stage 4
    u[0] = _mm_mullo_epi32(u[0], cospi32);
    u[1] = _mm_mullo_epi32(u[1], cospi32);
    v[0] = _mm_add_epi32(u[0], u[1]);
    v[0] = _mm_add_epi32(v[0], rnding);
    v[0] = _mm_srai_epi32(v[0], bit);

    v[1] = _mm_sub_epi32(u[0], u[1]);
    v[1] = _mm_add_epi32(v[1], rnding);
    v[1] = _mm_srai_epi32(v[1], bit);

    v[2] = _mm_mullo_epi32(u[2], cospi48);
    x = _mm_mullo_epi32(u[3], cospi16);
    v[2] = _mm_add_epi32(v[2], x);
    v[2] = _mm_add_epi32(v[2], rnding);
    v[2] = _mm_srai_epi32(v[2], bit);

    v[3] = _mm_mullo_epi32(u[2], cospi16);
    x = _mm_mullo_epi32(u[3], cospi48);
    v[3] = _mm_sub_epi32(x, v[3]);
    v[3] = _mm_add_epi32(v[3], rnding);
    v[3] = _mm_srai_epi32(v[3], bit);

    v[4] = _mm_add_epi32(u[4], u[5]);
    v[5] = _mm_sub_epi32(u[4], u[5]);
    v[6] = _mm_sub_epi32(u[7], u[6]);
    v[7] = _mm_add_epi32(u[7], u[6]);
    v[8] = u[8];

    v[9] = _mm_mullo_epi32(u[9], cospim16);
    x = _mm_mullo_epi32(u[14], cospi48);
    v[9] = _mm_add_epi32(v[9], x);
    v[9] = _mm_add_epi32(v[9], rnding);
    v[9] = _mm_srai_epi32(v[9], bit);

    v[14] = _mm_mullo_epi32(u[9], cospi48);
    x = _mm_mullo_epi32(u[14], cospim16);
    v[14] = _mm_sub_epi32(v[14], x);
    v[14] = _mm_add_epi32(v[14], rnding);
    v[14] = _mm_srai_epi32(v[14], bit);

    v[10] = _mm_mullo_epi32(u[10], cospim48);
    x = _mm_mullo_epi32(u[13], cospim16);
    v[10] = _mm_add_epi32(v[10], x);
    v[10] = _mm_add_epi32(v[10], rnding);
    v[10] = _mm_srai_epi32(v[10], bit);

    v[13] = _mm_mullo_epi32(u[10], cospim16);
    x = _mm_mullo_epi32(u[13], cospim48);
    v[13] = _mm_sub_epi32(v[13], x);
    v[13] = _mm_add_epi32(v[13], rnding);
    v[13] = _mm_srai_epi32(v[13], bit);

    v[11] = u[11];
    v[12] = u[12];
    v[15] = u[15];

    // stage 5
    u[0] = v[0];
    u[1] = v[1];
    u[2] = v[2];
    u[3] = v[3];

    u[4] = _mm_mullo_epi32(v[4], cospi56);
    x = _mm_mullo_epi32(v[7], cospi8);
    u[4] = _mm_add_epi32(u[4], x);
    u[4] = _mm_add_epi32(u[4], rnding);
    u[4] = _mm_srai_epi32(u[4], bit);

    u[7] = _mm_mullo_epi32(v[4], cospi8);
    x = _mm_mullo_epi32(v[7], cospi56);
    u[7] = _mm_sub_epi32(x, u[7]);
    u[7] = _mm_add_epi32(u[7], rnding);
    u[7] = _mm_srai_epi32(u[7], bit);

    u[5] = _mm_mullo_epi32(v[5], cospi24);
    x = _mm_mullo_epi32(v[6], cospi40);
    u[5] = _mm_add_epi32(u[5], x);
    u[5] = _mm_add_epi32(u[5], rnding);
    u[5] = _mm_srai_epi32(u[5], bit);

    u[6] = _mm_mullo_epi32(v[5], cospi40);
    x = _mm_mullo_epi32(v[6], cospi24);
    u[6] = _mm_sub_epi32(x, u[6]);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    u[8] = _mm_add_epi32(v[8], v[9]);
    u[9] = _mm_sub_epi32(v[8], v[9]);
    u[10] = _mm_sub_epi32(v[11], v[10]);
    u[11] = _mm_add_epi32(v[11], v[10]);
    u[12] = _mm_add_epi32(v[12], v[13]);
    u[13] = _mm_sub_epi32(v[12], v[13]);
    u[14] = _mm_sub_epi32(v[15], v[14]);
    u[15] = _mm_add_epi32(v[15], v[14]);

    // stage 6
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];
    v[4] = u[4];
    v[5] = u[5];
    v[6] = u[6];
    v[7] = u[7];

    v[8] = _mm_mullo_epi32(u[8], cospi60);
    x = _mm_mullo_epi32(u[15], cospi4);
    v[8] = _mm_add_epi32(v[8], x);
    v[8] = _mm_add_epi32(v[8], rnding);
    v[8] = _mm_srai_epi32(v[8], bit);

    v[15] = _mm_mullo_epi32(u[8], cospi4);
    x = _mm_mullo_epi32(u[15], cospi60);
    v[15] = _mm_sub_epi32(x, v[15]);
    v[15] = _mm_add_epi32(v[15], rnding);
    v[15] = _mm_srai_epi32(v[15], bit);

    v[9] = _mm_mullo_epi32(u[9], cospi28);
    x = _mm_mullo_epi32(u[14], cospi36);
    v[9] = _mm_add_epi32(v[9], x);
    v[9] = _mm_add_epi32(v[9], rnding);
    v[9] = _mm_srai_epi32(v[9], bit);

    v[14] = _mm_mullo_epi32(u[9], cospi36);
    x = _mm_mullo_epi32(u[14], cospi28);
    v[14] = _mm_sub_epi32(x, v[14]);
    v[14] = _mm_add_epi32(v[14], rnding);
1478     v[14] = _mm_srai_epi32(v[14], bit);
1479 
1480     v[10] = _mm_mullo_epi32(u[10], cospi44);
1481     x = _mm_mullo_epi32(u[13], cospi20);
1482     v[10] = _mm_add_epi32(v[10], x);
1483     v[10] = _mm_add_epi32(v[10], rnding);
1484     v[10] = _mm_srai_epi32(v[10], bit);
1485 
1486     v[13] = _mm_mullo_epi32(u[10], cospi20);
1487     x = _mm_mullo_epi32(u[13], cospi44);
1488     v[13] = _mm_sub_epi32(x, v[13]);
1489     v[13] = _mm_add_epi32(v[13], rnding);
1490     v[13] = _mm_srai_epi32(v[13], bit);
1491 
1492     v[11] = _mm_mullo_epi32(u[11], cospi12);
1493     x = _mm_mullo_epi32(u[12], cospi52);
1494     v[11] = _mm_add_epi32(v[11], x);
1495     v[11] = _mm_add_epi32(v[11], rnding);
1496     v[11] = _mm_srai_epi32(v[11], bit);
1497 
1498     v[12] = _mm_mullo_epi32(u[11], cospi52);
1499     x = _mm_mullo_epi32(u[12], cospi12);
1500     v[12] = _mm_sub_epi32(x, v[12]);
1501     v[12] = _mm_add_epi32(v[12], rnding);
1502     v[12] = _mm_srai_epi32(v[12], bit);
1503 
1504     out[0 * col_num + col] = v[0];
1505     out[1 * col_num + col] = v[8];
1506     out[2 * col_num + col] = v[4];
1507     out[3 * col_num + col] = v[12];
1508     out[4 * col_num + col] = v[2];
1509     out[5 * col_num + col] = v[10];
1510     out[6 * col_num + col] = v[6];
1511     out[7 * col_num + col] = v[14];
1512     out[8 * col_num + col] = v[1];
1513     out[9 * col_num + col] = v[9];
1514     out[10 * col_num + col] = v[5];
1515     out[11 * col_num + col] = v[13];
1516     out[12 * col_num + col] = v[3];
1517     out[13 * col_num + col] = v[11];
1518     out[14 * col_num + col] = v[7];
1519     out[15 * col_num + col] = v[15];
1520   }
1521 }
1522 
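// 16-point forward ADST.  'in'/'out' hold num_cols interleaved columns of
// four 32-bit lanes each; stage 1 negates/permutes the inputs, the later
// stages are butterflies built from half_btf_sse4_1 rounded to 'bit'
// fractional bits, and stage 9 writes the results in the permuted order
// required by the transform.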
1523 static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
1524                               const int num_cols) {
1525   const int32_t *cospi = cospi_arr(bit);
1526   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1527   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1528   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1529   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1530   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1531   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1532   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1533   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
1534   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
1535   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1536   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
1537   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
1538   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
1539   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
1540   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
1541   const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
1542   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
1543   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
1544   const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
1545   const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
1546   const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
1547   const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
1548   const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
1549   const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
1550   const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
1551   const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
1552   const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
1553   const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
1554   const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
1555   const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
1556   const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
1557   const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
1558   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
1559   const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
1560   const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
1561   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
1562   const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
1563   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1564   const __m128i zero = _mm_setzero_si128();
1565 
1566   __m128i u[16], v[16], x, y;
1567   int col;
1568 
1569   for (col = 0; col < num_cols; ++col) {
1570     // stage 0
1571     // stage 1
1572     u[0] = in[0 * num_cols + col];
1573     u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]);
1574     u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]);
1575     u[3] = in[8 * num_cols + col];
1576     u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]);
1577     u[5] = in[12 * num_cols + col];
1578     u[6] = in[4 * num_cols + col];
1579     u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]);
1580     u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]);
1581     u[9] = in[14 * num_cols + col];
1582     u[10] = in[6 * num_cols + col];
1583     u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]);
1584     u[12] = in[2 * num_cols + col];
1585     u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]);
1586     u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]);
1587     u[15] = in[10 * num_cols + col];
1588 
1589     // stage 2
1590     v[0] = u[0];
1591     v[1] = u[1];
1592 
1593     x = _mm_mullo_epi32(u[2], cospi32);
1594     y = _mm_mullo_epi32(u[3], cospi32);
1595     v[2] = _mm_add_epi32(x, y);
1596     v[2] = _mm_add_epi32(v[2], rnding);
1597     v[2] = _mm_srai_epi32(v[2], bit);
1598 
1599     v[3] = _mm_sub_epi32(x, y);
1600     v[3] = _mm_add_epi32(v[3], rnding);
1601     v[3] = _mm_srai_epi32(v[3], bit);
1602 
1603     v[4] = u[4];
1604     v[5] = u[5];
1605 
1606     x = _mm_mullo_epi32(u[6], cospi32);
1607     y = _mm_mullo_epi32(u[7], cospi32);
1608     v[6] = _mm_add_epi32(x, y);
1609     v[6] = _mm_add_epi32(v[6], rnding);
1610     v[6] = _mm_srai_epi32(v[6], bit);
1611 
1612     v[7] = _mm_sub_epi32(x, y);
1613     v[7] = _mm_add_epi32(v[7], rnding);
1614     v[7] = _mm_srai_epi32(v[7], bit);
1615 
1616     v[8] = u[8];
1617     v[9] = u[9];
1618 
1619     x = _mm_mullo_epi32(u[10], cospi32);
1620     y = _mm_mullo_epi32(u[11], cospi32);
1621     v[10] = _mm_add_epi32(x, y);
1622     v[10] = _mm_add_epi32(v[10], rnding);
1623     v[10] = _mm_srai_epi32(v[10], bit);
1624 
1625     v[11] = _mm_sub_epi32(x, y);
1626     v[11] = _mm_add_epi32(v[11], rnding);
1627     v[11] = _mm_srai_epi32(v[11], bit);
1628 
1629     v[12] = u[12];
1630     v[13] = u[13];
1631 
1632     x = _mm_mullo_epi32(u[14], cospi32);
1633     y = _mm_mullo_epi32(u[15], cospi32);
1634     v[14] = _mm_add_epi32(x, y);
1635     v[14] = _mm_add_epi32(v[14], rnding);
1636     v[14] = _mm_srai_epi32(v[14], bit);
1637 
1638     v[15] = _mm_sub_epi32(x, y);
1639     v[15] = _mm_add_epi32(v[15], rnding);
1640     v[15] = _mm_srai_epi32(v[15], bit);
1641 
1642     // stage 3
1643     u[0] = _mm_add_epi32(v[0], v[2]);
1644     u[1] = _mm_add_epi32(v[1], v[3]);
1645     u[2] = _mm_sub_epi32(v[0], v[2]);
1646     u[3] = _mm_sub_epi32(v[1], v[3]);
1647     u[4] = _mm_add_epi32(v[4], v[6]);
1648     u[5] = _mm_add_epi32(v[5], v[7]);
1649     u[6] = _mm_sub_epi32(v[4], v[6]);
1650     u[7] = _mm_sub_epi32(v[5], v[7]);
1651     u[8] = _mm_add_epi32(v[8], v[10]);
1652     u[9] = _mm_add_epi32(v[9], v[11]);
1653     u[10] = _mm_sub_epi32(v[8], v[10]);
1654     u[11] = _mm_sub_epi32(v[9], v[11]);
1655     u[12] = _mm_add_epi32(v[12], v[14]);
1656     u[13] = _mm_add_epi32(v[13], v[15]);
1657     u[14] = _mm_sub_epi32(v[12], v[14]);
1658     u[15] = _mm_sub_epi32(v[13], v[15]);
1659 
1660     // stage 4
1661     v[0] = u[0];
1662     v[1] = u[1];
1663     v[2] = u[2];
1664     v[3] = u[3];
1665     v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
1666     v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
1667     v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
1668     v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
1669     v[8] = u[8];
1670     v[9] = u[9];
1671     v[10] = u[10];
1672     v[11] = u[11];
1673     v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
1674     v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
1675     v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
1676     v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
1677 
1678     // stage 5
1679     u[0] = _mm_add_epi32(v[0], v[4]);
1680     u[1] = _mm_add_epi32(v[1], v[5]);
1681     u[2] = _mm_add_epi32(v[2], v[6]);
1682     u[3] = _mm_add_epi32(v[3], v[7]);
1683     u[4] = _mm_sub_epi32(v[0], v[4]);
1684     u[5] = _mm_sub_epi32(v[1], v[5]);
1685     u[6] = _mm_sub_epi32(v[2], v[6]);
1686     u[7] = _mm_sub_epi32(v[3], v[7]);
1687     u[8] = _mm_add_epi32(v[8], v[12]);
1688     u[9] = _mm_add_epi32(v[9], v[13]);
1689     u[10] = _mm_add_epi32(v[10], v[14]);
1690     u[11] = _mm_add_epi32(v[11], v[15]);
1691     u[12] = _mm_sub_epi32(v[8], v[12]);
1692     u[13] = _mm_sub_epi32(v[9], v[13]);
1693     u[14] = _mm_sub_epi32(v[10], v[14]);
1694     u[15] = _mm_sub_epi32(v[11], v[15]);
1695 
1696     // stage 6
1697     v[0] = u[0];
1698     v[1] = u[1];
1699     v[2] = u[2];
1700     v[3] = u[3];
1701     v[4] = u[4];
1702     v[5] = u[5];
1703     v[6] = u[6];
1704     v[7] = u[7];
1705     v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
1706     v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
1707     v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
1708     v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
1709     v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
1710     v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
1711     v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
1712     v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
1713 
1714     // stage 7
1715     u[0] = _mm_add_epi32(v[0], v[8]);
1716     u[1] = _mm_add_epi32(v[1], v[9]);
1717     u[2] = _mm_add_epi32(v[2], v[10]);
1718     u[3] = _mm_add_epi32(v[3], v[11]);
1719     u[4] = _mm_add_epi32(v[4], v[12]);
1720     u[5] = _mm_add_epi32(v[5], v[13]);
1721     u[6] = _mm_add_epi32(v[6], v[14]);
1722     u[7] = _mm_add_epi32(v[7], v[15]);
1723     u[8] = _mm_sub_epi32(v[0], v[8]);
1724     u[9] = _mm_sub_epi32(v[1], v[9]);
1725     u[10] = _mm_sub_epi32(v[2], v[10]);
1726     u[11] = _mm_sub_epi32(v[3], v[11]);
1727     u[12] = _mm_sub_epi32(v[4], v[12]);
1728     u[13] = _mm_sub_epi32(v[5], v[13]);
1729     u[14] = _mm_sub_epi32(v[6], v[14]);
1730     u[15] = _mm_sub_epi32(v[7], v[15]);
1731 
1732     // stage 8
1733     v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
1734     v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
1735     v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
1736     v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
1737     v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
1738     v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
1739     v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
1740     v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
1741     v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
1742     v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
1743     v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
1744     v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
1745     v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
1746     v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
1747     v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
1748     v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
1749 
1750     // stage 9
1751     out[0 * num_cols + col] = v[1];
1752     out[1 * num_cols + col] = v[14];
1753     out[2 * num_cols + col] = v[3];
1754     out[3 * num_cols + col] = v[12];
1755     out[4 * num_cols + col] = v[5];
1756     out[5 * num_cols + col] = v[10];
1757     out[6 * num_cols + col] = v[7];
1758     out[7 * num_cols + col] = v[8];
1759     out[8 * num_cols + col] = v[9];
1760     out[9 * num_cols + col] = v[6];
1761     out[10 * num_cols + col] = v[11];
1762     out[11 * num_cols + col] = v[4];
1763     out[12 * num_cols + col] = v[13];
1764     out[13 * num_cols + col] = v[2];
1765     out[14 * num_cols + col] = v[15];
1766     out[15 * num_cols + col] = v[0];
1767   }
1768 }
1769 
1770 static void col_txfm_16x16_rounding(__m128i *in, int shift) {
1771   // Note:
1772   //  The 16x16 rounding is split into four 8x8 rounding passes
1773   //  instead of being done 4 columns at a time.
1774   col_txfm_8x8_rounding(&in[0], shift);
1775   col_txfm_8x8_rounding(&in[16], shift);
1776   col_txfm_8x8_rounding(&in[32], shift);
1777   col_txfm_8x8_rounding(&in[48], shift);
1778 }
1779 
1780 static void col_txfm_8x16_rounding(__m128i *in, int shift) {
1781   col_txfm_8x8_rounding(&in[0], shift);
1782   col_txfm_8x8_rounding(&in[16], shift);
1783 }
1784 
1785 static void write_buffer_16x16(const __m128i *in, int32_t *output) {
1786   const int size_8x8 = 16 * 4;
1787   write_buffer_8x8(&in[0], output);
1788   output += size_8x8;
1789   write_buffer_8x8(&in[16], output);
1790   output += size_8x8;
1791   write_buffer_8x8(&in[32], output);
1792   output += size_8x8;
1793   write_buffer_8x8(&in[48], output);
1794 }
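// Identity transform for a 16-point dimension: each coefficient is
// multiplied by 2 * NewSqrt2 (i.e. scaled by roughly 2 * sqrt(2) in fixed
// point), offset for rounding, then shifted back down by NewSqrt2Bits.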
1795 static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
1796   (void)bit;
1797   __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
1798   __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
1799   __m128i a_low;
1800 
1801   int num_iters = 16 * col_num;
1802   for (int i = 0; i < num_iters; i++) {
1803     a_low = _mm_mullo_epi32(in[i], fact);
1804     a_low = _mm_add_epi32(a_low, offset);
1805     out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
1806   }
1807 }
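// 2-D 16x16 forward transform.  Every tx_type below follows the same
// pipeline: load the residual (with optional vertical/horizontal flip) and
// apply the pre-shift, run the column 1-D transform, round, transpose, run
// the row 1-D transform, transpose back, and store to 'coeff'.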
1808 void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
1809                                  int stride, TX_TYPE tx_type, int bd) {
1810   __m128i in[64], out[64];
1811   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
1812   const int txw_idx = get_txw_idx(TX_16X16);
1813   const int txh_idx = get_txh_idx(TX_16X16);
1814   const int col_num = 4;
1815   switch (tx_type) {
1816     case DCT_DCT:
1817       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1818       fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1819       col_txfm_16x16_rounding(out, -shift[1]);
1820       transpose_16x16(out, in);
1821       fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1822       transpose_16x16(out, in);
1823       write_buffer_16x16(in, coeff);
1824       break;
1825     case ADST_DCT:
1826       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1827       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1828                         col_num);
1829       col_txfm_16x16_rounding(out, -shift[1]);
1830       transpose_16x16(out, in);
1831       fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1832       transpose_16x16(out, in);
1833       write_buffer_16x16(in, coeff);
1834       break;
1835     case DCT_ADST:
1836       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1837       fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1838       col_txfm_16x16_rounding(out, -shift[1]);
1839       transpose_16x16(out, in);
1840       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1841                         col_num);
1842       transpose_16x16(out, in);
1843       write_buffer_16x16(in, coeff);
1844       break;
1845     case ADST_ADST:
1846       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1847       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1848                         col_num);
1849       col_txfm_16x16_rounding(out, -shift[1]);
1850       transpose_16x16(out, in);
1851       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1852                         col_num);
1853       transpose_16x16(out, in);
1854       write_buffer_16x16(in, coeff);
1855       break;
1856     case FLIPADST_DCT:
1857       load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
1858       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1859                         col_num);
1860       col_txfm_16x16_rounding(out, -shift[1]);
1861       transpose_16x16(out, in);
1862       fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1863       transpose_16x16(out, in);
1864       write_buffer_16x16(in, coeff);
1865       break;
1866     case DCT_FLIPADST:
1867       load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
1868       fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1869       col_txfm_16x16_rounding(out, -shift[1]);
1870       transpose_16x16(out, in);
1871       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1872                         col_num);
1873       transpose_16x16(out, in);
1874       write_buffer_16x16(in, coeff);
1875       break;
1876     case FLIPADST_FLIPADST:
1877       load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
1878       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1879                         col_num);
1880       col_txfm_16x16_rounding(out, -shift[1]);
1881       transpose_16x16(out, in);
1882       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1883                         col_num);
1884       transpose_16x16(out, in);
1885       write_buffer_16x16(in, coeff);
1886       break;
1887     case ADST_FLIPADST:
1888       load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
1889       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1890                         col_num);
1891       col_txfm_16x16_rounding(out, -shift[1]);
1892       transpose_16x16(out, in);
1893       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1894                         col_num);
1895       transpose_16x16(out, in);
1896       write_buffer_16x16(in, coeff);
1897       break;
1898     case FLIPADST_ADST:
1899       load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
1900       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1901                         col_num);
1902       col_txfm_16x16_rounding(out, -shift[1]);
1903       transpose_16x16(out, in);
1904       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1905                         col_num);
1906       transpose_16x16(out, in);
1907       write_buffer_16x16(in, coeff);
1908       break;
1909     case IDTX:
1910       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1911       idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1912       col_txfm_16x16_rounding(out, -shift[1]);
1913       transpose_16x16(out, in);
1914       idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1915       transpose_16x16(out, in);
1916       write_buffer_16x16(in, coeff);
1917       break;
1918     case V_DCT:
1919       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1920       fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1921       col_txfm_16x16_rounding(out, -shift[1]);
1922       transpose_16x16(out, in);
1923       idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1924       transpose_16x16(out, in);
1925       write_buffer_16x16(in, coeff);
1926       break;
1927     case H_DCT:
1928       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1929       idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1930       col_txfm_16x16_rounding(out, -shift[1]);
1931       transpose_16x16(out, in);
1932       fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1933       transpose_16x16(out, in);
1934       write_buffer_16x16(in, coeff);
1935       break;
1936     case V_ADST:
1937       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1938       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1939                         col_num);
1940       col_txfm_16x16_rounding(out, -shift[1]);
1941       transpose_16x16(out, in);
1942       idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1943       transpose_16x16(out, in);
1944       write_buffer_16x16(in, coeff);
1945       break;
1946     case H_ADST:
1947       load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
1948       idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1949       col_txfm_16x16_rounding(out, -shift[1]);
1950       transpose_16x16(out, in);
1951       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1952                         col_num);
1953       transpose_16x16(out, in);
1954       write_buffer_16x16(in, coeff);
1955       break;
1956     case V_FLIPADST:
1957       load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
1958       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
1959                         col_num);
1960       col_txfm_16x16_rounding(out, -shift[1]);
1961       transpose_16x16(out, in);
1962       idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
1963       transpose_16x16(out, in);
1964       write_buffer_16x16(in, coeff);
1965       break;
1966     case H_FLIPADST:
1967       load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
1968       idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
1969       col_txfm_16x16_rounding(out, -shift[1]);
1970       transpose_16x16(out, in);
1971       fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
1972                         col_num);
1973       transpose_16x16(out, in);
1974       write_buffer_16x16(in, coeff);
1975       break;
1976     default: assert(0);
1977   }
1978   (void)bd;
1979 }
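// Usage sketch (hypothetical caller; within libaom this entry point is
// normally reached through the av1_fwd_txfm2d_16x16 RTCD dispatch rather
// than being called directly):
//
//   DECLARE_ALIGNED(32, int16_t, src[16 * 16]);   // residual block
//   DECLARE_ALIGNED(32, int32_t, coeff[16 * 16]); // transform coefficients
//   av1_fwd_txfm2d_16x16_sse4_1(src, coeff, /*stride=*/16, DCT_DCT, /*bd=*/10);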
1980 
1981 static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) {
1982   for (int i = 0; i < size; i += 2) in[30 - i] = out[i];
1983   for (int i = 1; i < size; i += 2) in[size - i] = out[i];
1984 }
1985 
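// Per-TX_TYPE dispatch tables for the rectangular sizes handled below: the
// col_highbd_* tables select the vertical 1-D transform and the
// row_highbd_* tables the horizontal one.  NULL entries mark tx_type/size
// combinations that are never used at that block size.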
1986 static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = {
1987   fdct8x8_sse4_1,   // DCT_DCT
1988   fadst8x8_sse4_1,  // ADST_DCT
1989   fdct8x8_sse4_1,   // DCT_ADST
1990   fadst8x8_sse4_1,  // ADST_ADST
1991   fadst8x8_sse4_1,  // FLIPADST_DCT
1992   fdct8x8_sse4_1,   // DCT_FLIPADST
1993   fadst8x8_sse4_1,  // FLIPADST_FLIPADST
1994   fadst8x8_sse4_1,  // ADST_FLIPADST
1995   fadst8x8_sse4_1,  // FLIPADST_ADST
1996   idtx8x8_sse4_1,   // IDTX
1997   fdct8x8_sse4_1,   // V_DCT
1998   idtx8x8_sse4_1,   // H_DCT
1999   fadst8x8_sse4_1,  // V_ADST
2000   idtx8x8_sse4_1,   // H_ADST
2001   fadst8x8_sse4_1,  // V_FLIPADST
2002   idtx8x8_sse4_1    // H_FLIPADST
2003 };
2004 #if !CONFIG_REALTIME_ONLY
2005 static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = {
2006   fdct8x8_sse4_1,   // DCT_DCT
2007   NULL,             // ADST_DCT
2008   NULL,             // DCT_ADST
2009   NULL,             // ADST_ADST
2010   NULL,             // FLIPADST_DCT
2011   NULL,             // DCT_FLIPADST
2012   NULL,             // FLIPADST_FLIPADST
2013   NULL,             // ADST_FLIPADST
2014   NULL,             // FLIPADST_ADST
2015   idtx32x8_sse4_1,  // IDTX
2016   NULL,             // V_DCT
2017   NULL,             // H_DCT
2018   NULL,             // V_ADST
2019   NULL,             // H_ADST
2020   NULL,             // V_FLIPADST
2021   NULL,             // H_FLIPADST
2022 };
2023 #endif
2024 static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = {
2025   fdct4x8_sse4_1,   // DCT_DCT
2026   fadst8x8_sse4_1,  // ADST_DCT
2027   fdct4x8_sse4_1,   // DCT_ADST
2028   fadst8x8_sse4_1,  // ADST_ADST
2029   fadst8x8_sse4_1,  // FLIPADST_DCT
2030   fdct4x8_sse4_1,   // DCT_FLIPADST
2031   fadst8x8_sse4_1,  // FLIPADST_FLIPADST
2032   fadst8x8_sse4_1,  // ADST_FLIPADST
2033   fadst8x8_sse4_1,  // FLIPADST_ADST
2034   idtx8x8_sse4_1,   // IDTX
2035   fdct4x8_sse4_1,   // V_DCT
2036   idtx8x8_sse4_1,   // H_DCT
2037   fadst8x8_sse4_1,  // V_ADST
2038   idtx8x8_sse4_1,   // H_ADST
2039   fadst8x8_sse4_1,  // V_FLIPADST
2040   idtx8x8_sse4_1    // H_FLIPADST
2041 };
2042 
2043 static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = {
2044   fdct16x16_sse4_1,   // DCT_DCT
2045   fdct16x16_sse4_1,   // ADST_DCT
2046   fadst16x16_sse4_1,  // DCT_ADST
2047   fadst16x16_sse4_1,  // ADST_ADST
2048   fdct16x16_sse4_1,   // FLIPADST_DCT
2049   fadst16x16_sse4_1,  // DCT_FLIPADST
2050   fadst16x16_sse4_1,  // FLIPADST_FLIPADST
2051   fadst16x16_sse4_1,  // ADST_FLIPADST
2052   fadst16x16_sse4_1,  // FLIPADST_ADST
2053   idtx16x16_sse4_1,   // IDTX
2054   idtx16x16_sse4_1,   // V_DCT
2055   fdct16x16_sse4_1,   // H_DCT
2056   idtx16x16_sse4_1,   // V_ADST
2057   fadst16x16_sse4_1,  // H_ADST
2058   idtx16x16_sse4_1,   // V_FLIPADST
2059   fadst16x16_sse4_1   // H_FLIPADST
2060 };
2061 
2062 static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = {
2063   fdct16x16_sse4_1,   // DCT_DCT
2064   fadst16x16_sse4_1,  // ADST_DCT
2065   fdct16x16_sse4_1,   // DCT_ADST
2066   fadst16x16_sse4_1,  // ADST_ADST
2067   fadst16x16_sse4_1,  // FLIPADST_DCT
2068   fdct16x16_sse4_1,   // DCT_FLIPADST
2069   fadst16x16_sse4_1,  // FLIPADST_FLIPADST
2070   fadst16x16_sse4_1,  // ADST_FLIPADST
2071   fadst16x16_sse4_1,  // FLIPADST_ADST
2072   idtx16x16_sse4_1,   // IDTX
2073   fdct16x16_sse4_1,   // V_DCT
2074   idtx16x16_sse4_1,   // H_DCT
2075   fadst16x16_sse4_1,  // V_ADST
2076   idtx16x16_sse4_1,   // H_ADST
2077   fadst16x16_sse4_1,  // V_FLIPADST
2078   idtx16x16_sse4_1    // H_FLIPADST
2079 };
2080 static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = {
2081   fdct8x8_sse4_1,   // DCT_DCT
2082   fdct8x8_sse4_1,   // ADST_DCT
2083   fadst8x8_sse4_1,  // DCT_ADST
2084   fadst8x8_sse4_1,  // ADST_ADST
2085   fdct8x8_sse4_1,   // FLIPADST_DCT
2086   fadst8x8_sse4_1,  // DCT_FLIPADST
2087   fadst8x8_sse4_1,  // FLIPADST_FLIPADST
2088   fadst8x8_sse4_1,  // ADST_FLIPADST
2089   fadst8x8_sse4_1,  // FLIPADST_ADST
2090   idtx8x8_sse4_1,   // IDTX
2091   idtx8x8_sse4_1,   // V_DCT
2092   fdct8x8_sse4_1,   // H_DCT
2093   idtx8x8_sse4_1,   // V_ADST
2094   fadst8x8_sse4_1,  // H_ADST
2095   idtx8x8_sse4_1,   // V_FLIPADST
2096   fadst8x8_sse4_1   // H_FLIPADST
2097 };
2098 
2099 static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = {
2100   fdct4x8_sse4_1,   // DCT_DCT
2101   fdct4x8_sse4_1,   // ADST_DCT
2102   fadst8x8_sse4_1,  // DCT_ADST
2103   fadst8x8_sse4_1,  // ADST_ADST
2104   fdct4x8_sse4_1,   // FLIPADST_DCT
2105   fadst8x8_sse4_1,  // DCT_FLIPADST
2106   fadst8x8_sse4_1,  // FLIPADST_FLIPADST
2107   fadst8x8_sse4_1,  // ADST_FLIPADST
2108   fadst8x8_sse4_1,  // FLIPADST_ADST
2109   idtx8x8_sse4_1,   // IDTX
2110   idtx8x8_sse4_1,   // V_DCT
2111   fdct4x8_sse4_1,   // H_DCT
2112   idtx8x8_sse4_1,   // V_ADST
2113   fadst8x8_sse4_1,  // H_ADST
2114   idtx8x8_sse4_1,   // V_FLIPADST
2115   fadst8x8_sse4_1   // H_FLIPADST
2116 };
2117 
2118 static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = {
2119   fdct4x4_sse4_1,   // DCT_DCT
2120   fdct4x4_sse4_1,   // ADST_DCT
2121   fadst4x4_sse4_1,  // DCT_ADST
2122   fadst4x4_sse4_1,  // ADST_ADST
2123   fdct4x4_sse4_1,   // FLIPADST_DCT
2124   fadst4x4_sse4_1,  // DCT_FLIPADST
2125   fadst4x4_sse4_1,  // FLIPADST_FLIPADST
2126   fadst4x4_sse4_1,  // ADST_FLIPADST
2127   fadst4x4_sse4_1,  // FLIPADST_ADST
2128   idtx4x4_sse4_1,   // IDTX
2129   idtx4x4_sse4_1,   // V_DCT
2130   fdct4x4_sse4_1,   // H_DCT
2131   idtx4x4_sse4_1,   // V_ADST
2132   fadst4x4_sse4_1,  // H_ADST
2133   idtx4x4_sse4_1,   // V_FLIPADST
2134   fadst4x4_sse4_1   // H_FLIPADST
2135 };
2136 
2137 static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = {
2138   fdct4x4_sse4_1,   // DCT_DCT
2139   fadst4x4_sse4_1,  // ADST_DCT
2140   fdct4x4_sse4_1,   // DCT_ADST
2141   fadst4x4_sse4_1,  // ADST_ADST
2142   fadst4x4_sse4_1,  // FLIPADST_DCT
2143   fdct4x4_sse4_1,   // DCT_FLIPADST
2144   fadst4x4_sse4_1,  // FLIPADST_FLIPADST
2145   fadst4x4_sse4_1,  // ADST_FLIPADST
2146   fadst4x4_sse4_1,  // FLIPADST_ADST
2147   idtx4x4_sse4_1,   // IDTX
2148   fdct4x4_sse4_1,   // V_DCT
2149   idtx4x4_sse4_1,   // H_DCT
2150   fadst4x4_sse4_1,  // V_ADST
2151   idtx4x4_sse4_1,   // H_ADST
2152   fadst4x4_sse4_1,  // V_FLIPADST
2153   idtx4x4_sse4_1    // H_FLIPADST
2154 };
2155 
2156 static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = {
2157   av1_fdct32_sse4_1,  // DCT_DCT
2158   NULL,               // ADST_DCT
2159   NULL,               // DCT_ADST
2160   NULL,               // ADST_ADST
2161   NULL,               // FLIPADST_DCT
2162   NULL,               // DCT_FLIPADST
2163   NULL,               // FLIPADST_FLIPADST
2164   NULL,               // ADST_FLIPADST
2165   NULL,               // FLIPADST_ADST
2166   av1_idtx32_sse4_1,  // IDTX
2167   NULL,               // V_DCT
2168   NULL,               // H_DCT
2169   NULL,               // V_ADST
2170   NULL,               // H_ADST
2171   NULL,               // V_FLIPADST
2172   NULL                // H_FLIPADST
2173 };
2174 
2175 static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = {
2176   fdct16x16_sse4_1,  // DCT_DCT
2177   NULL,              // ADST_DCT
2178   NULL,              // DCT_ADST
2179   NULL,              // ADST_ADST
2180   NULL,              // FLIPADST_DCT
2181   NULL,              // DCT_FLIPADST
2182   NULL,              // FLIPADST_FLIPADST
2183   NULL,              // ADST_FLIPADST
2184   NULL,              // FLIPADST_ADST
2185   idtx16x16_sse4_1,  // IDTX
2186   NULL,              // V_DCT
2187   NULL,              // H_DCT
2188   NULL,              // V_ADST
2189   NULL,              // H_ADST
2190   NULL,              // V_FLIPADST
2191   NULL               // H_FLIPADST
2192 };
2193 
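// Note: the rectangular sizes with a 2:1 aspect ratio (16x8, 8x16, 4x8,
// 8x4, 16x32, 32x16, 32x64, 64x32) apply an extra NewSqrt2 rounding-shift
// step (av1_round_shift_rect_array_32_sse4_1) after the row transform to
// compensate for the non-square scaling.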
2194 void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
2195                                 int stride, TX_TYPE tx_type, int bd) {
2196   __m128i in[32], out[32];
2197   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
2198   const int txw_idx = get_txw_idx(TX_16X8);
2199   const int txh_idx = get_txh_idx(TX_16X8);
2200   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type];
2201   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
2202   int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2203   int ud_flip, lr_flip;
2204   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2205 
2206   for (int i = 0; i < 2; i++) {
2207     load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
2208     col_txfm(in, in, bit, 2);
2209     col_txfm_8x8_rounding(in, -shift[1]);
2210     transpose_8x8(in, out + i * 16);
2211   }
2212 
2213   if (lr_flip) {
2214     flip_buf_sse4_1(in, out, 32);
2215     row_txfm(in, out, bit, 2);
2216   } else {
2217     row_txfm(out, out, bit, 2);
2218   }
2219 
2220   for (int i = 0; i < 2; i++) {
2221     transpose_8x8(out + i * 16, in);
2222     av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
2223     write_buffer_16x8(in, coeff + i * 8, 16);
2224   }
2225 
2226   (void)bd;
2227 }
2228 
2229 void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
2230                                 int stride, TX_TYPE tx_type, int bd) {
2231   __m128i in[32], out[32];
2232   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
2233   const int txw_idx = get_txw_idx(TX_8X16);
2234   const int txh_idx = get_txh_idx(TX_8X16);
2235   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
2236   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type];
2237   int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2238   int ud_flip, lr_flip;
2239   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2240 
2241   load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
2242   col_txfm(in, in, bit, 2);
2243   col_txfm_8x16_rounding(in, -shift[1]);
2244   transpose_8x8(in, out);
2245   transpose_8x8(in + 16, out + 16);
2246 
2247   for (int i = 0; i < 2; i++) {
2248     row_txfm(out + i * 16, out, bit, 2);
2249     transpose_8x8(out, in);
2250     av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
2251     write_buffer_8x8(in, coeff + i * 64);
2252   }
2253 
2254   (void)bd;
2255 }
2256 
2257 #if !CONFIG_REALTIME_ONLY
2258 void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff,
2259                                 int stride, TX_TYPE tx_type, int bd) {
2260   __m128i in[16];
2261   __m128i *outcoeff128 = (__m128i *)coeff;
2262   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
2263   const int txw_idx = get_txw_idx(TX_4X16);
2264   const int txh_idx = get_txh_idx(TX_4X16);
2265   const int txfm_size_col = tx_size_wide[TX_4X16];
2266   const int txfm_size_row = tx_size_high[TX_4X16];
2267   int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2268   int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2269   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
2270   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];
2271 
2272   int ud_flip, lr_flip;
2273   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2274   // col transform
2275   load_buffer_4x16(input, in, stride, ud_flip, lr_flip, shift[0]);
2276   col_txfm(in, outcoeff128, bitcol, 1);
2277   col_txfm_8x8_rounding(outcoeff128, -shift[1]);
2278   transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
2279 
2280   // row transform
2281   for (int i = 0; i < txfm_size_col; i++) {
2282     row_txfm(in + i, outcoeff128 + i * txfm_size_col, bitrow, txfm_size_col);
2283   }
2284   (void)bd;
2285 }
2286 #endif
2287 
2288 void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff,
2289                                 int stride, TX_TYPE tx_type, int bd) {
2290   __m128i in[16];
2291   __m128i *outcoeff128 = (__m128i *)coeff;
2292   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
2293   const int txw_idx = get_txw_idx(TX_16X4);
2294   const int txh_idx = get_txh_idx(TX_16X4);
2295   const int txfm_size_col = tx_size_wide[TX_16X4];
2296   const int txfm_size_row = tx_size_high[TX_16X4];
2297   int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2298   int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2299   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
2300   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
2301   int ud_flip, lr_flip;
2302   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2303 
2304   // col transform
2305   load_buffer_16x4(input, in, stride, ud_flip, lr_flip, shift[0]);
2306 
2307   for (int i = 0; i < txfm_size_row; i++) {
2308     col_txfm(in + i * txfm_size_row, outcoeff128 + i * txfm_size_row, bitcol,
2309              1);
2310   }
2311   col_txfm_8x8_rounding(outcoeff128, -shift[1]);
2312 
2313   // row transform
2314   row_txfm(outcoeff128, in, bitrow, 1);
2315   transpose_8nx8n(in, outcoeff128, txfm_size_row, txfm_size_col);
2316   (void)bd;
2317 }
2318 
2319 void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff,
2320                                  int stride, TX_TYPE tx_type, int bd) {
2321   __m128i in[128];
2322   __m128i *outcoef128 = (__m128i *)coeff;
2323   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
2324   const int txw_idx = get_txw_idx(TX_16X32);
2325   const int txh_idx = get_txh_idx(TX_16X32);
2326   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
2327   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x32_arr[tx_type];
2328   int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2329   int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2330 
2331   // column transform
2332   load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2333   load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]);
2334 
2335   for (int i = 0; i < 4; i++) {
2336     col_txfm((in + i), (in + i), bitcol, 4);
2337   }
2338   col_txfm_16x16_rounding(&in[0], -shift[1]);
2339   col_txfm_16x16_rounding(&in[64], -shift[1]);
2340   transpose_8nx8n(in, outcoef128, 16, 32);
2341 
2342   // row transform
2343   row_txfm(outcoef128, in, bitrow, 8);
2344   transpose_8nx8n(in, outcoef128, 32, 16);
2345   av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 128, -shift[2],
2346                                        NewSqrt2);
2347   (void)bd;
2348 }
2349 
2350 void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff,
2351                                  int stride, TX_TYPE tx_type, int bd) {
2352   (void)tx_type;
2353   __m128i in[512];
2354   __m128i *outcoef128 = (__m128i *)coeff;
2355   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64];
2356   const int txw_idx = get_txw_idx(TX_32X64);
2357   const int txh_idx = get_txh_idx(TX_32X64);
2358   const int txfm_size_col = tx_size_wide[TX_32X64];
2359   const int txfm_size_row = tx_size_high[TX_32X64];
2360   int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2361   int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2362   const int num_row = txfm_size_row >> 2;
2363   const int num_col = txfm_size_col >> 2;
2364 
2365   // column transform
2366   load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
2367   for (int i = 0; i < num_col; i++) {
2368     av1_fdct64_sse4_1((in + i), (in + i), bitcol, num_col, num_col);
2369   }
2370   for (int i = 0; i < num_col; i++) {
2371     col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]);
2372   }
2373   transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
2374 
2375   // row transform
2376   for (int i = 0; i < num_row; i++) {
2377     av1_fdct32_sse4_1((outcoef128 + i), (in + i), bitrow, num_row);
2378   }
2379   transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
2380   av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 512, -shift[2],
2381                                        NewSqrt2);
2382   (void)bd;
2383 }
2384 
2385 void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff,
2386                                  int stride, TX_TYPE tx_type, int bd) {
2387   (void)tx_type;
2388   __m128i in[512];
2389   __m128i *outcoef128 = (__m128i *)coeff;
2390   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32];
2391   const int txw_idx = get_txw_idx(TX_64X32);
2392   const int txh_idx = get_txh_idx(TX_64X32);
2393   const int txfm_size_col = tx_size_wide[TX_64X32];
2394   const int txfm_size_row = tx_size_high[TX_64X32];
2395   int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2396   int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2397   const int num_row = txfm_size_row >> 2;
2398   const int num_col = txfm_size_col >> 2;
2399 
2400   // column transform
2401   for (int i = 0; i < 32; i++) {
2402     load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0, shift[0]);
2403     load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0,
2404                     shift[0]);
2405     load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0,
2406                     shift[0]);
2407     load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0,
2408                     shift[0]);
2409   }
2410 
2411   for (int i = 0; i < num_col; i++) {
2412     av1_fdct32_sse4_1((in + i), (in + i), bitcol, num_col);
2413   }
2414 
2415   for (int i = 0; i < num_row; i++) {
2416     col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]);
2417   }
2418   transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
2419 
2420   // row transform
2421   for (int i = 0; i < num_row; i++) {
2422     av1_fdct64_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row);
2423   }
2424   transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col >> 1);
2425   av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 512 >> 1,
2426                                        -shift[2], NewSqrt2);
2427   (void)bd;
2428 }
2429 
2430 void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff,
2431                                  int stride, TX_TYPE tx_type, int bd) {
2432   __m128i in[128];
2433   __m128i *outcoef128 = (__m128i *)coeff;
2434   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
2435   const int txw_idx = get_txw_idx(TX_32X16);
2436   const int txh_idx = get_txh_idx(TX_32X16);
2437   const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm8x32_arr[tx_type];
2438   const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
2439   int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2440   int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2441 
2442   // column transform
2443   load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16);
2444   col_txfm(in, in, bitcol, 8);
2445   col_txfm_16x16_rounding(&in[0], -shift[1]);
2446   col_txfm_16x16_rounding(&in[64], -shift[1]);
2447   transpose_8nx8n(in, outcoef128, 32, 16);
2448 
2449   // row transform
2450   for (int i = 0; i < 4; i++) {
2451     row_txfm((outcoef128 + i), (in + i), bitrow, 4);
2452   }
2453   transpose_8nx8n(in, outcoef128, 16, 32);
2454   av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 128, -shift[2],
2455                                        NewSqrt2);
2456   (void)bd;
2457 }
2458 
2459 #if !CONFIG_REALTIME_ONLY
2460 void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff,
2461                                 int stride, TX_TYPE tx_type, int bd) {
2462   __m128i in[64];
2463   __m128i *outcoef128 = (__m128i *)coeff;
2464   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
2465   const int txw_idx = get_txw_idx(TX_8X32);
2466   const int txh_idx = get_txh_idx(TX_8X32);
2467   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
2468   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type];
2469   int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2470   int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2471 
2472   const int txfm_size_col = tx_size_wide[TX_8X32];
2473   const int txfm_size_row = tx_size_high[TX_8X32];
2474   const int num_col = txfm_size_col >> 2;
2475 
2476   // column transform
2477   load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
2478   load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row,
2479                    stride, 0, 0, shift[0]);
2480 
2481   for (int i = 0; i < num_col; i++) {
2482     col_txfm((in + i), (in + i), bitcol, num_col);
2483   }
2484   col_txfm_16x16_rounding(in, -shift[1]);
2485   transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
2486 
2487   // row transform
2488   for (int i = 0; i < txfm_size_col; i += 2) {
2489     row_txfm((outcoef128 + i), (in + i), bitrow, txfm_size_col);
2490   }
2491   transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
2492   (void)bd;
2493 }
2494 
2495 void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff,
2496                                 int stride, TX_TYPE tx_type, int bd) {
2497   __m128i in[64];
2498   __m128i *outcoef128 = (__m128i *)coeff;
2499   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
2500   const int txw_idx = get_txw_idx(TX_32X8);
2501   const int txh_idx = get_txh_idx(TX_32X8);
2502   const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type];
2503   const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
2504   int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2505   int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2506 
2507   const int txfm_size_col = tx_size_wide[TX_32X8];
2508   const int txfm_size_row = tx_size_high[TX_32X8];
2509   const int num_col = txfm_size_row >> 2;
2510 
2511   // column transform
2512   load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8);
2513   for (int i = 0; i < txfm_size_row; i += 2) {
2514     col_txfm((in + i), (in + i), bitcol, txfm_size_row);
2515   }
2516 
2517   col_txfm_16x16_rounding(&in[0], -shift[1]);
2518   transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
2519 
2520   // row transform
2521   for (int i = 0; i < num_col; i++) {
2522     row_txfm((outcoef128 + i), (in + i), bitrow, num_col);
2523   }
2524   transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col);
2525   (void)bd;
2526 }
2527 #endif
2528 
2529 void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
2530                                TX_TYPE tx_type, int bd) {
2531   __m128i in[8];
2532   __m128i *outcoeff128 = (__m128i *)coeff;
2533   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
2534   const int txw_idx = get_txw_idx(TX_4X8);
2535   const int txh_idx = get_txh_idx(TX_4X8);
2536   const int txfm_size_col = tx_size_wide[TX_4X8];
2537   const int txfm_size_row = tx_size_high[TX_4X8];
2538   int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2539   int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2540   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x8_arr[tx_type];
2541   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];
2542 
2543   int ud_flip, lr_flip;
2544   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2545 
2546   load_buffer_4x8(input, in, stride, ud_flip, lr_flip, shift[0]);
2547   col_txfm(in, in, bitcol, 1);
2548   col_txfm_4x8_rounding(in, -shift[1]);
2549   transpose_8nx8n(in, outcoeff128, txfm_size_col, txfm_size_row);
2550 
2551   for (int i = 0; i < 2; i++) {
2552     row_txfm(outcoeff128 + i, in + i * txfm_size_col, bitrow, 2);
2553   }
2554   av1_round_shift_rect_array_32_sse4_1(in, outcoeff128, txfm_size_row,
2555                                        -shift[2], NewSqrt2);
2556   (void)bd;
2557 }
2558 
2559 void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride,
2560                                TX_TYPE tx_type, int bd) {
2561   __m128i in[8];
2562   __m128i *outcoeff128 = (__m128i *)coeff;
2563   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
2564   const int txw_idx = get_txw_idx(TX_8X4);
2565   const int txh_idx = get_txh_idx(TX_8X4);
2566   const int txfm_size_col = tx_size_wide[TX_8X4];
2567   const int txfm_size_row = tx_size_high[TX_8X4];
2568   int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2569   int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2570   const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
2571   const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x8_arr[tx_type];
2572   int ud_flip, lr_flip;
2573   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2574   // col transform
2575   load_buffer_8x4(input, in, stride, ud_flip, lr_flip, shift[0]);
2576   for (int i = 0; i < 2; i++) {
2577     col_txfm(in + i * txfm_size_row, in + i * txfm_size_row, bitcol, 1);
2578   }
2579   col_txfm_4x8_rounding(in, -shift[1]);
2580 
2581   // row transform
2582   row_txfm(in, outcoeff128, bitrow, 1);
2583   av1_round_shift_rect_array_32_sse4_1(outcoeff128, in, txfm_size_col,
2584                                        -shift[2], NewSqrt2);
2585   transpose_8nx8n(in, outcoeff128, txfm_size_row, txfm_size_col);
2586   (void)bd;
2587 }
2588 
2589 #if !CONFIG_REALTIME_ONLY
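// In the two transforms below, only the 32 lowest-frequency coefficients of
// the 64-point dimension are retained (AV1 signals at most 32x32
// coefficients for 64-point transforms); the remaining outputs are either
// zeroed or simply not written.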
2590 void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff,
2591                                  int stride, TX_TYPE tx_type, int bd) {
2592   __m128i in[256];
2593   __m128i *outcoeff128 = (__m128i *)coeff;
2594   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
2595   const int txw_idx = get_txw_idx(TX_16X64);
2596   const int txh_idx = get_txh_idx(TX_16X64);
2597   const int txfm_size_col = tx_size_wide[TX_16X64];
2598   const int txfm_size_row = tx_size_high[TX_16X64];
2599   int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2600   int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2601   int ud_flip, lr_flip;
2602   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2603   const int num_col = txfm_size_col >> 2;
2604   // col transform
2605   for (int i = 0; i < txfm_size_row; i += num_col) {
2606     load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col,
2607                     ud_flip, lr_flip, shift[0]);
2608     load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col,
2609                     ud_flip, lr_flip, shift[0]);
2610     load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col,
2611                     ud_flip, lr_flip, shift[0]);
2612     load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col,
2613                     ud_flip, lr_flip, shift[0]);
2614   }
2615 
2616   for (int i = 0; i < num_col; i++) {
2617     av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col);
2618   }
2619 
2620   col_txfm_16x16_rounding(outcoeff128, -shift[1]);
2621   col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
2622   col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
2623   col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
2624 
2625   transpose_8nx8n(outcoeff128, in, txfm_size_col, 32);
2626   fdct16x16_sse4_1(in, in, bitrow, 8);
2627   transpose_8nx8n(in, outcoeff128, 32, txfm_size_col);
2628   memset(coeff + txfm_size_col * 32, 0, txfm_size_col * 32 * sizeof(*coeff));
2629   (void)bd;
2630 }
2631 
2632 void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff,
2633                                  int stride, TX_TYPE tx_type, int bd) {
2634   __m128i in[256];
2635   __m128i *outcoeff128 = (__m128i *)coeff;
2636   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
2637   const int txw_idx = get_txw_idx(TX_64X16);
2638   const int txh_idx = get_txh_idx(TX_64X16);
2639   const int txfm_size_col = tx_size_wide[TX_64X16];
2640   const int txfm_size_row = tx_size_high[TX_64X16];
2641   int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2642   int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2643   int ud_flip, lr_flip;
2644   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2645   // col transform
2646   for (int i = 0; i < txfm_size_row; i++) {
2647     load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4,
2648                     ud_flip, lr_flip, shift[0]);
2649     load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4,
2650                     ud_flip, lr_flip, shift[0]);
2651     load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4,
2652                     ud_flip, lr_flip, shift[0]);
2653     load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4,
2654                     ud_flip, lr_flip, shift[0]);
2655   }
2656 
2657   fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row);
2658   col_txfm_16x16_rounding(outcoeff128, -shift[1]);
2659   col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
2660   col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
2661   col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
2662 
2663   transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
2664   for (int i = 0; i < 4; i++) {
2665     av1_fdct64_sse4_1(in + i, in + i, bitrow, 4, 4);
2666   }
2667   transpose_8nx8n(in, outcoeff128, txfm_size_row, 32);
2668   (void)bd;
2669 }
2670 #endif
2671