/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp9_rtcd.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

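// Multiply the four 32-bit lanes of |in| by the cosine constant |c|, keeping
// the full 64-bit products in s[0] (low lanes) and s[1] (high lanes). The
// constant is pre-scaled by 4 to compensate for the wider shift applied later
// by dct_const_round_shift_64bit().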
static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
                                                      const int c,
                                                      __m128i *const s) {
  const __m128i pair_c = pair_set_epi32(4 * c, 0);
  __m128i x[2];

  extend_64bit(in, x);
  s[0] = _mm_mul_epi32(pair_c, x[0]);
  s[1] = _mm_mul_epi32(pair_c, x[1]);
}

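// 64-bit precision butterfly: s0 = c0 * in0 + c1 * in1 and
// s1 = c1 * in0 - c0 * in1, with each set of four products split across
// s*[0] (low lanes) and s*[1] (high lanes).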
static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
                                                 const __m128i in1,
                                                 const int c0, const int c1,
                                                 __m128i *const s0,
                                                 __m128i *const s1) {
  const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
  const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
  __m128i t00[2], t01[2], t10[2], t11[2];
  __m128i x0[2], x1[2];

  extend_64bit(in0, x0);
  extend_64bit(in1, x1);
  t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
  t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
  t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
  t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
  t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
  t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
  t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
  t11[1] = _mm_mul_epi32(pair_c1, x1[1]);

  s0[0] = _mm_add_epi64(t00[0], t11[0]);
  s0[1] = _mm_add_epi64(t00[1], t11[1]);
  s1[0] = _mm_sub_epi64(t10[0], t01[0]);
  s1[1] = _mm_sub_epi64(t10[1], t01[1]);
}

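// 8-point inverse ADST over an 8x4 half-block, where each of io[0..7] holds
// four 32-bit coefficients. The half-block is transposed in place first, and
// the 64-bit intermediates are rounded back to 32 bits after every multiply
// stage.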
static void highbd_iadst8_sse4_1(__m128i *const io) {
  __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
  __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];

  transpose_32bit_4x4x2(io, io);

  // stage 1
  highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1);
  highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5);
  x0[0] = _mm_add_epi64(s0[0], s4[0]);
  x0[1] = _mm_add_epi64(s0[1], s4[1]);
  x1[0] = _mm_add_epi64(s1[0], s5[0]);
  x1[1] = _mm_add_epi64(s1[1], s5[1]);
  x4[0] = _mm_sub_epi64(s0[0], s4[0]);
  x4[1] = _mm_sub_epi64(s0[1], s4[1]);
  x5[0] = _mm_sub_epi64(s1[0], s5[0]);
  x5[1] = _mm_sub_epi64(s1[1], s5[1]);

  highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3);
  highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7);
  x2[0] = _mm_add_epi64(s2[0], s6[0]);
  x2[1] = _mm_add_epi64(s2[1], s6[1]);
  x3[0] = _mm_add_epi64(s3[0], s7[0]);
  x3[1] = _mm_add_epi64(s3[1], s7[1]);
  x6[0] = _mm_sub_epi64(s2[0], s6[0]);
  x6[1] = _mm_sub_epi64(s2[1], s6[1]);
  x7[0] = _mm_sub_epi64(s3[0], s7[0]);
  x7[1] = _mm_sub_epi64(s3[1], s7[1]);

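  // Round-shift the 64-bit stage-1 results and repack them into 32-bit lanes.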
  x0[0] = dct_const_round_shift_64bit(x0[0]);
  x0[1] = dct_const_round_shift_64bit(x0[1]);
  x1[0] = dct_const_round_shift_64bit(x1[0]);
  x1[1] = dct_const_round_shift_64bit(x1[1]);
  x2[0] = dct_const_round_shift_64bit(x2[0]);
  x2[1] = dct_const_round_shift_64bit(x2[1]);
  x3[0] = dct_const_round_shift_64bit(x3[0]);
  x3[1] = dct_const_round_shift_64bit(x3[1]);
  x4[0] = dct_const_round_shift_64bit(x4[0]);
  x4[1] = dct_const_round_shift_64bit(x4[1]);
  x5[0] = dct_const_round_shift_64bit(x5[0]);
  x5[1] = dct_const_round_shift_64bit(x5[1]);
  x6[0] = dct_const_round_shift_64bit(x6[0]);
  x6[1] = dct_const_round_shift_64bit(x6[1]);
  x7[0] = dct_const_round_shift_64bit(x7[0]);
  x7[1] = dct_const_round_shift_64bit(x7[1]);
  s0[0] = pack_4(x0[0], x0[1]);  // s0 = x0;
  s1[0] = pack_4(x1[0], x1[1]);  // s1 = x1;
  s2[0] = pack_4(x2[0], x2[1]);  // s2 = x2;
  s3[0] = pack_4(x3[0], x3[1]);  // s3 = x3;
  x4[0] = pack_4(x4[0], x4[1]);
  x5[0] = pack_4(x5[0], x5[1]);
  x6[0] = pack_4(x6[0], x6[1]);
  x7[0] = pack_4(x7[0], x7[1]);

  // stage 2
  x0[0] = _mm_add_epi32(s0[0], s2[0]);
  x1[0] = _mm_add_epi32(s1[0], s3[0]);
  x2[0] = _mm_sub_epi32(s0[0], s2[0]);
  x3[0] = _mm_sub_epi32(s1[0], s3[0]);

  highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
  highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);

  x4[0] = _mm_add_epi64(s4[0], s6[0]);
  x4[1] = _mm_add_epi64(s4[1], s6[1]);
  x5[0] = _mm_add_epi64(s5[0], s7[0]);
  x5[1] = _mm_add_epi64(s5[1], s7[1]);
  x6[0] = _mm_sub_epi64(s4[0], s6[0]);
  x6[1] = _mm_sub_epi64(s4[1], s6[1]);
  x7[0] = _mm_sub_epi64(s5[0], s7[0]);
  x7[1] = _mm_sub_epi64(s5[1], s7[1]);
  x4[0] = dct_const_round_shift_64bit(x4[0]);
  x4[1] = dct_const_round_shift_64bit(x4[1]);
  x5[0] = dct_const_round_shift_64bit(x5[0]);
  x5[1] = dct_const_round_shift_64bit(x5[1]);
  x6[0] = dct_const_round_shift_64bit(x6[0]);
  x6[1] = dct_const_round_shift_64bit(x6[1]);
  x7[0] = dct_const_round_shift_64bit(x7[0]);
  x7[1] = dct_const_round_shift_64bit(x7[1]);
  x4[0] = pack_4(x4[0], x4[1]);
  x5[0] = pack_4(x5[0], x5[1]);
  x6[0] = pack_4(x6[0], x6[1]);
  x7[0] = pack_4(x7[0], x7[1]);

  // stage 3
  s2[0] = _mm_add_epi32(x2[0], x3[0]);
  s3[0] = _mm_sub_epi32(x2[0], x3[0]);
  s6[0] = _mm_add_epi32(x6[0], x7[0]);
  s7[0] = _mm_sub_epi32(x6[0], x7[0]);
  highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2);
  highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
  highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
  highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);

  x2[0] = dct_const_round_shift_64bit(s2[0]);
  x2[1] = dct_const_round_shift_64bit(s2[1]);
  x3[0] = dct_const_round_shift_64bit(s3[0]);
  x3[1] = dct_const_round_shift_64bit(s3[1]);
  x6[0] = dct_const_round_shift_64bit(s6[0]);
  x6[1] = dct_const_round_shift_64bit(s6[1]);
  x7[0] = dct_const_round_shift_64bit(s7[0]);
  x7[1] = dct_const_round_shift_64bit(s7[1]);
  x2[0] = pack_4(x2[0], x2[1]);
  x3[0] = pack_4(x3[0], x3[1]);
  x6[0] = pack_4(x6[0], x6[1]);
  x7[0] = pack_4(x7[0], x7[1]);

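  // Write the outputs with the sign flips the inverse ADST requires:
  // outputs 1, 3, 5 and 7 are negated.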
  io[0] = x0[0];
  io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
  io[2] = x6[0];
  io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]);
  io[4] = x3[0];
  io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]);
  io[6] = x5[0];
  io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
}

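// 2-D inverse hybrid transform of an 8x8 high-bitdepth block: a DCT or ADST
// is applied to the rows and then to the columns, as selected by tx_type, and
// the reconstructed residual is added to dest.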
void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                     int stride, int tx_type, int bd) {
  __m128i io[16];

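  // Load the 8x8 block as four 4x4 tiles of 32-bit coefficients:
  // io[0..3] top-left, io[4..7] top-right, io[8..11] bottom-left,
  // io[12..15] bottom-right.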
  io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
  io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
  io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
  io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
  io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
  io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
  io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
  io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
  io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
  io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
  io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
  io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
  io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
  io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
  io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
  io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));

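  // At 8-bit depth the coefficients fit in 16 bits, so pack them down and
  // take the faster 16-bit SSE2 path.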
  if (bd == 8) {
    __m128i io_short[8];

    io_short[0] = _mm_packs_epi32(io[0], io[4]);
    io_short[1] = _mm_packs_epi32(io[1], io[5]);
    io_short[2] = _mm_packs_epi32(io[2], io[6]);
    io_short[3] = _mm_packs_epi32(io[3], io[7]);
    io_short[4] = _mm_packs_epi32(io[8], io[12]);
    io_short[5] = _mm_packs_epi32(io[9], io[13]);
    io_short[6] = _mm_packs_epi32(io[10], io[14]);
    io_short[7] = _mm_packs_epi32(io[11], io[15]);

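    // First pass: transform the rows.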
    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
      vpx_idct8_sse2(io_short);
    } else {
      iadst8_sse2(io_short);
    }
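    // Second pass: transform the columns.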
    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
      vpx_idct8_sse2(io_short);
    } else {
      iadst8_sse2(io_short);
    }
    round_shift_8x8(io_short, io);
  } else {
    __m128i temp[4];

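    // First pass: transform the rows, one 8x4 half-block per call.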
    if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
      vpx_highbd_idct8x8_half1d_sse4_1(io);
      vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
    } else {
      highbd_iadst8_sse4_1(io);
      highbd_iadst8_sse4_1(&io[8]);
    }

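    // Regroup the half-block results: the column pass needs io[0..3] paired
    // with io[8..11], and the saved io[4..7] paired with io[12..15].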
    temp[0] = io[4];
    temp[1] = io[5];
    temp[2] = io[6];
    temp[3] = io[7];
    io[4] = io[8];
    io[5] = io[9];
    io[6] = io[10];
    io[7] = io[11];

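    // Second pass: transform the columns.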
    if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
      vpx_highbd_idct8x8_half1d_sse4_1(io);
      io[8] = temp[0];
      io[9] = temp[1];
      io[10] = temp[2];
      io[11] = temp[3];
      vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
    } else {
      highbd_iadst8_sse4_1(io);
      io[8] = temp[0];
      io[9] = temp[1];
      io[10] = temp[2];
      io[11] = temp[3];
      highbd_iadst8_sse4_1(&io[8]);
    }
    highbd_idct8x8_final_round(io);
  }
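  // Add the reconstructed residual to the prediction in dest, clamping to the
  // bit depth.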
  recon_and_store_8x8(io, dest, stride, bd);
}