/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_

#include <assert.h>     // assert() in the multiplication helpers below
#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/inv_txfm.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

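// Spread the four 32-bit elements of 'in' across two registers so that each
// element occupies a 64-bit lane; the upper copy in each lane is a
// don't-care for _mm_mul_epu32(), which reads only the low 32 bits.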
static INLINE void extend_64bit(const __m128i in,
                                __m128i *const out /*out[2]*/) {
  out[0] = _mm_unpacklo_epi32(in, in);  // 0, 0, 1, 1
  out[1] = _mm_unpackhi_epi32(in, in);  // 2, 2, 3, 3
}

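// Round-shift the eight 32-bit inputs right by 4 (or by 5 in the _shift5
// variant below) and pack them, with signed saturation, into one vector of
// eight 16-bit results. 'rounding' holds the matching bias of
// 1 << (shift - 1), e.g. _mm_set1_epi32(16) for the shift-5 variant as used
// by highbd_idct8x8_final_round() below.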
static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1,
                                           const __m128i rounding) {
  __m128i temp[2];
  temp[0] = _mm_add_epi32(in0, rounding);
  temp[1] = _mm_add_epi32(in1, rounding);
  temp[0] = _mm_srai_epi32(temp[0], 4);
  temp[1] = _mm_srai_epi32(temp[1], 4);
  return _mm_packs_epi32(temp[0], temp[1]);
}

static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1,
                                           const __m128i rounding) {
  __m128i temp[2];
  temp[0] = _mm_add_epi32(in0, rounding);
  temp[1] = _mm_add_epi32(in1, rounding);
  temp[0] = _mm_srai_epi32(temp[0], 5);
  temp[1] = _mm_srai_epi32(temp[1], 5);
  return _mm_packs_epi32(temp[0], temp[1]);
}

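// 64-bit variant of dct_const_round_shift(). The cosine constants fed to the
// multiply helpers are pre-scaled by << 2, so the full shift here is
// DCT_CONST_BITS (14) plus 2, i.e. 16 bits, implemented as a cheap 2-byte
// _mm_srli_si128(). The two shifted results land in 32-bit lanes 0 and 2 and
// are collected later by pack_4().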
static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) {
  const __m128i t =
      _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0));
  return _mm_srli_si128(t, 2);
}

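// Reorder four 32-bit results into a single vector: in0 holds values 0 and 1
// in lanes 0 and 2, in1 holds values 2 and 3 in lanes 0 and 2.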
static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) {
  const __m128i t0 = _mm_unpacklo_epi32(in0, in1);  // 0, 2
  const __m128i t1 = _mm_unpackhi_epi32(in0, in1);  // 1, 3
  return _mm_unpacklo_epi32(t0, t1);                // 0, 1, 2, 3
}

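// Split 'in' into magnitude and sign so the unsigned _mm_mul_epu32() can be
// used: out[] receives |in| with each element duplicated into a 64-bit lane
// (as in extend_64bit()), and sign[] receives the matching 64-bit
// all-ones/all-zeros sign masks.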
static INLINE void abs_extend_64bit_sse2(const __m128i in,
                                         __m128i *const out /*out[2]*/,
                                         __m128i *const sign /*sign[2]*/) {
  sign[0] = _mm_srai_epi32(in, 31);
  out[0] = _mm_xor_si128(in, sign[0]);
  out[0] = _mm_sub_epi32(out[0], sign[0]);
  sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]);  // 64-bit sign of 2, 3
  sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]);  // 64-bit sign of 0, 1
  out[1] = _mm_unpackhi_epi32(out[0], out[0]);     // 2, 3
  out[0] = _mm_unpacklo_epi32(out[0], out[0]);     // 0, 1
}

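// Unsigned 32x32 -> 64-bit multiply of the magnitudes in lanes 0 and 2 by
// the pre-scaled cosine constant, then a conditional negate to restore the
// original sign: (x ^ sign) - sign negates x when 'sign' is all ones and is
// a no-op when it is zero.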
// Note: cospi must be non-negative.
static INLINE __m128i multiply_apply_sign_sse2(const __m128i in,
                                               const __m128i sign,
                                               const __m128i cospi) {
  __m128i out = _mm_mul_epu32(in, cospi);
  out = _mm_xor_si128(out, sign);
  return _mm_sub_epi64(out, sign);
}

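// Compute dct_const_round_shift(in * c) for four 32-bit values at 64-bit
// intermediate precision. 'in' and 'sign' are the magnitude/sign pair
// produced by abs_extend_64bit_sse2().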
// Note: c must be non-negative.
static INLINE __m128i multiplication_round_shift_sse2(
    const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/,
    const int c) {
  const __m128i pair_c = pair_set_epi32(c << 2, 0);
  __m128i t0, t1;

  assert(c >= 0);
  t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
  t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
  t0 = dct_const_round_shift_64bit(t0);
  t1 = dct_const_round_shift_64bit(t1);

  return pack_4(t0, t1);
}

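// Same as multiplication_round_shift_sse2(), but negates the products before
// the round shift, yielding dct_const_round_shift(-in * c).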
// Note: c must be non-negative.
static INLINE __m128i multiplication_neg_round_shift_sse2(
    const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/,
    const int c) {
  const __m128i pair_c = pair_set_epi32(c << 2, 0);
  __m128i t0, t1;

  assert(c >= 0);
  t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
  t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
  t0 = _mm_sub_epi64(_mm_setzero_si128(), t0);
  t1 = _mm_sub_epi64(_mm_setzero_si128(), t1);
  t0 = dct_const_round_shift_64bit(t0);
  t1 = dct_const_round_shift_64bit(t1);

  return pack_4(t0, t1);
}

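// Butterfly at 64-bit intermediate precision:
//   *out0 = dct_const_round_shift(in0 * c0 - in1 * c1)
//   *out1 = dct_const_round_shift(in0 * c1 + in1 * c0)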
// Note: c0 and c1 must be non-negative.
static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1,
                                         const int c0, const int c1,
                                         __m128i *const out0,
                                         __m128i *const out1) {
  const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0);
  const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
  __m128i temp1[4], temp2[4], sign1[2], sign2[2];

  assert(c0 >= 0);
  assert(c1 >= 0);
  abs_extend_64bit_sse2(in0, temp1, sign1);
  abs_extend_64bit_sse2(in1, temp2, sign2);
  temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1);
  temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1);
  temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0);
  temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0);
  temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0);
  temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0);
  temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1);
  temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1);
  temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
  temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
  temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
  temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
  *out0 = pack_4(temp1[0], temp1[1]);
  *out1 = pack_4(temp2[0], temp2[1]);
}

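// Partial butterfly (single input):
//   *out0 = dct_const_round_shift(in * c0)
//   *out1 = dct_const_round_shift(in * c1)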
// Note: c0 and c1 must be non-negative.
static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0,
                                                 const int c1,
                                                 __m128i *const out0,
                                                 __m128i *const out1) {
  __m128i temp[2], sign[2];

  assert(c0 >= 0);
  assert(c1 >= 0);
  abs_extend_64bit_sse2(in, temp, sign);
  *out0 = multiplication_round_shift_sse2(temp, sign, c0);
  *out1 = multiplication_round_shift_sse2(temp, sign, c1);
}

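// Partial butterfly with a negated first output:
//   *out0 = dct_const_round_shift(-in * c1)
//   *out1 = dct_const_round_shift(in * c0)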
// Note: c0 and c1 must be non-negative.
static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in,
                                                     const int c0, const int c1,
                                                     __m128i *const out0,
                                                     __m128i *const out1) {
  __m128i temp[2], sign[2];

  assert(c0 >= 0);
  assert(c1 >= 0);
  abs_extend_64bit_sse2(in, temp, sign);
  *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1);
  *out1 = multiplication_round_shift_sse2(temp, sign, c0);
}

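// Butterfly specialized for the common c0 == c1 == cospi_16_64 case:
//   *out0 = dct_const_round_shift((in0 + in1) * cospi_16_64)
//   *out1 = dct_const_round_shift((in0 - in1) * cospi_16_64)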
static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0,
                                                 const __m128i in1,
                                                 __m128i *const out0,
                                                 __m128i *const out1) {
  __m128i temp1[2], temp2, sign[2];

  temp2 = _mm_add_epi32(in0, in1);
  abs_extend_64bit_sse2(temp2, temp1, sign);
  *out0 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
  temp2 = _mm_sub_epi32(in0, in1);
  abs_extend_64bit_sse2(temp2, temp1, sign);
  *out1 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64);
}

// Addition/subtraction-only butterfly; size is 16 or 32.
static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out,
                                            int size) {
  int i = 0;
  const int num = size >> 1;
  const int bound = size - 1;
  while (i < num) {
    out[i] = _mm_add_epi32(in[i], in[bound - i]);
    out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]);
    i++;
  }
}

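// Stage 4 add/sub butterfly of the 8-point idct:
//   out[i] = in[i] + in[7 - i] for i = 0..3
//   out[i] = in[7 - i] - in[i] for i = 4..7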
static INLINE void highbd_idct8_stage4(const __m128i *const in,
                                       __m128i *const out) {
  out[0] = _mm_add_epi32(in[0], in[7]);
  out[1] = _mm_add_epi32(in[1], in[6]);
  out[2] = _mm_add_epi32(in[2], in[5]);
  out[3] = _mm_add_epi32(in[3], in[4]);
  out[4] = _mm_sub_epi32(in[3], in[4]);
  out[5] = _mm_sub_epi32(in[2], in[5]);
  out[6] = _mm_sub_epi32(in[1], in[6]);
  out[7] = _mm_sub_epi32(in[0], in[7]);
}

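// Final rounding of the 8x8 idct: apply (x + 16) >> 5 to the 32-bit values
// and pack each pair (io[i], io[i + 8]) into one vector of eight 16-bit
// results in io[0..7].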
static INLINE void highbd_idct8x8_final_round(__m128i *const io) {
  io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16));
  io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16));
  io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16));
  io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16));
  io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16));
  io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16));
  io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
  io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
}

static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
                                             __m128i *const out) {
  out[0] = _mm_add_epi32(in[0], in[15]);
  out[1] = _mm_add_epi32(in[1], in[14]);
  out[2] = _mm_add_epi32(in[2], in[13]);
  out[3] = _mm_add_epi32(in[3], in[12]);
  out[4] = _mm_add_epi32(in[4], in[11]);
  out[5] = _mm_add_epi32(in[5], in[10]);
  out[6] = _mm_add_epi32(in[6], in[9]);
  out[7] = _mm_add_epi32(in[7], in[8]);
  out[8] = _mm_sub_epi32(in[7], in[8]);
  out[9] = _mm_sub_epi32(in[6], in[9]);
  out[10] = _mm_sub_epi32(in[5], in[10]);
  out[11] = _mm_sub_epi32(in[4], in[11]);
  out[12] = _mm_sub_epi32(in[3], in[12]);
  out[13] = _mm_sub_epi32(in[2], in[13]);
  out[14] = _mm_sub_epi32(in[1], in[14]);
  out[15] = _mm_sub_epi32(in[0], in[15]);
}

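// Add the 16-bit residual to the prediction with signed saturation, then
// clamp the sum to the valid pixel range [0, (1 << bd) - 1] for bit depth
// bd.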
static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
                                const int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  // Faster than _mm_set1_epi16((1 << bd) - 1).
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  __m128i d;

  d = _mm_adds_epi16(in0, in1);
  d = _mm_max_epi16(d, zero);
  d = _mm_min_epi16(d, max);

  return d;
}

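// DC-only idct path: both transform passes collapse to a multiplication by
// cospi_16_64 with a round shift, so the lone DC coefficient becomes a
// single value a1 (rounded by (out + 16) >> 5 for 8x8 and (out + 32) >> 6
// otherwise) that is added, with clamping, to every pixel of the
// size x size block.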
static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,
                                            uint16_t *dest, int stride, int bd,
                                            const int size) {
  int a1, i, j;
  tran_low_t out;
  __m128i dc, d;

  out = HIGHBD_WRAPLOW(
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
  out =
      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6);
  dc = _mm_set1_epi16(a1);

  for (i = 0; i < size; ++i) {
    for (j = 0; j < size; j += 8) {
      d = _mm_load_si128((const __m128i *)(&dest[j]));
      d = add_clamp(d, dc, bd);
      _mm_store_si128((__m128i *)(&dest[j]), d);
    }
    dest += stride;
  }
}

static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest,
                                     const int bd) {
  __m128i d;

  d = _mm_loadl_epi64((const __m128i *)dest);
  d = add_clamp(d, in, bd);
  _mm_storel_epi64((__m128i *)dest, d);
}

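// Reconstruct two rows of four pixels with one add_clamp(): the second row
// is loaded into (and stored from) the upper 64 bits of the register via
// the float loadh/storeh intrinsics.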
static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest,
                                       const int stride, const int bd) {
  __m128i d;

  d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
  d = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride)));
  d = add_clamp(d, in, bd);
  _mm_storel_epi64((__m128i *)(dest + 0 * stride), d);
  _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d));
}

static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest,
                                       const int stride, const int bd) {
  recon_and_store_4x2(in[0], dest, stride, bd);
  dest += 2 * stride;
  recon_and_store_4x2(in[1], dest, stride, bd);
}

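// Reconstruct one row of eight pixels and advance *dest to the next row.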
static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest,
                                     const int stride, const int bd) {
  __m128i d;

  d = _mm_load_si128((const __m128i *)(*dest));
  d = add_clamp(d, in, bd);
  _mm_store_si128((__m128i *)(*dest), d);
  *dest += stride;
}

static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest,
                                       const int stride, const int bd) {
  recon_and_store_8(in[0], &dest, stride, bd);
  recon_and_store_8(in[1], &dest, stride, bd);
  recon_and_store_8(in[2], &dest, stride, bd);
  recon_and_store_8(in[3], &dest, stride, bd);
  recon_and_store_8(in[4], &dest, stride, bd);
  recon_and_store_8(in[5], &dest, stride, bd);
  recon_and_store_8(in[6], &dest, stride, bd);
  recon_and_store_8(in[7], &dest, stride, bd);
}

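// Load eight 32-bit coefficients and pack them, with signed saturation, into
// one vector of eight 16-bit values.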
static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
  const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
  const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
  return _mm_packs_epi32(t0, t1);
}

static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input,
                                                        const int stride,
                                                        __m128i *const in) {
  in[0] = load_pack_8_32bit(input + 0 * stride);
  in[1] = load_pack_8_32bit(input + 1 * stride);
  in[2] = load_pack_8_32bit(input + 2 * stride);
  in[3] = load_pack_8_32bit(input + 3 * stride);
  in[4] = load_pack_8_32bit(input + 4 * stride);
  in[5] = load_pack_8_32bit(input + 5 * stride);
  in[6] = load_pack_8_32bit(input + 6 * stride);
  in[7] = load_pack_8_32bit(input + 7 * stride);
  transpose_16bit_8x8(in, in);
}

static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input,
                                                   const int stride,
                                                   __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
  in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4));
  in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
  in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4));
  in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
  in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4));
  in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
  in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4));
  transpose_32bit_8x4(in, in);
}

static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input,
                                                   const int stride,
                                                   __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
  transpose_32bit_4x4(in, in);
}

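// Apply the final (x + 32) >> 6 rounding to eight 16-bit results and
// reconstruct one row of pixels.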
static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in,
                                         const int bd) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  __m128i out;

  out = _mm_adds_epi16(in, final_rounding);
  out = _mm_srai_epi16(out, 6);
  recon_and_store_8(out, &dest, 0, bd);
}

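// Same final rounding as highbd_write_buffer_8(), but performed at 32-bit
// precision before packing down to 16 bits, for a row of four pixels.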
static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in,
                                         const int bd) {
  const __m128i final_rounding = _mm_set1_epi32(1 << 5);
  __m128i out;

  out = _mm_add_epi32(in, final_rounding);
  out = _mm_srai_epi32(out, 6);
  out = _mm_packs_epi32(out, out);
  recon_and_store_4(out, dest, bd);
}

#endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_