/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
#include <string.h>

#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"
14 
// 4x4 inverse hybrid transform followed by reconstruction (SSE2).
//
// Loads 16 int16_t coefficients from |input|, applies the pair of inverse
// transform helpers selected by |tx_type|, rounds the result as
// (x + 8) >> 4, adds it to the 4x4 block of bytes at |dest| (consecutive rows
// are |stride| bytes apart) and stores the sums back, packed to bytes with
// unsigned saturation.
//
// |tx_type| must be 0..3 (see the case labels below). Any other value hits
// assert(0); when assertions are compiled out, no transform helper is called
// and the rounded input is added to |dest| as is.
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[2];
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  // Unaligned loads: two registers of eight coefficients each.
  in[0] = _mm_loadu_si128((const __m128i *)(input));
  in[1] = _mm_loadu_si128((const __m128i *)(input + 8));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_sse2(in);
      idct4_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_sse2(in);
      iadst4_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_sse2(in);
      idct4_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_sse2(in);
      iadst4_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  // Reconstruction and store
  {
    uint8_t *const row0 = dest;
    uint8_t *const row1 = dest + stride;
    uint8_t *const row2 = dest + stride * 2;
    uint8_t *const row3 = dest + stride * 3;
    int p0, p1, p2, p3;
    __m128i d0, d2, recon;

    // Each row is four bytes. They are moved with memcpy instead of
    // dereferencing |dest| as an int pointer: |dest| is a byte buffer, so the
    // cast would be type punning through an incompatible pointer type and
    // would also assume an alignment nothing here guarantees.
    memcpy(&p0, row0, sizeof(p0));
    memcpy(&p1, row1, sizeof(p1));
    memcpy(&p2, row2, sizeof(p2));
    memcpy(&p3, row3, sizeof(p3));

    // d0 = rows 0 and 1, d2 = rows 2 and 3, widened from bytes to 16 bits.
    d0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(p0), _mm_cvtsi32_si128(p1));
    d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(p2), _mm_cvtsi32_si128(p3));
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);

    d0 = _mm_add_epi16(d0, in[0]);
    d2 = _mm_add_epi16(d2, in[1]);

    // Pack back to bytes with unsigned saturation; 32-bit lane k is row k.
    recon = _mm_packus_epi16(d0, d2);

    p0 = _mm_cvtsi128_si32(recon);
    p1 = _mm_cvtsi128_si32(_mm_srli_si128(recon, 4));
    p2 = _mm_cvtsi128_si32(_mm_srli_si128(recon, 8));
    p3 = _mm_cvtsi128_si32(_mm_srli_si128(recon, 12));

    memcpy(row0, &p0, sizeof(p0));
    memcpy(row1, &p1, sizeof(p1));
    memcpy(row2, &p2, sizeof(p2));
    memcpy(row3, &p3, sizeof(p3));
  }
}
79 
// 8x8 inverse hybrid transform followed by reconstruction (SSE2).
//
// Loads 64 int16_t coefficients from |input| with aligned 128-bit loads (so
// |input| must be 16-byte aligned), applies the pair of inverse transform
// helpers selected by |tx_type|, rounds the result as (x + 16) >> 5 using a
// saturating add, and hands each of the eight result registers to
// RECON_AND_STORE together with the matching row address in |dest|
// (consecutive rows are |stride| bytes apart).
//
// |tx_type|: 0 = DCT_DCT, 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST.
// Any other value hits assert(0); when assertions are compiled out, no
// transform helper is called.
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  // NOTE(review): |zero| is not referenced directly in this function;
  // presumably the RECON_AND_STORE macro picks it up by name. Confirm before
  // renaming or removing it.
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  int row;

  // load input data, eight coefficients per register
  for (row = 0; row < 8; ++row) {
    in[row] = _mm_load_si128((const __m128i *)(input + 8 * row));
  }

  if (tx_type < 0 || tx_type > 3) {
    assert(0);
  } else {
    // The first helper call is ADST for types 2 and 3, the second one is
    // ADST for types 1 and 3; every other combination uses DCT.
    const int first_is_adst = (tx_type == 2 || tx_type == 3);
    const int second_is_adst = (tx_type == 1 || tx_type == 3);

    if (first_is_adst) {
      iadst8_sse2(in);
    } else {
      idct8_sse2(in);
    }

    if (second_is_adst) {
      iadst8_sse2(in);
    } else {
      idct8_sse2(in);
    }
  }

  // Final rounding and shift
  for (row = 0; row < 8; ++row) {
    in[row] = _mm_srai_epi16(_mm_adds_epi16(in[row], final_rounding), 5);
  }

  // Reconstruction and store, one row per register
  for (row = 0; row < 8; ++row) {
    RECON_AND_STORE(dest + row * stride, in[row]);
  }
}
146 
// 16x16 inverse hybrid transform followed by reconstruction (SSE2).
//
// The block is handled as two 16-register buffers: the first is loaded from
// |input| and written to |dest|, the second is loaded from |input| + 8 and
// written to |dest| + 8 (presumably the left and right 8-column halves --
// confirm against load_buffer_8x16 / write_buffer_8x16). Between load and
// store, the pair of inverse transform helpers selected by |tx_type| is
// applied to both buffers together. |stride| is forwarded unchanged to
// write_buffer_8x16.
//
// |tx_type|: 0 = DCT_DCT, 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST.
// Any other value hits assert(0); when assertions are compiled out, no
// transform helper is called.
void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i half0[16], half1[16];
  const int16_t *const input_half1 = input + 8;
  uint8_t *const dest_half1 = dest + 8;

  load_buffer_8x16(input, half0);
  load_buffer_8x16(input_half1, half1);

  if (tx_type < 0 || tx_type > 3) {
    assert(0);
  } else {
    // The first helper call is ADST for types 2 and 3, the second one is
    // ADST for types 1 and 3; every other combination uses DCT.
    const int first_is_adst = (tx_type == 2 || tx_type == 3);
    const int second_is_adst = (tx_type == 1 || tx_type == 3);

    if (first_is_adst) {
      iadst16_sse2(half0, half1);
    } else {
      idct16_sse2(half0, half1);
    }

    if (second_is_adst) {
      iadst16_sse2(half0, half1);
    } else {
      idct16_sse2(half0, half1);
    }
  }

  write_buffer_8x16(dest, half0, stride);
  write_buffer_8x16(dest_half1, half1, stride);
}
181