• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <emmintrin.h>  // SSE2
12 
13 #include "./vpx_config.h"
14 #include "vpx_dsp/vpx_dsp_common.h"
15 #include "vpx_dsp/x86/fwd_txfm_sse2.h"
16 
// DC-only 4x4 forward transform: output[0] = (sum of all 16 residuals) << 1.
// Remaining lanes of the stored vector carry reduction by-products, exactly
// as in the scalar-free reduction below; callers only consume the DC term.
void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i rows_a, rows_b, acc, lo32, hi32;

  // Gather the four 4-sample rows, two rows per register.
  rows_a = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  rows_b = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  rows_b = _mm_unpacklo_epi64(
      rows_b, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  rows_a = _mm_unpacklo_epi64(
      rows_a, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));

  // Eight 16-bit partial sums (wrapping adds; arithmetic is modular).
  acc = _mm_add_epi16(rows_a, rows_b);

  // Sign-extend the 16-bit partial sums to 32 bits: interleave each word
  // into the high half of a dword, then arithmetic-shift back down.
  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc), 16);
  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc), 16);

  // Horizontal reduction of the remaining 32-bit lanes into lane 0.
  acc = _mm_add_epi32(lo32, hi32);
  lo32 = _mm_unpacklo_epi32(acc, zero);
  hi32 = _mm_unpackhi_epi32(acc, zero);
  acc = _mm_add_epi32(lo32, hi32);
  lo32 = _mm_srli_si128(acc, 8);
  hi32 = _mm_add_epi32(acc, lo32);

  // The 4x4 DC term is scaled by 2 (sum << 1), matching the C reference.
  lo32 = _mm_slli_epi32(hi32, 1);
  store_output(&lo32, output);
}
45 
// DC-only 8x8 forward transform: output[0] = sum of all 64 residuals
// (no scaling for the 8x8 case).
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i r0, r1, r2, r3, acc, lo32, hi32;

  // Accumulate rows 0-3 (16-bit wrapping adds; order-independent).
  r0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  r1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  r2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  r3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  acc = _mm_add_epi16(_mm_add_epi16(r0, r1), _mm_add_epi16(r2, r3));

  // Accumulate rows 4-7.
  r0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  r1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  r2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  r3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
  acc = _mm_add_epi16(acc, _mm_add_epi16(r0, r1));
  acc = _mm_add_epi16(acc, _mm_add_epi16(r2, r3));

  // Sign-extend the eight 16-bit column sums to 32 bits.
  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc), 16);
  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc), 16);

  // Horizontal reduction to a single 32-bit total in lane 0.
  acc = _mm_add_epi32(lo32, hi32);
  lo32 = _mm_unpacklo_epi32(acc, zero);
  hi32 = _mm_unpackhi_epi32(acc, zero);
  acc = _mm_add_epi32(lo32, hi32);
  lo32 = _mm_srli_si128(acc, 8);
  hi32 = _mm_add_epi32(acc, lo32);

  store_output(&hi32, output);
}
85 
// DC-only 16x16 forward transform:
// output[0] = (sum of all 256 residuals) >> 1.
void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = _mm_setzero_si128();
  __m128i r0, r1, r2, r3, lo32, hi32;
  const int16_t *src;
  int half, row;

  // Two passes over 8-column slices: columns 0-7, then columns 8-15.
  for (half = 0; half < 2; ++half) {
    src = input + 8 * half;
    // Sum all 16 rows of this slice, four rows at a time.  The 16-bit
    // adds wrap (modular), so re-association does not change the result.
    for (row = 0; row < 16; row += 4) {
      r0 = _mm_load_si128((const __m128i *)(src + (row + 0) * stride));
      r1 = _mm_load_si128((const __m128i *)(src + (row + 1) * stride));
      r2 = _mm_load_si128((const __m128i *)(src + (row + 2) * stride));
      r3 = _mm_load_si128((const __m128i *)(src + (row + 3) * stride));
      acc = _mm_add_epi16(acc, _mm_add_epi16(r0, r1));
      acc = _mm_add_epi16(acc, _mm_add_epi16(r2, r3));
    }
  }

  // Sign-extend the eight 16-bit lane sums to 32 bits.
  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc), 16);
  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc), 16);

  // Horizontal reduction to a single 32-bit total in lane 0.
  acc = _mm_add_epi32(lo32, hi32);
  lo32 = _mm_unpacklo_epi32(acc, zero);
  hi32 = _mm_unpackhi_epi32(acc, zero);
  acc = _mm_add_epi32(lo32, hi32);
  lo32 = _mm_srli_si128(acc, 8);
  hi32 = _mm_add_epi32(acc, lo32);

  // 16x16 DC scaling: arithmetic shift right by 1 (sum >> 1).
  hi32 = _mm_srai_epi32(hi32, 1);
  store_output(&hi32, output);
}
154 
// DC-only 32x32 forward transform:
// output[0] = (sum of all 1024 residuals) >> 3.
// NOTE(review): each 16-bit accumulator lane sums 128 samples; for 8-bit
// content (|residual| <= 255) the per-lane magnitude is at most 32640, so
// the wrapping 16-bit adds do not overflow — confirm if inputs can exceed
// that range.
void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = _mm_setzero_si128();
  __m128i c0, c1, c2, c3, lo32, hi32;
  int row;

  // Sum all 32 rows; each row is four aligned 8-sample vectors, so lane k
  // of acc accumulates columns k, k+8, k+16 and k+24.
  for (row = 0; row < 32; ++row) {
    c0 = _mm_load_si128((const __m128i *)(input + 0));
    c1 = _mm_load_si128((const __m128i *)(input + 8));
    c2 = _mm_load_si128((const __m128i *)(input + 16));
    c3 = _mm_load_si128((const __m128i *)(input + 24));
    input += stride;
    acc = _mm_add_epi16(acc, _mm_add_epi16(c0, c1));
    acc = _mm_add_epi16(acc, _mm_add_epi16(c2, c3));
  }

  // Sign-extend the eight 16-bit lane sums to 32 bits.
  lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(zero, acc), 16);
  hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, acc), 16);

  // Horizontal reduction to a single 32-bit total in lane 0.
  acc = _mm_add_epi32(lo32, hi32);
  lo32 = _mm_unpacklo_epi32(acc, zero);
  hi32 = _mm_unpackhi_epi32(acc, zero);
  acc = _mm_add_epi32(lo32, hi32);
  lo32 = _mm_srli_si128(acc, 8);
  hi32 = _mm_add_epi32(acc, lo32);

  // 32x32 DC scaling: arithmetic shift right by 3 (sum >> 3).
  hi32 = _mm_srai_epi32(hi32, 3);
  store_output(&hi32, output);
}
226 
/* The full 2-D transforms are generated from shared template headers:
 * each block below defines the output function name(s) plus the precision
 * knobs, includes the implementation header to expand the code under those
 * names, then undefines the macros so the next instantiation is clean. */

/* Low bit depth: 4x4, 8x8 and 16x16 from the fwd_txfm template. */
#define DCT_HIGH_BIT_DEPTH 0
#define FDCT4x4_2D vpx_fdct4x4_sse2
#define FDCT8x8_2D vpx_fdct8x8_sse2
#define FDCT16x16_2D vpx_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
#undef  FDCT4x4_2D
#undef  FDCT8x8_2D
#undef  FDCT16x16_2D

/* 32x32 "rd" variant: reduced-precision rounding path. */
#define FDCT32x32_2D vpx_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
#undef  FDCT32x32_2D
#undef  FDCT32x32_HIGH_PRECISION

/* 32x32 full-precision variant. */
#define FDCT32x32_2D vpx_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
#undef  FDCT32x32_2D
#undef  FDCT32x32_HIGH_PRECISION
#undef  DCT_HIGH_BIT_DEPTH

#if CONFIG_VP9_HIGHBITDEPTH
/* High bit depth: the same templates expanded with overflow-aware paths
 * (DCT_HIGH_BIT_DEPTH 1) under the vpx_highbd_* names. */
#define DCT_HIGH_BIT_DEPTH 1
#define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
#define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
#define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
#include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
#undef  FDCT4x4_2D
#undef  FDCT8x8_2D
#undef  FDCT16x16_2D

#define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
#undef  FDCT32x32_2D
#undef  FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
#undef  FDCT32x32_2D
#undef  FDCT32x32_HIGH_PRECISION
#undef  DCT_HIGH_BIT_DEPTH
#endif  // CONFIG_VP9_HIGHBITDEPTH
272