/*
 * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <xmmintrin.h>

#include "./vp8_rtcd.h"
#include "./vpx_config.h"
#include "vp8/common/filter.h"
#include "vpx_dsp/x86/mem_sse2.h"
#include "vpx_ports/mem.h"

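// Each predictor below runs the VP8 2-tap bilinear filter as two passes: a
// horizontal pass into an aligned 16-bit intermediate buffer with one extra
// row of context, followed by a vertical pass that rounds and packs the
// result back to 8-bit pixels.

// Horizontal pass for the 16x16 predictor: filter 17 rows of 16 pixels into
// 16-bit intermediate values. When xoffset is 0 the rows are copied through
// unchanged.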
static INLINE void horizontal_16x16(uint8_t *src, const int stride,
                                    uint16_t *dst, const int xoffset) {
  int h;
  const __m128i zero = _mm_setzero_si128();

  if (xoffset == 0) {
    for (h = 0; h < 17; ++h) {
      const __m128i a = _mm_loadu_si128((__m128i *)src);
      const __m128i a_lo = _mm_unpacklo_epi8(a, zero);
      const __m128i a_hi = _mm_unpackhi_epi8(a, zero);
      _mm_store_si128((__m128i *)dst, a_lo);
      _mm_store_si128((__m128i *)(dst + 8), a_hi);
      src += stride;
      dst += 16;
    }
    return;
  }

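  // Each output value is (a * filter[0] + b * filter[1] + rounding) >>
  // VP8_FILTER_SHIFT, where b is the pixel immediately to the right of a and
  // the two taps of vp8_bilinear_filters sum to 1 << VP8_FILTER_SHIFT.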
  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);

    for (h = 0; h < 17; ++h) {
      const __m128i a = _mm_loadu_si128((__m128i *)src);
      const __m128i a_lo = _mm_unpacklo_epi8(a, zero);
      const __m128i a_hi = _mm_unpackhi_epi8(a, zero);
      const __m128i a_lo_filtered = _mm_mullo_epi16(a_lo, hfilter_0);
      const __m128i a_hi_filtered = _mm_mullo_epi16(a_hi, hfilter_0);

      const __m128i b = _mm_loadu_si128((__m128i *)(src + 1));
      const __m128i b_lo = _mm_unpacklo_epi8(b, zero);
      const __m128i b_hi = _mm_unpackhi_epi8(b, zero);
      const __m128i b_lo_filtered = _mm_mullo_epi16(b_lo, hfilter_1);
      const __m128i b_hi_filtered = _mm_mullo_epi16(b_hi, hfilter_1);

      const __m128i sum_lo = _mm_add_epi16(a_lo_filtered, b_lo_filtered);
      const __m128i sum_hi = _mm_add_epi16(a_hi_filtered, b_hi_filtered);

      const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor);
      const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor);

      const __m128i shifted_lo =
          _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT);
      const __m128i shifted_hi =
          _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT);

      _mm_store_si128((__m128i *)dst, shifted_lo);
      _mm_store_si128((__m128i *)(dst + 8), shifted_hi);
      src += stride;
      dst += 16;
    }
  }
}

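// Vertical pass for the 16x16 predictor: blend adjacent pairs of intermediate
// rows, round, shift and pack back to 8-bit pixels in the destination.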
static INLINE void vertical_16x16(uint16_t *src, uint8_t *dst, const int stride,
                                  const int yoffset) {
  int h;

  if (yoffset == 0) {
    for (h = 0; h < 16; ++h) {
      const __m128i row_lo = _mm_load_si128((__m128i *)src);
      const __m128i row_hi = _mm_load_si128((__m128i *)(src + 8));
      const __m128i packed = _mm_packus_epi16(row_lo, row_hi);
      _mm_store_si128((__m128i *)dst, packed);
      src += 16;
      dst += stride;
    }
    return;
  }

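  // row_0 is carried over from the previous iteration so each intermediate
  // row is only loaded once; row_1 becomes row_0 for the next output row.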
  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);

    __m128i row_0_lo = _mm_load_si128((__m128i *)src);
    __m128i row_0_hi = _mm_load_si128((__m128i *)(src + 8));
    src += 16;
    for (h = 0; h < 16; ++h) {
      const __m128i row_0_lo_filtered = _mm_mullo_epi16(row_0_lo, vfilter_0);
      const __m128i row_0_hi_filtered = _mm_mullo_epi16(row_0_hi, vfilter_0);

      const __m128i row_1_lo = _mm_load_si128((__m128i *)src);
      const __m128i row_1_hi = _mm_load_si128((__m128i *)(src + 8));
      const __m128i row_1_lo_filtered = _mm_mullo_epi16(row_1_lo, vfilter_1);
      const __m128i row_1_hi_filtered = _mm_mullo_epi16(row_1_hi, vfilter_1);

      const __m128i sum_lo =
          _mm_add_epi16(row_0_lo_filtered, row_1_lo_filtered);
      const __m128i sum_hi =
          _mm_add_epi16(row_0_hi_filtered, row_1_hi_filtered);

      const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor);
      const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor);

      const __m128i shifted_lo =
          _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT);
      const __m128i shifted_hi =
          _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT);

      const __m128i packed = _mm_packus_epi16(shifted_lo, shifted_hi);
      _mm_store_si128((__m128i *)dst, packed);
      row_0_lo = row_1_lo;
      row_0_hi = row_1_hi;
      src += 16;
      dst += stride;
    }
  }
}

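// 16x16 bilinear prediction: horizontal pass into an aligned 16-bit
// intermediate buffer (16 wide, 17 tall), then vertical pass into dst_ptr.
// Requires at least one non-zero offset (see the assert).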
void vp8_bilinear_predict16x16_sse2(uint8_t *src_ptr, int src_pixels_per_line,
                                    int xoffset, int yoffset, uint8_t *dst_ptr,
                                    int dst_pitch) {
  DECLARE_ALIGNED(16, uint16_t, FData[16 * 17]);

  assert((xoffset | yoffset) != 0);

  horizontal_16x16(src_ptr, src_pixels_per_line, FData, xoffset);

  vertical_16x16(FData, dst_ptr, dst_pitch, yoffset);
}

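// Horizontal pass for the 8-wide predictors: filter `height` rows (9 for 8x8,
// 5 for 8x4) of 8 pixels into 16-bit intermediate values.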
static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst,
                                  const int xoffset, const int height) {
  int h;
  const __m128i zero = _mm_setzero_si128();

  if (xoffset == 0) {
    for (h = 0; h < height; ++h) {
      const __m128i a = _mm_loadl_epi64((__m128i *)src);
      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
      _mm_store_si128((__m128i *)dst, a_u16);
      src += stride;
      dst += 8;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);

    // Filter horizontally. Rather than load the whole array and transpose,
    // load 16 values (overreading) and shift to set up the second value. Do
    // an "extra" row so the vertical pass has the necessary context.
    for (h = 0; h < height; ++h) {
      const __m128i a = _mm_loadu_si128((__m128i *)src);
      const __m128i b = _mm_srli_si128(a, 1);
      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
      const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
      const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
      const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
      const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
      const __m128i compensated = _mm_add_epi16(sum, round_factor);
      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
      _mm_store_si128((__m128i *)dst, shifted);
      src += stride;
      dst += 8;
    }
  }
}

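// Vertical pass for the 8-wide predictors: blend adjacent intermediate rows
// and pack `height` rows of 8 output pixels.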
static INLINE void vertical_8xN(uint16_t *src, uint8_t *dst, const int stride,
                                const int yoffset, const int height) {
  int h;

  if (yoffset == 0) {
    for (h = 0; h < height; ++h) {
      const __m128i row = _mm_load_si128((__m128i *)src);
      const __m128i packed = _mm_packus_epi16(row, row);
      _mm_storel_epi64((__m128i *)dst, packed);
      src += 8;
      dst += stride;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);

    __m128i row_0 = _mm_load_si128((__m128i *)src);
    src += 8;
    for (h = 0; h < height; ++h) {
      const __m128i row_1 = _mm_load_si128((__m128i *)src);
      const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
      const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
      const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
      const __m128i compensated = _mm_add_epi16(sum, round_factor);
      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
      const __m128i packed = _mm_packus_epi16(shifted, shifted);
      _mm_storel_epi64((__m128i *)dst, packed);
      row_0 = row_1;
      src += 8;
      dst += stride;
    }
  }
}

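// 8x8 bilinear prediction: 9 intermediate rows of 8, then 8 output rows.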
void vp8_bilinear_predict8x8_sse2(uint8_t *src_ptr, int src_pixels_per_line,
                                  int xoffset, int yoffset, uint8_t *dst_ptr,
                                  int dst_pitch) {
  DECLARE_ALIGNED(16, uint16_t, FData[8 * 9]);

  assert((xoffset | yoffset) != 0);

  horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 9);

  vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 8);
}

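// 8x4 bilinear prediction: 5 intermediate rows of 8, then 4 output rows.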
void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
                                  int xoffset, int yoffset, uint8_t *dst_ptr,
                                  int dst_pitch) {
  DECLARE_ALIGNED(16, uint16_t, FData[8 * 5]);

  assert((xoffset | yoffset) != 0);

  horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 5);

  vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4);
}

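// Horizontal pass for the 4x4 predictor: filter 5 rows of 4 pixels into
// 16-bit intermediate values.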
static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst,
                                  const int xoffset) {
  int h;
  const __m128i zero = _mm_setzero_si128();

  if (xoffset == 0) {
    for (h = 0; h < 5; ++h) {
      const __m128i a = load_unaligned_u32(src);
      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
      _mm_storel_epi64((__m128i *)dst, a_u16);
      src += stride;
      dst += 4;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);

    for (h = 0; h < 5; ++h) {
      const __m128i a = load_unaligned_u32(src);
      const __m128i b = load_unaligned_u32(src + 1);
      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
      const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
      const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
      const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
      const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
      const __m128i compensated = _mm_add_epi16(sum, round_factor);
      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
      _mm_storel_epi64((__m128i *)dst, shifted);
      src += stride;
      dst += 4;
    }
  }
}

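// Vertical pass for the 4x4 predictor: each aligned 16-byte load covers two
// intermediate rows of 4 values, so two output rows are produced per
// iteration.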
static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride,
                                const int yoffset) {
  int h;

  if (yoffset == 0) {
    for (h = 0; h < 4; h += 2) {
      const __m128i row = _mm_load_si128((__m128i *)src);
      __m128i packed = _mm_packus_epi16(row, row);
      store_unaligned_u32(dst, packed);
      dst += stride;
      packed = _mm_srli_si128(packed, 4);
      store_unaligned_u32(dst, packed);
      dst += stride;
      src += 8;
    }
    return;
  }

  {
    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);

    for (h = 0; h < 4; h += 2) {
      const __m128i row_0 = _mm_load_si128((__m128i *)src);
      const __m128i row_1 = _mm_loadu_si128((__m128i *)(src + 4));
      const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
      const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
      const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
      const __m128i compensated = _mm_add_epi16(sum, round_factor);
      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
      __m128i packed = _mm_packus_epi16(shifted, shifted);
      store_unaligned_u32(dst, packed);
      packed = _mm_srli_si128(packed, 4);
      dst += stride;
      store_unaligned_u32(dst, packed);
      dst += stride;
      src += 8;
    }
  }
}

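// 4x4 bilinear prediction: 5 intermediate rows of 4, then 4 output rows.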
void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
                                  int xoffset, int yoffset, uint8_t *dst_ptr,
                                  int dst_pitch) {
  DECLARE_ALIGNED(16, uint16_t, FData[4 * 5]);

  assert((xoffset | yoffset) != 0);

  horizontal_4x4(src_ptr, src_pixels_per_line, FData, xoffset);

  vertical_4x4(FData, dst_ptr, dst_pitch, yoffset);
}