1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #include <emmintrin.h>
12
13 #include "config/aom_dsp_rtcd.h"
14 #include "aom_dsp/x86/convolve.h"
15
16 // -----------------------------------------------------------------------------
17
// Vertical 4-tap filter for a 4-pixel-wide column of 16-bit samples.
//
// Only filter[2..5] take part in the sum, although all 8 coefficients are
// loaded. Output row r is built from source rows r+2 .. r+5 (counted from
// src_ptr), rounded with +64 >> 7 and clamped to [0, (1 << bd) - 1].
// Rows are produced in pairs; when height is odd the last row is not written.
void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);

  // taps 0 1 0 1 2 3 2 3
  const __m128i taps_0123 = _mm_unpacklo_epi32(taps, taps);
  // taps 4 5 4 5 6 7 6 7
  const __m128i taps_4567 = _mm_unpackhi_epi32(taps, taps);
  // taps 2 3 repeated four times
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0123, taps_0123);
  // taps 4 5 repeated four times
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4567, taps_4567);

  const uint16_t *in = src_ptr;
  uint16_t *out = dst_ptr;

  // Prime the pipeline with the three rows shared by the first output pair.
  const __m128i row2 = _mm_loadl_epi64((const __m128i *)(in + 2 * src_pitch));
  const __m128i row3 = _mm_loadl_epi64((const __m128i *)(in + 3 * src_pitch));
  __m128i row4 = _mm_loadl_epi64((const __m128i *)(in + 4 * src_pitch));
  __m128i rows23 = _mm_unpacklo_epi16(row2, row3);
  __m128i rows34 = _mm_unpacklo_epi16(row3, row4);

  unsigned int rows_left = height;
  while (rows_left > 1) {
    const __m128i row5 =
        _mm_loadl_epi64((const __m128i *)(in + 5 * src_pitch));
    const __m128i row6 =
        _mm_loadl_epi64((const __m128i *)(in + 6 * src_pitch));
    const __m128i rows45 = _mm_unpacklo_epi16(row4, row5);
    const __m128i rows56 = _mm_unpacklo_epi16(row5, row6);

    // Each madd multiplies an interleaved row pair by its tap pair and sums
    // the two products into one 32-bit lane per pixel.
    __m128i top = _mm_add_epi32(_mm_madd_epi16(rows23, taps_23),
                                _mm_madd_epi16(rows45, taps_45));
    __m128i bottom = _mm_add_epi32(_mm_madd_epi16(rows34, taps_23),
                                   _mm_madd_epi16(rows56, taps_45));

    // Round to nearest and drop the 7 fractional bits.
    top = _mm_srai_epi32(_mm_add_epi32(top, rounding), 7);
    bottom = _mm_srai_epi32(_mm_add_epi32(bottom, rounding), 7);

    // Narrow to 16 bits (the four results land in the low 64 bits) and clamp
    // to the valid pixel range.
    top = _mm_packs_epi32(top, zero);
    bottom = _mm_packs_epi32(bottom, zero);
    top = _mm_min_epi16(_mm_max_epi16(top, zero), pixel_max);
    bottom = _mm_min_epi16(_mm_max_epi16(bottom, zero), pixel_max);

    _mm_storel_epi64((__m128i *)out, top);
    _mm_storel_epi64((__m128i *)(out + dst_pitch), bottom);

    in += 2 * src_pitch;
    out += 2 * dst_pitch;

    // Rows 4/5/6 of this pass become rows 2/3/4 of the next one.
    rows23 = rows45;
    rows34 = rows56;
    row4 = row6;
    rows_left -= 2;
  }
}
103
// Horizontal 4-tap filter for a 4-pixel-wide block of 16-bit samples.
//
// Only filter[2..5] take part in the sum, although all 8 coefficients are
// loaded. Output pixel x of a row is
//   filter[2]*src[x-1] + filter[3]*src[x] + filter[4]*src[x+1] +
//   filter[5]*src[x+2]
// rounded with +64 >> 7 and clamped to [0, (1 << bd) - 1]. Eight samples
// starting at src_ptr - 1 are read for every row.
void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);

  // taps 0 1 0 1 2 3 2 3
  const __m128i taps_0123 = _mm_unpacklo_epi32(taps, taps);
  // taps 4 5 4 5 6 7 6 7
  const __m128i taps_4567 = _mm_unpackhi_epi32(taps, taps);
  // taps 2 3 repeated four times
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0123, taps_0123);
  // taps 4 5 repeated four times
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4567, taps_4567);

  // filter[2] lines up with the sample at src_ptr - 3 + 2.
  const uint16_t *row_in = src_ptr - 3 + 2;
  uint16_t *row_out = dst_ptr;

  for (unsigned int rows_left = height; rows_left > 0; --rows_left) {
    const __m128i px = _mm_loadu_si128((const __m128i *)row_in);

    // Byte shifts of 2/4/6 give the row advanced by 1/2/3 samples.
    const __m128i px_p1 = _mm_srli_si128(px, 2);
    const __m128i px_p2 = _mm_srli_si128(px, 4);
    const __m128i px_p3 = _mm_srli_si128(px, 6);

    // Lane x holds the sample pair (x, x+1) resp. (x+2, x+3).
    const __m128i pairs_01 = _mm_unpacklo_epi32(px, px_p1);
    const __m128i pairs_23 = _mm_unpacklo_epi32(px_p2, px_p3);

    __m128i acc = _mm_add_epi32(_mm_madd_epi16(pairs_01, taps_23),
                                _mm_madd_epi16(pairs_23, taps_45));

    // Round to nearest and drop the 7 fractional bits.
    acc = _mm_srai_epi32(_mm_add_epi32(acc, rounding), 7);

    // Narrow to 16 bits and clamp to the valid pixel range.
    acc = _mm_packs_epi32(acc, zero);
    acc = _mm_min_epi16(_mm_max_epi16(acc, zero), pixel_max);

    _mm_storel_epi64((__m128i *)row_out, acc);

    row_in += src_pitch;
    row_out += dst_pitch;
  }
}
155
// Vertical 4-tap filter for an 8-pixel-wide column of 16-bit samples.
//
// Only filter[2..5] take part in the sum, although all 8 coefficients are
// loaded. Output row r is built from source rows r+2 .. r+5 (counted from
// src_ptr), rounded with +64 >> 7 and clamped to [0, (1 << bd) - 1].
// Rows are produced in pairs; when height is odd the last row is not written.
//
// Neither src_ptr nor dst_ptr has to be 16-byte aligned: all loads and stores
// use the unaligned forms.
void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
  __m128i resReg23_45, resReg34_56;
  unsigned int i;

  const __m128i zero = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i addFilterReg64 = _mm_set1_epi32(64);
  const __m128i filtersReg = _mm_loadu_si128((const __m128i *)filter);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
  // coeffs 2 3 2 3 2 3 2 3
  const __m128i secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);

  // Two rows are produced per iteration, so both pointers advance by twice
  // the pitch. Multiplication (not a left shift) keeps a negative pitch
  // well-defined.
  const ptrdiff_t src_stride = src_pitch * 2;
  const ptrdiff_t dst_stride = dst_pitch * 2;

  // Prime the pipeline with the three rows shared by the first output pair.
  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
  srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3);

  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
  srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4);

  for (i = height; i > 1; i -= 2) {
    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
    srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5);

    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
    srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6);

    // Pixels 0..3: multiply each interleaved row pair by its tap pair and
    // add the two partial sums.
    resReg23_45_lo =
        _mm_add_epi32(_mm_madd_epi16(srcReg23_lo, secondFilters),
                      _mm_madd_epi16(srcReg45_lo, thirdFilters));
    resReg34_56_lo =
        _mm_add_epi32(_mm_madd_epi16(srcReg34_lo, secondFilters),
                      _mm_madd_epi16(srcReg56_lo, thirdFilters));

    // Pixels 4..7: same computation on the high halves.
    resReg23_45_hi =
        _mm_add_epi32(_mm_madd_epi16(srcReg23_hi, secondFilters),
                      _mm_madd_epi16(srcReg45_hi, thirdFilters));
    resReg34_56_hi =
        _mm_add_epi32(_mm_madd_epi16(srcReg34_hi, secondFilters),
                      _mm_madd_epi16(srcReg56_hi, thirdFilters));

    // Round to nearest and shift each 32-bit lane right by 7 bits.
    resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
    resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
    resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64);
    resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64);
    resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
    resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
    resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7);
    resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7);

    // Narrow each pair of 32-bit halves to one register of eight 16-bit
    // results: resReg23_45 is the first output row, resReg34_56 the second.
    resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi);
    resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi);

    // Clamp to the valid pixel range for this bit depth.
    resReg23_45 = _mm_max_epi16(resReg23_45, zero);
    resReg23_45 = _mm_min_epi16(resReg23_45, max);
    resReg34_56 = _mm_max_epi16(resReg34_56, zero);
    resReg34_56 = _mm_min_epi16(resReg34_56, max);

    src_ptr += src_stride;

    // Unaligned stores: dst_ptr and dst_pitch carry no alignment guarantee.
    _mm_storeu_si128((__m128i *)dst_ptr, resReg23_45);
    _mm_storeu_si128((__m128i *)(dst_ptr + dst_pitch), resReg34_56);

    dst_ptr += dst_stride;

    // Rows 4/5/6 of this pass become rows 2/3/4 of the next one.
    srcReg23_lo = srcReg45_lo;
    srcReg23_hi = srcReg45_hi;
    srcReg34_lo = srcReg56_lo;
    srcReg34_hi = srcReg56_hi;
    srcReg4 = srcReg6;
  }
}
264
// Horizontal 4-tap filter for an 8-pixel-wide block of 16-bit samples.
//
// Only filter[2..5] take part in the sum, although all 8 coefficients are
// loaded. Output pixel x of a row is
//   filter[2]*src[x-1] + filter[3]*src[x] + filter[4]*src[x+1] +
//   filter[5]*src[x+2]
// rounded with +64 >> 7 and clamped to [0, (1 << bd) - 1]. For every row the
// samples src_ptr[-1] .. src_ptr[10] are read.
//
// Neither src_ptr nor dst_ptr has to be 16-byte aligned: all loads and stores
// use the unaligned forms.
void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
  __m128i srcReg32b1, srcReg32b2;
  unsigned int i;

  const __m128i zero = _mm_setzero_si128();
  const __m128i addFilterReg64 = _mm_set1_epi32(64);
  const __m128i filtersReg = _mm_loadu_si128((const __m128i *)filter);
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
  // coeffs 2 3 2 3 2 3 2 3
  const __m128i secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);

  // Step back to where filter[0] would apply; the loads below start at +2,
  // which is where filter[2] applies.
  src_ptr -= 3;

  for (i = height; i > 0; i -= 1) {
    // Samples 0..7 and 4..11 of the window (relative to src_ptr + 2).
    srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2));
    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6));

    // Even output pixels 0, 2, 4, 6: sample pairs (0,1)(2,3)(4,5)(6,7) with
    // taps 2/3, plus the same pairs advanced by two samples with taps 4/5.
    __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);
    __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4);
    __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2);

    __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters);
    __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);

    // Odd output pixels 1, 3, 5, 7: the same scheme shifted by one sample.
    __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);
    __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);
    __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2);
    __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6);
    __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2);
    __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2);

    d1 = _mm_madd_epi16(ss_3, secondFilters);
    d2 = _mm_madd_epi16(ss_5, thirdFilters);
    srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);

    // Interleave even and odd results back into pixel order 0..3 and 4..7.
    __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
    __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);

    // Round to nearest and shift each 32-bit lane right by 7 bits.
    res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64);
    res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64);
    res_lo_1 = _mm_srai_epi32(res_lo_1, 7);
    res_hi_1 = _mm_srai_epi32(res_hi_1, 7);

    // Narrow to 16 bits and clamp to the valid pixel range.
    srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1);
    srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, zero);
    srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);

    src_ptr += src_pitch;

    // Unaligned store: dst_ptr and dst_pitch carry no alignment guarantee.
    _mm_storeu_si128((__m128i *)dst_ptr, srcRegFilt32b1_1);

    dst_ptr += dst_pitch;
  }
}
332
// Vertical 4-tap filter for a 16-pixel-wide column of 16-bit samples.
//
// The block is processed as two side-by-side 8-pixel-wide columns, left one
// first, each handled by aom_highbd_filter_block1d8_v4_sse2 with the same
// pitches, height, filter and bit depth.
void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr,
                                         ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                         ptrdiff_t dst_pitch, uint32_t height,
                                         const int16_t *filter, int bd) {
  for (int half = 0; half < 2; ++half) {
    const int col = half * 8;
    aom_highbd_filter_block1d8_v4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                       dst_pitch, height, filter, bd);
  }
}
342
// Horizontal 4-tap filter for a 16-pixel-wide block of 16-bit samples.
//
// The block is processed as two side-by-side 8-pixel-wide blocks, left one
// first, each handled by aom_highbd_filter_block1d8_h4_sse2 with the same
// pitches, height, filter and bit depth.
void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr,
                                         ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                         ptrdiff_t dst_pitch, uint32_t height,
                                         const int16_t *filter, int bd) {
  for (int half = 0; half < 2; ++half) {
    const int col = half * 8;
    aom_highbd_filter_block1d8_h4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                       dst_pitch, height, filter, bd);
  }
}
352