• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #include <emmintrin.h>
12 
13 #include "config/aom_dsp_rtcd.h"
14 #include "aom_dsp/x86/convolve.h"
15 
16 // -----------------------------------------------------------------------------
17 
/*
 * Vertical 4-tap high-bitdepth filter for a 4-pixel-wide column.
 *
 * Only filter[2..5] of the 8-entry kernel take part. Every output sample is
 *   clamp((f2*r2 + f3*r3 + f4*r4 + f5*r5 + 64) >> 7, 0, (1 << bd) - 1)
 * where rN is the source sample N rows below the current src_ptr, in the
 * same column.
 *
 * src_ptr, src_pitch: source samples and row pitch (in uint16_t units).
 * dst_ptr, dst_pitch: destination samples and row pitch (in uint16_t units).
 * height: output row count. Rows are produced in pairs, so a trailing odd
 *         row is not written.
 * filter: 8 int16 coefficients, read with an unaligned load.
 * bd: bit depth; sets the upper clamp (1 << bd) - 1.
 */
void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i kernel = _mm_loadu_si128((const __m128i *)filter);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i taps_0123 = _mm_unpacklo_epi32(kernel, kernel);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i taps_4567 = _mm_unpackhi_epi32(kernel, kernel);
  // coeffs 2 3 2 3 2 3 2 3
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0123, taps_0123);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4567, taps_4567);

  // Two rows are consumed and produced per iteration.
  const ptrdiff_t src_step = src_pitch * 2;
  const ptrdiff_t dst_step = dst_pitch * 2;

  // Prime the row history with rows 2, 3 and 4; samples from vertically
  // adjacent rows are interleaved so that one madd applies two taps.
  const __m128i row2 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i row3 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  __m128i row4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  __m128i rows23 = _mm_unpacklo_epi16(row2, row3);
  __m128i rows34 = _mm_unpacklo_epi16(row3, row4);

  unsigned int rows_left = height;
  while (rows_left > 1) {
    const __m128i row5 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i row6 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i rows45 = _mm_unpacklo_epi16(row4, row5);
    const __m128i rows56 = _mm_unpacklo_epi16(row5, row6);

    // 32-bit sums of the four taps for the first and second output row.
    __m128i sum_a = _mm_add_epi32(_mm_madd_epi16(rows23, taps_23),
                                  _mm_madd_epi16(rows45, taps_45));
    __m128i sum_b = _mm_add_epi32(_mm_madd_epi16(rows34, taps_23),
                                  _mm_madd_epi16(rows56, taps_45));

    // Round to nearest and drop the 7 fractional bits.
    sum_a = _mm_srai_epi32(_mm_add_epi32(sum_a, rounding), 7);
    sum_b = _mm_srai_epi32(_mm_add_epi32(sum_b, rounding), 7);

    // Narrow to 16 bits (results land in the low four lanes), then clamp to
    // the valid sample range [0, (1 << bd) - 1].
    __m128i out_a = _mm_packs_epi32(sum_a, zero);
    __m128i out_b = _mm_packs_epi32(sum_b, zero);
    out_a = _mm_min_epi16(_mm_max_epi16(out_a, zero), pixel_max);
    out_b = _mm_min_epi16(_mm_max_epi16(out_b, zero), pixel_max);

    _mm_storel_epi64((__m128i *)dst_ptr, out_a);
    _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), out_b);

    src_ptr += src_step;
    dst_ptr += dst_step;

    // Slide the row history down by two rows for the next pair.
    rows23 = rows45;
    rows34 = rows56;
    row4 = row6;

    rows_left -= 2;
  }
}
103 
/*
 * Horizontal 4-tap high-bitdepth filter for a 4-pixel-wide block.
 *
 * Only filter[2..5] of the 8-entry kernel take part. For output column x:
 *   clamp((f2*p[x-1] + f3*p[x] + f4*p[x+1] + f5*p[x+2] + 64) >> 7,
 *         0, (1 << bd) - 1)
 * where p is the current source row starting at src_ptr. Each row is read
 * with one 16-byte unaligned load beginning one sample left of src_ptr.
 *
 * src_ptr, src_pitch: source samples and row pitch (in uint16_t units).
 * dst_ptr, dst_pitch: destination samples and row pitch (in uint16_t units).
 * height: number of rows to filter.
 * filter: 8 int16 coefficients, read with an unaligned load.
 * bd: bit depth; sets the upper clamp (1 << bd) - 1.
 */
void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(64);
  const __m128i kernel = _mm_loadu_si128((const __m128i *)filter);
  const __m128i pixel_max = _mm_set1_epi16((1 << bd) - 1);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i taps_0123 = _mm_unpacklo_epi32(kernel, kernel);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i taps_4567 = _mm_unpackhi_epi32(kernel, kernel);
  // coeffs 2 3 2 3 2 3 2 3
  const __m128i taps_23 = _mm_unpackhi_epi64(taps_0123, taps_0123);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i taps_45 = _mm_unpacklo_epi64(taps_4567, taps_4567);

  // filter[2], the first tap used, applies to the sample one position left
  // of each output column.
  const uint16_t *row = src_ptr - 1;
  uint16_t *out = dst_ptr;

  for (unsigned int y = 0; y < height; ++y) {
    const __m128i px = _mm_loadu_si128((const __m128i *)row);

    // Copies of the row shifted left by one, two and three samples.
    const __m128i px_s1 = _mm_srli_si128(px, 2);
    const __m128i px_s2 = _mm_srli_si128(px, 4);
    const __m128i px_s3 = _mm_srli_si128(px, 6);

    // Lane x holds the sample pair fed to taps (2,3), resp. (4,5), of
    // output column x.
    const __m128i pairs_23 = _mm_unpacklo_epi32(px, px_s1);
    const __m128i pairs_45 = _mm_unpacklo_epi32(px_s2, px_s3);

    __m128i acc = _mm_add_epi32(_mm_madd_epi16(pairs_23, taps_23),
                                _mm_madd_epi16(pairs_45, taps_45));

    // Round to nearest and drop the 7 fractional bits.
    acc = _mm_srai_epi32(_mm_add_epi32(acc, rounding), 7);

    // Narrow to 16 bits and clamp to [0, (1 << bd) - 1].
    __m128i res = _mm_packs_epi32(acc, zero);
    res = _mm_min_epi16(_mm_max_epi16(res, zero), pixel_max);

    _mm_storel_epi64((__m128i *)out, res);

    row += src_pitch;
    out += dst_pitch;
  }
}
155 
/*
 * Vertical 4-tap high-bitdepth filter for an 8-pixel-wide column.
 *
 * Only filter[2..5] of the 8-entry kernel take part. Every output sample is
 *   clamp((f2*r2 + f3*r3 + f4*r4 + f5*r5 + 64) >> 7, 0, (1 << bd) - 1)
 * where rN is the source sample N rows below the current src_ptr, in the
 * same column.
 *
 * src_ptr, src_pitch: source samples and row pitch (in uint16_t units).
 * dst_ptr, dst_pitch: destination samples and row pitch (in uint16_t units).
 * height: output row count. Rows are produced in pairs, so a trailing odd
 *         row is not written.
 * filter: 8 int16 coefficients, read with an unaligned load.
 * bd: bit depth; sets the upper clamp (1 << bd) - 1.
 *
 * All loads and stores are unaligned, so neither the pointers nor the
 * pitches need to be 16-byte aligned.
 */
void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  __m128i srcReg4, srcReg5, srcReg6;
  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
  __m128i resReg23_45, resReg34_56;
  unsigned int i;

  const __m128i zero = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i addFilterReg64 = _mm_set1_epi32(64);
  const __m128i filtersReg = _mm_loadu_si128((const __m128i *)filter);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
  // coeffs 2 3 2 3 2 3 2 3
  const __m128i secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);

  // Two rows are handled per iteration, so advance by twice the pitch.
  // Multiplication is used rather than a shift because the pitch is signed.
  const ptrdiff_t src_stride = src_pitch * 2;
  const ptrdiff_t dst_stride = dst_pitch * 2;

  // Prime the row history with rows 2, 3 and 4. Samples from vertically
  // adjacent rows are interleaved so that one madd applies two taps; _lo
  // covers columns 0-3 and _hi covers columns 4-7.
  {
    const __m128i srcReg2 =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
    const __m128i srcReg3 =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
    srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));

    srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
    srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3);
    srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
    srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4);
  }

  for (i = height; i > 1; i -= 2) {
    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));

    srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
    srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5);
    srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
    srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6);

    // multiply 2 adjacent elements with the filter and add the result
    // (columns 0-3)
    resReg23_45_lo = _mm_add_epi32(_mm_madd_epi16(srcReg23_lo, secondFilters),
                                   _mm_madd_epi16(srcReg45_lo, thirdFilters));
    resReg34_56_lo = _mm_add_epi32(_mm_madd_epi16(srcReg34_lo, secondFilters),
                                   _mm_madd_epi16(srcReg56_lo, thirdFilters));

    // same for columns 4-7
    resReg23_45_hi = _mm_add_epi32(_mm_madd_epi16(srcReg23_hi, secondFilters),
                                   _mm_madd_epi16(srcReg45_hi, thirdFilters));
    resReg34_56_hi = _mm_add_epi32(_mm_madd_epi16(srcReg34_hi, secondFilters),
                                   _mm_madd_epi16(srcReg56_hi, thirdFilters));

    // round, then shift each 32-bit lane right by 7 bits
    resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
    resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
    resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64);
    resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64);
    resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
    resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
    resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7);
    resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7);

    // narrow each 32-bit lane to 16 bits; resReg23_45 is the first output
    // row of the pair and resReg34_56 the second
    resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi);
    resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi);

    // clamp to the valid sample range [0, (1 << bd) - 1]
    resReg23_45 = _mm_max_epi16(resReg23_45, zero);
    resReg23_45 = _mm_min_epi16(resReg23_45, max);
    resReg34_56 = _mm_max_epi16(resReg34_56, zero);
    resReg34_56 = _mm_min_epi16(resReg34_56, max);

    src_ptr += src_stride;

    // Unaligned stores: nothing guarantees that dst_ptr or dst_pitch keep
    // the rows on a 16-byte boundary, and an aligned store would fault.
    _mm_storeu_si128((__m128i *)dst_ptr, resReg23_45);
    _mm_storeu_si128((__m128i *)(dst_ptr + dst_pitch), resReg34_56);

    dst_ptr += dst_stride;

    // save part of the registers for the next pair of rows
    srcReg23_lo = srcReg45_lo;
    srcReg23_hi = srcReg45_hi;
    srcReg34_lo = srcReg56_lo;
    srcReg34_hi = srcReg56_hi;
    srcReg4 = srcReg6;
  }
}
264 
/*
 * Horizontal 4-tap high-bitdepth filter for an 8-pixel-wide block.
 *
 * Only filter[2..5] of the 8-entry kernel take part. For output column x:
 *   clamp((f2*p[x-1] + f3*p[x] + f4*p[x+1] + f5*p[x+2] + 64) >> 7,
 *         0, (1 << bd) - 1)
 * where p is the current source row starting at src_ptr. Each row is read
 * with two overlapping 16-byte loads covering p[-1..6] and p[3..10].
 *
 * src_ptr, src_pitch: source samples and row pitch (in uint16_t units).
 * dst_ptr, dst_pitch: destination samples and row pitch (in uint16_t units).
 * height: number of rows to filter.
 * filter: 8 int16 coefficients, read with an unaligned load.
 * bd: bit depth; sets the upper clamp (1 << bd) - 1.
 *
 * All loads and stores are unaligned, so neither the pointers nor the
 * pitches need to be 16-byte aligned.
 */
void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd) {
  __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
  __m128i srcReg32b1, srcReg32b2;
  unsigned int i;

  const __m128i zero = _mm_setzero_si128();
  const __m128i addFilterReg64 = _mm_set1_epi32(64);
  const __m128i filtersReg = _mm_loadu_si128((const __m128i *)filter);
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
  // coeffs 2 3 2 3 2 3 2 3
  const __m128i secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);
  // coeffs 4 5 4 5 4 5 4 5
  const __m128i thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);

  // filter[2], the first tap used, applies to the sample one position left
  // of each output column.
  src_ptr -= 1;

  for (i = height; i > 0; i -= 1) {
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);        // p[-1..6]
    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 4));  // p[3..10]

    // Even output columns (0, 2, 4, 6): taps (2,3) pair up directly with the
    // first load; taps (4,5) need the row advanced by two samples.
    {
      const __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);
      const __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4);
      const __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2);

      const __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters);
      const __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
      srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
    }

    // Odd output columns (1, 3, 5, 7): same scheme, one sample further on.
    {
      const __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);
      const __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);
      const __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2);
      const __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6);
      const __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2);
      const __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2);

      const __m128i d1 = _mm_madd_epi16(ss_3, secondFilters);
      const __m128i d2 = _mm_madd_epi16(ss_5, thirdFilters);
      srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
    }

    // Interleave even and odd sums back into column order 0..3 and 4..7.
    __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
    __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);

    // round, then shift each 32-bit lane right by 7 bits
    res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64);
    res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64);
    res_lo_1 = _mm_srai_epi32(res_lo_1, 7);
    res_hi_1 = _mm_srai_epi32(res_hi_1, 7);

    // narrow to 16 bits and clamp to [0, (1 << bd) - 1]
    srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1);
    srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, zero);
    srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);

    src_ptr += src_pitch;

    // Unaligned store: nothing guarantees that dst_ptr or dst_pitch keep the
    // rows on a 16-byte boundary, and an aligned store would fault.
    _mm_storeu_si128((__m128i *)dst_ptr, srcRegFilt32b1_1);

    dst_ptr += dst_pitch;
  }
}
332 
/*
 * Vertical 4-tap high-bitdepth filter for a 16-pixel-wide column.
 *
 * Delegates to the 8-wide vertical kernel twice: first for columns 0-7,
 * then for columns 8-15. All other arguments are forwarded unchanged.
 */
void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr,
                                         ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                         ptrdiff_t dst_pitch, uint32_t height,
                                         const int16_t *filter, int bd) {
  for (int col = 0; col < 16; col += 8) {
    aom_highbd_filter_block1d8_v4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                       dst_pitch, height, filter, bd);
  }
}
342 
/*
 * Horizontal 4-tap high-bitdepth filter for a 16-pixel-wide block.
 *
 * Delegates to the 8-wide horizontal kernel twice: first for columns 0-7,
 * then for columns 8-15. All other arguments are forwarded unchanged.
 */
void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr,
                                         ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                         ptrdiff_t dst_pitch, uint32_t height,
                                         const int16_t *filter, int bd) {
  for (int col = 0; col < 16; col += 8) {
    aom_highbd_filter_block1d8_h4_sse2(src_ptr + col, src_pitch, dst_ptr + col,
                                       dst_pitch, height, filter, bd);
  }
}
352