1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <emmintrin.h>  // SSE2
13 
14 #include "config/aom_dsp_rtcd.h"
15 #include "aom_dsp/x86/convolve.h"
16 #include "aom_ports/mem.h"
17 
// Horizontal 4-tap convolution of one 16-pixel-wide row per loop iteration,
// SSE2. Only the middle taps (indices 2..5) of the 8-tap |filter| are
// applied; src_ptr is rewound by 3 so offsets match the 8-tap layout.
// The taps are halved up front so 16-bit intermediates cannot overflow; the
// final shift of 6 (instead of 7) compensates. output_ptr must be 16-byte
// aligned (aligned store).
void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
                                  ptrdiff_t src_pixels_per_line,
                                  uint8_t *output_ptr, ptrdiff_t output_pitch,
                                  uint32_t output_height,
                                  const int16_t *filter) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i round_const = _mm_set1_epi16(32);
  __m128i coeffs = _mm_loadu_si128((const __m128i *)filter);
  coeffs = _mm_srai_epi16(coeffs, 1);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i pairs_lo = _mm_unpacklo_epi32(coeffs, coeffs);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i pairs_hi = _mm_unpackhi_epi32(coeffs, coeffs);
  // coeffs 2 3 replicated four times / coeffs 4 5 replicated four times
  const __m128i taps_23 = _mm_unpackhi_epi64(pairs_lo, pairs_lo);
  const __m128i taps_45 = _mm_unpacklo_epi64(pairs_hi, pairs_hi);

  src_ptr -= 3;
  for (unsigned int row = output_height; row > 0; row--) {
    __m128i halves[2];

    // Process the left and right 8-pixel halves with the same recipe; the
    // right half reloads at src_ptr + 8 (overlapping the first load).
    for (int half = 0; half < 2; half++) {
      const __m128i src =
          _mm_loadu_si128((const __m128i *)(src_ptr + 8 * half));

      // Even output pixels: each madd pairs two adjacent source pixels with
      // taps (2,3) resp. (4,5).
      const __m128i even = _mm_add_epi32(
          _mm_madd_epi16(_mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero),
                         taps_23),
          _mm_madd_epi16(_mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero),
                         taps_45));

      // Odd output pixels: same taps, source shifted one byte further.
      const __m128i odd = _mm_add_epi32(
          _mm_madd_epi16(_mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero),
                         taps_23),
          _mm_madd_epi16(_mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero),
                         taps_45));

      // Interleave even/odd 32-bit sums back into pixel order, narrow to 16.
      halves[half] = _mm_packs_epi32(_mm_unpacklo_epi32(even, odd),
                                     _mm_unpackhi_epi32(even, odd));
    }

    // Round (add 32) and shift by 6 on each 16-bit lane.
    __m128i lo = _mm_adds_epi16(halves[0], round_const);
    __m128i hi = _mm_adds_epi16(halves[1], round_const);
    lo = _mm_srai_epi16(lo, 6);
    hi = _mm_srai_epi16(hi, 6);

    // Narrow to 8 bits with unsigned saturation and store the full row.
    _mm_store_si128((__m128i *)output_ptr, _mm_packus_epi16(lo, hi));

    src_ptr += src_pixels_per_line;
    output_ptr += output_pitch;
  }
}
107 
// Vertical 4-tap convolution of a 16-pixel-wide block, two output rows per
// loop iteration, SSE2. Only the middle taps (indices 2..5) of the 8-tap
// |filter| are applied; source rows start at src_ptr + 2 * src_pitch so row
// offsets match the standard 8-tap layout.
//
//   src_ptr       - top of the source column (tap-0 row)
//   src_pitch     - source stride in bytes
//   output_ptr    - destination; must be 16-byte aligned (aligned store)
//   out_pitch     - destination stride in bytes
//   output_height - rows to produce; the loop emits them in pairs with no
//                   tail handling, so an odd trailing row is not produced
//   filter        - 8 signed 16-bit taps; only indices 2..5 are used
void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *output_ptr, ptrdiff_t out_pitch,
                                  uint32_t output_height,
                                  const int16_t *filter) {
  __m128i filtersReg;
  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
  __m128i resReg23_45, resReg34_56;
  __m128i addFilterReg32, secondFilters, thirdFilters;
  __m128i tmp_0, tmp_1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Halve the taps so the 16-bit intermediates below cannot overflow; the
  // final shift is 6 instead of 7 to compensate.
  filtersReg = _mm_srai_epi16(filtersReg, 1);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);

  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5

  // multiply the size of the source and destination stride by two
  // (two output rows are produced per loop iteration)
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // Prime the pipeline: byte-interleave rows (2,3) and (3,4) and widen to 16
  // bits so each madd in the loop pairs two vertically adjacent pixels with
  // one coefficient pair. _lo/_hi cover the left/right 8 pixels of the
  // 16-wide block; _1/_2 are the low/high halves after widening.
  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
  __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
  __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
  __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128());
  __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128());

  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
  __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
  __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
  __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128());
  __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128());

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));

    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);

    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));

    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);

    // multiply 2 adjacent elements with the filter and add the result
    // (left half: taps 2/3 on row pairs (2,3) and (3,4))

    tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
    tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
    resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);

    tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
    tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
    resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);

    // taps 4/5 on row pairs (4,5) and (5,6); the widened forms are kept in
    // locals so they can be carried into the next iteration below
    __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
    __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
    tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
    tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
    resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);

    __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
    __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
    tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
    tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
    resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);

    // add and saturate the results together
    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);

    // multiply 2 adjacent elements with the filter and add the result
    // (same recipe for the right half of the 16-wide block)

    tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters);
    tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters);
    resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1);

    tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters);
    tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters);
    resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1);

    __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128());
    __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128());
    tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters);
    tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters);
    resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1);

    __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128());
    __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128());
    tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters);
    tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters);
    resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1);

    // add and saturate the results together
    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);

    // shift by 6 bit each 16 bit (with +32 rounding)
    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);

    src_ptr += src_stride;

    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));

    output_ptr += dst_stride;

    // save part of the registers for next strides: after advancing two rows,
    // the (4,5)/(5,6) pairs of this iteration become the (2,3)/(3,4) pairs
    // of the next one, so only two new rows need loading per iteration
    resReg23_lo_1 = resReg45_lo_1;
    resReg23_lo_2 = resReg45_lo_2;
    resReg23_hi_1 = resReg45_hi_1;
    resReg23_hi_2 = resReg45_hi_2;
    resReg34_lo_1 = resReg56_lo_1;
    resReg34_lo_2 = resReg56_lo_2;
    resReg34_hi_1 = resReg56_hi_1;
    resReg34_hi_2 = resReg56_hi_2;
    srcReg4 = srcReg6;
  }
}
256 
// Horizontal 4-tap convolution of one 8-pixel-wide row per loop iteration,
// SSE2. Only the middle taps (indices 2..5) of the 8-tap |filter| are
// applied; src_ptr is rewound by 3 so offsets match the 8-tap layout. The
// taps are halved up front so 16-bit intermediates cannot overflow; the
// final shift of 6 (instead of 7) compensates.
void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
                                 ptrdiff_t src_pixels_per_line,
                                 uint8_t *output_ptr, ptrdiff_t output_pitch,
                                 uint32_t output_height,
                                 const int16_t *filter) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i round_const = _mm_set1_epi16(32);
  __m128i coeffs = _mm_loadu_si128((const __m128i *)filter);
  coeffs = _mm_srai_epi16(coeffs, 1);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i pairs_lo = _mm_unpacklo_epi32(coeffs, coeffs);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i pairs_hi = _mm_unpackhi_epi32(coeffs, coeffs);
  // coeffs 2 3 replicated four times / coeffs 4 5 replicated four times
  const __m128i taps_23 = _mm_unpackhi_epi64(pairs_lo, pairs_lo);
  const __m128i taps_45 = _mm_unpacklo_epi64(pairs_hi, pairs_hi);

  src_ptr -= 3;
  for (unsigned int row = output_height; row > 0; row--) {
    const __m128i src = _mm_loadu_si128((const __m128i *)src_ptr);

    // Even output pixels 0,2,4,6: each madd pairs two adjacent source
    // pixels with taps (2,3) resp. (4,5).
    const __m128i even_lo = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
    const __m128i even_hi = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
    const __m128i even = _mm_add_epi32(_mm_madd_epi16(even_lo, taps_23),
                                       _mm_madd_epi16(even_hi, taps_45));

    // Odd output pixels 1,3,5,7: same taps, source shifted one byte further.
    const __m128i odd_lo = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
    const __m128i odd_hi = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);
    const __m128i odd = _mm_add_epi32(_mm_madd_epi16(odd_lo, taps_23),
                                      _mm_madd_epi16(odd_hi, taps_45));

    // Interleave even/odd 32-bit sums back into pixel order, narrow to 16.
    __m128i pixels = _mm_packs_epi32(_mm_unpacklo_epi32(even, odd),
                                     _mm_unpackhi_epi32(even, odd));

    // Round (add 32) and shift by 6, then narrow to 8 bits with unsigned
    // saturation; the low 8 bytes hold the row.
    pixels = _mm_adds_epi16(pixels, round_const);
    pixels = _mm_srai_epi16(pixels, 6);
    pixels = _mm_packus_epi16(pixels, zero);

    _mm_storel_epi64((__m128i *)output_ptr, pixels);

    src_ptr += src_pixels_per_line;
    output_ptr += output_pitch;
  }
}
319 
// Vertical 4-tap convolution of an 8-pixel-wide block, two output rows per
// loop iteration, SSE2. Only the middle taps (indices 2..5) of the 8-tap
// |filter| are applied; source rows start at src_ptr + 2 * src_pitch so row
// offsets match the standard 8-tap layout.
//
//   src_ptr       - top of the source column (tap-0 row)
//   src_pitch     - source stride in bytes
//   output_ptr    - destination
//   out_pitch     - destination stride in bytes
//   output_height - rows to produce; the loop emits them in pairs with no
//                   tail handling, so an odd trailing row is not produced
//   filter        - 8 signed 16-bit taps; only indices 2..5 are used
void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
                                 uint32_t output_height,
                                 const int16_t *filter) {
  __m128i filtersReg;
  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
  __m128i srcReg23_lo, srcReg34_lo;
  __m128i srcReg45_lo, srcReg56_lo;
  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
  __m128i resReg23_45_lo, resReg34_56_lo;
  __m128i resReg23_45, resReg34_56;
  __m128i addFilterReg32, secondFilters, thirdFilters;
  __m128i tmp_0, tmp_1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Halve the taps so the 16-bit intermediates below cannot overflow; the
  // final shift is 6 instead of 7 to compensate.
  filtersReg = _mm_srai_epi16(filtersReg, 1);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);

  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5

  // multiply the size of the source and destination stride by two
  // (two output rows are produced per loop iteration)
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // Prime the pipeline: byte-interleave rows (2,3) and (3,4) and widen to 16
  // bits so each madd in the loop pairs two vertically adjacent pixels with
  // one coefficient pair.
  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
  __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
  __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());

  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
  __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
  __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);

    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);

    // multiply 2 adjacent elements with the filter and add the result
    // (taps 2/3 on row pairs (2,3) and (3,4))

    tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
    tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
    resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);

    tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
    tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
    resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);

    // taps 4/5 on row pairs (4,5) and (5,6); the widened forms are kept in
    // locals so they can be carried into the next iteration below
    __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
    __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
    tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
    tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
    resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);

    __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
    __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
    tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
    tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
    resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);

    // add and saturate the results together
    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);

    // shift by 6 bit each 16 bit (with +32 rounding)
    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128());
    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128());

    src_ptr += src_stride;

    _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
    _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));

    output_ptr += dst_stride;

    // save part of the registers for next strides: after advancing two rows,
    // the (4,5)/(5,6) pairs of this iteration become the (2,3)/(3,4) pairs
    // of the next one, so only two new rows need loading per iteration
    resReg23_lo_1 = resReg45_lo_1;
    resReg23_lo_2 = resReg45_lo_2;
    resReg34_lo_1 = resReg56_lo_1;
    resReg34_lo_2 = resReg56_lo_2;
    srcReg4 = srcReg6;
  }
}
423 
// Horizontal 4-tap convolution of one 4-pixel-wide row per loop iteration,
// SSE2. Only the middle taps (indices 2..5) of the 8-tap |filter| are
// applied; src_ptr is rewound by 3 so offsets match the 8-tap layout. The
// taps are halved up front so 16-bit intermediates cannot overflow; the
// final shift of 6 (instead of 7) compensates.
void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
                                 ptrdiff_t src_pixels_per_line,
                                 uint8_t *output_ptr, ptrdiff_t output_pitch,
                                 uint32_t output_height,
                                 const int16_t *filter) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i round_const = _mm_set1_epi16(32);
  __m128i coeffs = _mm_loadu_si128((const __m128i *)filter);
  coeffs = _mm_srai_epi16(coeffs, 1);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i pairs_lo = _mm_unpacklo_epi32(coeffs, coeffs);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i pairs_hi = _mm_unpackhi_epi32(coeffs, coeffs);
  // coeffs 2 3 replicated four times / coeffs 4 5 replicated four times
  const __m128i taps_23 = _mm_unpackhi_epi64(pairs_lo, pairs_lo);
  const __m128i taps_45 = _mm_unpacklo_epi64(pairs_hi, pairs_hi);

  src_ptr -= 3;
  for (unsigned int row = output_height; row > 0; row--) {
    const __m128i src = _mm_loadu_si128((const __m128i *)src_ptr);

    // Zero-extend the four byte-shifted views of the source row.
    const __m128i s2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
    const __m128i s3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
    const __m128i s4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
    const __m128i s5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);

    // Interleaving s2/s3 (and s4/s5) at 32-bit granularity lines up the
    // source pairs so one madd per tap pair covers all four output pixels.
    const __m128i sum_23 =
        _mm_madd_epi16(_mm_unpacklo_epi32(s2, s3), taps_23);
    const __m128i sum_45 =
        _mm_madd_epi16(_mm_unpacklo_epi32(s4, s5), taps_45);
    __m128i res = _mm_add_epi32(sum_23, sum_45);

    // Narrow to 16 bits, round (add 32), shift by 6, then narrow to 8 bits
    // with unsigned saturation; the low 4 bytes hold the row.
    res = _mm_packs_epi32(res, zero);
    res = _mm_adds_epi16(res, round_const);
    res = _mm_srai_epi16(res, 6);
    res = _mm_packus_epi16(res, zero);

    *(uint32_t *)output_ptr = (uint32_t)_mm_cvtsi128_si32(res);

    src_ptr += src_pixels_per_line;
    output_ptr += output_pitch;
  }
}
485 
// Vertical 4-tap convolution of a 4-pixel-wide block, two output rows per
// loop iteration, SSE2. Only the middle taps (indices 2..5) of the 8-tap
// |filter| are applied; source rows start at src_ptr + 2 * src_pitch so row
// offsets match the 8-tap layout. output_height is consumed in pairs with
// no tail handling, so an odd trailing row is not produced.
void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
                                 uint32_t output_height,
                                 const int16_t *filter) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i round_const = _mm_set1_epi16(32);
  __m128i coeffs = _mm_loadu_si128((const __m128i *)filter);
  // Halved taps: final shift is 6 instead of 7 to compensate.
  coeffs = _mm_srai_epi16(coeffs, 1);

  // coeffs 0 1 0 1 2 3 2 3
  const __m128i pairs_lo = _mm_unpacklo_epi32(coeffs, coeffs);
  // coeffs 4 5 4 5 6 7 6 7
  const __m128i pairs_hi = _mm_unpackhi_epi32(coeffs, coeffs);
  // coeffs 2 3 replicated four times / coeffs 4 5 replicated four times
  const __m128i taps_23 = _mm_unpackhi_epi64(pairs_lo, pairs_lo);
  const __m128i taps_45 = _mm_unpacklo_epi64(pairs_hi, pairs_hi);

  // Two rows are produced per iteration, so step both strides doubled.
  const ptrdiff_t src_step = src_pitch << 1;
  const ptrdiff_t dst_step = out_pitch << 1;

  // Prime the pipeline: byte-interleave rows (2,3) and (3,4) and widen to
  // 16 bits so one madd pairs two vertically adjacent pixels per tap pair.
  const __m128i row2 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i row3 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  __m128i pair23 = _mm_unpacklo_epi8(_mm_unpacklo_epi8(row2, row3), zero);

  __m128i row4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  __m128i pair34 = _mm_unpacklo_epi8(_mm_unpacklo_epi8(row3, row4), zero);

  for (unsigned int rows_left = output_height; rows_left > 1;
       rows_left -= 2) {
    const __m128i row5 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i pair45 =
        _mm_unpacklo_epi8(_mm_unpacklo_epi8(row4, row5), zero);
    const __m128i row6 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i pair56 =
        _mm_unpacklo_epi8(_mm_unpacklo_epi8(row5, row6), zero);

    // taps 2/3 applied to row pairs (2,3) and (3,4): lanes 0-3 belong to
    // the first output row, lanes 4-7 to the second.
    const __m128i acc_23_34 =
        _mm_packs_epi32(_mm_madd_epi16(pair23, taps_23),
                        _mm_madd_epi16(pair34, taps_23));
    // taps 4/5 applied to row pairs (4,5) and (5,6).
    const __m128i acc_45_56 =
        _mm_packs_epi32(_mm_madd_epi16(pair45, taps_45),
                        _mm_madd_epi16(pair56, taps_45));

    // Saturating add, round (add 32), shift by 6, then narrow to 8 bits
    // with unsigned saturation.
    __m128i res = _mm_adds_epi16(acc_23_34, acc_45_56);
    res = _mm_adds_epi16(res, round_const);
    res = _mm_srai_epi16(res, 6);
    res = _mm_packus_epi16(res, zero);

    // Bytes 0-3 are the first output row, bytes 4-7 the second.
    *(uint32_t *)output_ptr = (uint32_t)_mm_cvtsi128_si32(res);
    *(uint32_t *)(output_ptr + out_pitch) =
        (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(res, 4));

    src_ptr += src_step;
    output_ptr += dst_step;

    // Carry rows forward: pairs (4,5)/(5,6) become (2,3)/(3,4) of the next
    // iteration, so only two new rows need loading each time.
    pair23 = pair45;
    pair34 = pair56;
    row4 = row6;
  }
}
570