/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>  // SSE2
#include <string.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/x86/convolve.h"
#include "aom_ports/mem.h"

aom_filter_block1d16_h4_sse2(const uint8_t * src_ptr,ptrdiff_t src_pixels_per_line,uint8_t * output_ptr,ptrdiff_t output_pitch,uint32_t output_height,const int16_t * filter)18 void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
19 ptrdiff_t src_pixels_per_line,
20 uint8_t *output_ptr, ptrdiff_t output_pitch,
21 uint32_t output_height,
22 const int16_t *filter) {
23 __m128i filtersReg;
24 __m128i addFilterReg32;
25 __m128i secondFilters, thirdFilters;
26 __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1,
27 srcRegFilt32b2_2;
28 __m128i srcReg32b1, srcReg32b2;
29 unsigned int i;
30 src_ptr -= 3;
31 addFilterReg32 = _mm_set1_epi16(32);
32 filtersReg = _mm_loadu_si128((const __m128i *)filter);
33 filtersReg = _mm_srai_epi16(filtersReg, 1);
34
35 // coeffs 0 1 0 1 2 3 2 3
36 const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
37 // coeffs 4 5 4 5 6 7 6 7
38 const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
39
40 secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
41 thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
42
43 for (i = output_height; i > 0; i -= 1) {
44 srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
45
46 __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
47 __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
48 __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
49 __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
50 __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
51 __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
52 srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
53
54 __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3);
55 __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5);
56 __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
57 __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
58 d1 = _mm_madd_epi16(ss_1_2, secondFilters);
59 d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
60 srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
61
62 __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
63 __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
64 srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
65
66 // reading stride of the next 16 bytes
67 // (part of it was being read by earlier read)
68 srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
69
70 ss_2 = _mm_srli_si128(srcReg32b2, 2);
71 ss_4 = _mm_srli_si128(srcReg32b2, 4);
72 ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
73 ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
74 d1 = _mm_madd_epi16(ss_1_1, secondFilters);
75 d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
76 srcRegFilt32b2_1 = _mm_add_epi32(d1, d2);
77
78 ss_1 = _mm_srli_si128(srcReg32b2, 3);
79 ss_3 = _mm_srli_si128(srcReg32b2, 5);
80 ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
81 ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
82 d1 = _mm_madd_epi16(ss_1_2, secondFilters);
83 d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
84 srcRegFilt32b2_2 = _mm_add_epi32(d1, d2);
85
86 res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
87 res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
88 srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi);
89
90 // shift by 6 bit each 16 bit
91 srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
92 srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
93 srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
94 srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
95
96 // shrink to 8 bit each 16 bits, the first lane contain the first
97 // convolve result and the second lane contain the second convolve result
98 srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
99
100 src_ptr += src_pixels_per_line;
101
102 _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
103
104 output_ptr += output_pitch;
105 }
106 }
107
aom_filter_block1d16_v4_sse2(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t out_pitch,uint32_t output_height,const int16_t * filter)108 void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
109 uint8_t *output_ptr, ptrdiff_t out_pitch,
110 uint32_t output_height,
111 const int16_t *filter) {
112 __m128i filtersReg;
113 __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
114 __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
115 __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
116 __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
117 __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
118 __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
119 __m128i resReg23_45, resReg34_56;
120 __m128i addFilterReg32, secondFilters, thirdFilters;
121 __m128i tmp_0, tmp_1;
122 unsigned int i;
123 ptrdiff_t src_stride, dst_stride;
124
125 addFilterReg32 = _mm_set1_epi16(32);
126 filtersReg = _mm_loadu_si128((const __m128i *)filter);
127 filtersReg = _mm_srai_epi16(filtersReg, 1);
128
129 // coeffs 0 1 0 1 2 3 2 3
130 const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
131 // coeffs 4 5 4 5 6 7 6 7
132 const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
133
134 secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
135 thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
136
137 // multiply the size of the source and destination stride by two
138 src_stride = src_pitch << 1;
139 dst_stride = out_pitch << 1;
140
141 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
142 srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
143 srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
144 srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
145 __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
146 __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
147 __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128());
148 __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128());
149
150 srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
151 srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
152 srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
153 __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
154 __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
155 __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128());
156 __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128());
157
158 for (i = output_height; i > 1; i -= 2) {
159 srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
160
161 srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
162 srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
163
164 srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
165
166 srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
167 srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
168
169 // multiply 2 adjacent elements with the filter and add the result
170
171 tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
172 tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
173 resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
174
175 tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
176 tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
177 resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
178
179 __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
180 __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
181 tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
182 tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
183 resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
184
185 __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
186 __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
187 tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
188 tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
189 resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
190
191 // add and saturate the results together
192 resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
193 resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
194
195 // multiply 2 adjacent elements with the filter and add the result
196
197 tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters);
198 tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters);
199 resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1);
200
201 tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters);
202 tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters);
203 resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1);
204
205 __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128());
206 __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128());
207 tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters);
208 tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters);
209 resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1);
210
211 __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128());
212 __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128());
213 tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters);
214 tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters);
215 resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1);
216
217 // add and saturate the results together
218 resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
219 resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
220
221 // shift by 6 bit each 16 bit
222 resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
223 resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
224 resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
225 resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
226 resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
227 resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
228 resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
229 resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
230
231 // shrink to 8 bit each 16 bits, the first lane contain the first
232 // convolve result and the second lane contain the second convolve
233 // result
234 resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
235 resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
236
237 src_ptr += src_stride;
238
239 _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
240 _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
241
242 output_ptr += dst_stride;
243
244 // save part of the registers for next strides
245 resReg23_lo_1 = resReg45_lo_1;
246 resReg23_lo_2 = resReg45_lo_2;
247 resReg23_hi_1 = resReg45_hi_1;
248 resReg23_hi_2 = resReg45_hi_2;
249 resReg34_lo_1 = resReg56_lo_1;
250 resReg34_lo_2 = resReg56_lo_2;
251 resReg34_hi_1 = resReg56_hi_1;
252 resReg34_hi_2 = resReg56_hi_2;
253 srcReg4 = srcReg6;
254 }
255 }
256
aom_filter_block1d8_h4_sse2(const uint8_t * src_ptr,ptrdiff_t src_pixels_per_line,uint8_t * output_ptr,ptrdiff_t output_pitch,uint32_t output_height,const int16_t * filter)257 void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
258 ptrdiff_t src_pixels_per_line,
259 uint8_t *output_ptr, ptrdiff_t output_pitch,
260 uint32_t output_height,
261 const int16_t *filter) {
262 __m128i filtersReg;
263 __m128i addFilterReg32;
264 __m128i secondFilters, thirdFilters;
265 __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
266 __m128i srcReg32b1;
267 unsigned int i;
268 src_ptr -= 3;
269 addFilterReg32 = _mm_set1_epi16(32);
270 filtersReg = _mm_loadu_si128((const __m128i *)filter);
271 filtersReg = _mm_srai_epi16(filtersReg, 1);
272
273 // coeffs 0 1 0 1 2 3 2 3
274 const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
275 // coeffs 4 5 4 5 6 7 6 7
276 const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
277
278 secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
279 thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
280
281 for (i = output_height; i > 0; i -= 1) {
282 srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
283
284 __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
285 __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
286 ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
287 ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
288 __m128i d1 = _mm_madd_epi16(ss_2, secondFilters);
289 __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
290 srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
291
292 __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
293 __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
294 ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
295 ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
296 d1 = _mm_madd_epi16(ss_3, secondFilters);
297 d2 = _mm_madd_epi16(ss_5, thirdFilters);
298 srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
299
300 __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
301 __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
302 srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
303
304 // shift by 6 bit each 16 bit
305 srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
306 srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
307
308 // shrink to 8 bit each 16 bits, the first lane contain the first
309 // convolve result and the second lane contain the second convolve result
310 srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
311
312 src_ptr += src_pixels_per_line;
313
314 _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
315
316 output_ptr += output_pitch;
317 }
318 }
319
aom_filter_block1d8_v4_sse2(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t out_pitch,uint32_t output_height,const int16_t * filter)320 void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
321 uint8_t *output_ptr, ptrdiff_t out_pitch,
322 uint32_t output_height,
323 const int16_t *filter) {
324 __m128i filtersReg;
325 __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
326 __m128i srcReg23_lo, srcReg34_lo;
327 __m128i srcReg45_lo, srcReg56_lo;
328 __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
329 __m128i resReg23_45_lo, resReg34_56_lo;
330 __m128i resReg23_45, resReg34_56;
331 __m128i addFilterReg32, secondFilters, thirdFilters;
332 __m128i tmp_0, tmp_1;
333 unsigned int i;
334 ptrdiff_t src_stride, dst_stride;
335
336 addFilterReg32 = _mm_set1_epi16(32);
337 filtersReg = _mm_loadu_si128((const __m128i *)filter);
338 filtersReg = _mm_srai_epi16(filtersReg, 1);
339
340 // coeffs 0 1 0 1 2 3 2 3
341 const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
342 // coeffs 4 5 4 5 6 7 6 7
343 const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
344
345 secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
346 thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
347
348 // multiply the size of the source and destination stride by two
349 src_stride = src_pitch << 1;
350 dst_stride = out_pitch << 1;
351
352 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
353 srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
354 srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
355 __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
356 __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
357
358 srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
359 srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
360 __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
361 __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
362
363 for (i = output_height; i > 1; i -= 2) {
364 srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
365 srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
366
367 srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
368 srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
369
370 // multiply 2 adjacent elements with the filter and add the result
371
372 tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
373 tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
374 resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
375
376 tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
377 tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
378 resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
379
380 __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
381 __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
382 tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
383 tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
384 resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
385
386 __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
387 __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
388 tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
389 tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
390 resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
391
392 // add and saturate the results together
393 resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
394 resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
395
396 // shift by 6 bit each 16 bit
397 resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
398 resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
399 resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
400 resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
401
402 // shrink to 8 bit each 16 bits, the first lane contain the first
403 // convolve result and the second lane contain the second convolve
404 // result
405 resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128());
406 resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128());
407
408 src_ptr += src_stride;
409
410 _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
411 _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
412
413 output_ptr += dst_stride;
414
415 // save part of the registers for next strides
416 resReg23_lo_1 = resReg45_lo_1;
417 resReg23_lo_2 = resReg45_lo_2;
418 resReg34_lo_1 = resReg56_lo_1;
419 resReg34_lo_2 = resReg56_lo_2;
420 srcReg4 = srcReg6;
421 }
422 }
423
aom_filter_block1d4_h4_sse2(const uint8_t * src_ptr,ptrdiff_t src_pixels_per_line,uint8_t * output_ptr,ptrdiff_t output_pitch,uint32_t output_height,const int16_t * filter)424 void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
425 ptrdiff_t src_pixels_per_line,
426 uint8_t *output_ptr, ptrdiff_t output_pitch,
427 uint32_t output_height,
428 const int16_t *filter) {
429 __m128i filtersReg;
430 __m128i addFilterReg32;
431 __m128i secondFilters, thirdFilters;
432 __m128i srcRegFilt32b1_1;
433 __m128i srcReg32b1;
434 unsigned int i;
435 src_ptr -= 3;
436 addFilterReg32 = _mm_set1_epi16(32);
437 filtersReg = _mm_loadu_si128((const __m128i *)filter);
438 filtersReg = _mm_srai_epi16(filtersReg, 1);
439
440 // coeffs 0 1 0 1 2 3 2 3
441 const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
442 // coeffs 4 5 4 5 6 7 6 7
443 const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
444
445 secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
446 thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
447
448 for (i = output_height; i > 0; i -= 1) {
449 srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
450
451 __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
452 __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
453 __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
454 __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
455
456 ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
457 ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
458 ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
459 ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
460
461 __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3);
462 __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5);
463
464 __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
465 __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters);
466 srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
467
468 srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128());
469
470 // shift by 6 bit each 16 bit
471 srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
472 srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
473
474 // shrink to 8 bit each 16 bits, the first lane contain the first
475 // convolve result and the second lane contain the second convolve result
476 srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
477
478 src_ptr += src_pixels_per_line;
479
480 *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
481
482 output_ptr += output_pitch;
483 }
484 }
485
aom_filter_block1d4_v4_sse2(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t out_pitch,uint32_t output_height,const int16_t * filter)486 void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
487 uint8_t *output_ptr, ptrdiff_t out_pitch,
488 uint32_t output_height,
489 const int16_t *filter) {
490 __m128i filtersReg;
491 __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
492 __m128i srcReg23, srcReg34, srcReg45, srcReg56;
493 __m128i resReg23_34, resReg45_56;
494 __m128i resReg23_34_45_56;
495 __m128i addFilterReg32, secondFilters, thirdFilters;
496 __m128i tmp_0, tmp_1;
497 unsigned int i;
498 ptrdiff_t src_stride, dst_stride;
499
500 addFilterReg32 = _mm_set1_epi16(32);
501 filtersReg = _mm_loadu_si128((const __m128i *)filter);
502 filtersReg = _mm_srai_epi16(filtersReg, 1);
503
504 // coeffs 0 1 0 1 2 3 2 3
505 const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
506 // coeffs 4 5 4 5 6 7 6 7
507 const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
508
509 secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
510 thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
511
512 // multiply the size of the source and destination stride by two
513 src_stride = src_pitch << 1;
514 dst_stride = out_pitch << 1;
515
516 srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
517 srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
518 srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
519 __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128());
520
521 srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
522 srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
523 __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128());
524
525 for (i = output_height; i > 1; i -= 2) {
526 srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
527 srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
528 srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
529 srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
530
531 // multiply 2 adjacent elements with the filter and add the result
532 tmp_0 = _mm_madd_epi16(resReg23, secondFilters);
533 tmp_1 = _mm_madd_epi16(resReg34, secondFilters);
534 resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1);
535
536 __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128());
537 __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128());
538
539 tmp_0 = _mm_madd_epi16(resReg45, thirdFilters);
540 tmp_1 = _mm_madd_epi16(resReg56, thirdFilters);
541 resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1);
542
543 // add and saturate the results together
544 resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56);
545
546 // shift by 6 bit each 16 bit
547 resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32);
548 resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6);
549
550 // shrink to 8 bit each 16 bits, the first lane contain the first
551 // convolve result and the second lane contain the second convolve
552 // result
553 resReg23_34_45_56 =
554 _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128());
555
556 src_ptr += src_stride;
557
558 *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56);
559 *((uint32_t *)(output_ptr + out_pitch)) =
560 _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4));
561
562 output_ptr += dst_stride;
563
564 // save part of the registers for next strides
565 resReg23 = resReg45;
566 resReg34 = resReg56;
567 srcReg4 = srcReg6;
568 }
569 }
570