1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <emmintrin.h>
13
14 #include "config/av1_rtcd.h"
15
16 #include "aom_dsp/aom_dsp_common.h"
17 #include "aom_dsp/aom_filter.h"
18 #include "aom_dsp/x86/convolve_common_intrin.h"
19 #include "av1/common/convolve.h"
20
prepare_coeffs(const InterpFilterParams * const filter_params,const int subpel_q4,__m128i * const coeffs)21 static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
22 const int subpel_q4,
23 __m128i *const coeffs /* [4] */) {
24 const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
25 filter_params, subpel_q4 & SUBPEL_MASK);
26 const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
27 // coeffs 0 1 0 1 2 3 2 3
28 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
29 // coeffs 4 5 4 5 6 7 6 7
30 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
31
32 coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1
33 coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
34 coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
35 coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7
36 }
37
convolve(const __m128i * const s,const __m128i * const coeffs)38 static INLINE __m128i convolve(const __m128i *const s,
39 const __m128i *const coeffs) {
40 const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]);
41 const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]);
42 const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]);
43 const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]);
44 const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3));
45 return d;
46 }
47
convolve_lo_x(const __m128i * const s,const __m128i * const coeffs)48 static INLINE __m128i convolve_lo_x(const __m128i *const s,
49 const __m128i *const coeffs) {
50 __m128i ss[4];
51 ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
52 ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
53 ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
54 ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
55 return convolve(ss, coeffs);
56 }
57
convolve_lo_y(const __m128i * const s,const __m128i * const coeffs)58 static INLINE __m128i convolve_lo_y(const __m128i *const s,
59 const __m128i *const coeffs) {
60 __m128i ss[4];
61 ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
62 ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
63 ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
64 ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
65 return convolve(ss, coeffs);
66 }
67
convolve_hi_y(const __m128i * const s,const __m128i * const coeffs)68 static INLINE __m128i convolve_hi_y(const __m128i *const s,
69 const __m128i *const coeffs) {
70 __m128i ss[4];
71 ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
72 ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
73 ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
74 ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
75 return convolve(ss, coeffs);
76 }
77
av1_convolve_y_sr_12tap_sse2(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,int subpel_y_qn)78 void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride,
79 uint8_t *dst, int dst_stride, int w, int h,
80 const InterpFilterParams *filter_params_y,
81 int subpel_y_qn) {
82 const int fo_vert = filter_params_y->taps / 2 - 1;
83 const uint8_t *src_ptr = src - fo_vert * src_stride;
84 const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
85 const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
86 __m128i coeffs[6];
87
88 prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);
89
90 int j = 0;
91 do {
92 __m128i s[12], src10, res_lo, res_hi;
93 __m128i res_lo_round, res_hi_round, res16, res;
94 const uint8_t *data = &src_ptr[j];
95
96 src10 = _mm_loadl_epi64((__m128i *)(data + 10 * src_stride));
97 s[0] =
98 _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
99 _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
100 s[1] =
101 _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
102 _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
103 s[2] =
104 _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
105 _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
106 s[3] =
107 _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
108 _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
109 s[4] =
110 _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
111 _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
112 s[5] =
113 _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 5 * src_stride)),
114 _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)));
115 s[6] =
116 _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 6 * src_stride)),
117 _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
118 s[7] =
119 _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 7 * src_stride)),
120 _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)));
121 s[8] =
122 _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 8 * src_stride)),
123 _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)));
124 s[9] = _mm_unpacklo_epi8(
125 _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)), src10);
126
127 int i = 0;
128 do {
129 data = &src_ptr[i * src_stride + j];
130 s[10] = _mm_unpacklo_epi8(
131 src10, _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)));
132 src10 = _mm_loadl_epi64((__m128i *)(data + 12 * src_stride));
133 s[11] = _mm_unpacklo_epi8(
134 _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)), src10);
135
136 res_lo = convolve_lo_y_12tap(s, coeffs); // Filter low index pixels
137 res_hi = convolve_hi_y_12tap(s, coeffs); // Filter high index pixels
138
139 res_lo_round =
140 _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
141 res_hi_round =
142 _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
143
144 res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
145 res = _mm_packus_epi16(res16, res16);
146
147 _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
148 i++;
149
150 res_lo = convolve_lo_y_12tap(s + 1, coeffs); // Filter low index pixels
151 res_hi = convolve_hi_y_12tap(s + 1, coeffs); // Filter high index pixels
152
153 res_lo_round =
154 _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
155 res_hi_round =
156 _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
157
158 res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
159 res = _mm_packus_epi16(res16, res16);
160
161 _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
162 i++;
163
164 s[0] = s[2];
165 s[1] = s[3];
166 s[2] = s[4];
167 s[3] = s[5];
168 s[4] = s[6];
169 s[5] = s[7];
170 s[6] = s[8];
171 s[7] = s[9];
172 s[8] = s[10];
173 s[9] = s[11];
174 } while (i < h);
175 j += 8;
176 } while (j < w);
177 }
178
av1_convolve_y_sr_sse2(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int w,int h,const InterpFilterParams * filter_params_y,const int subpel_y_qn)179 void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
180 int dst_stride, int w, int h,
181 const InterpFilterParams *filter_params_y,
182 const int subpel_y_qn) {
183 if (filter_params_y->taps > 8) {
184 if (w < 8) {
185 av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
186 filter_params_y, subpel_y_qn);
187 } else {
188 av1_convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
189 filter_params_y, subpel_y_qn);
190 }
191 } else {
192 const int fo_vert = filter_params_y->taps / 2 - 1;
193 const uint8_t *src_ptr = src - fo_vert * src_stride;
194 const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
195 const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
196 __m128i coeffs[4];
197
198 prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
199
200 if (w <= 4) {
201 __m128i s[8], src6, res, res_round, res16;
202 int res_int;
203 src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
204 s[0] = _mm_unpacklo_epi8(
205 _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
206 _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
207 s[1] = _mm_unpacklo_epi8(
208 _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
209 _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
210 s[2] = _mm_unpacklo_epi8(
211 _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
212 _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
213 s[3] = _mm_unpacklo_epi8(
214 _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
215 _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
216 s[4] = _mm_unpacklo_epi8(
217 _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
218 _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
219 s[5] = _mm_unpacklo_epi8(
220 _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);
221
222 do {
223 s[6] = _mm_unpacklo_epi8(
224 src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
225 src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
226 s[7] = _mm_unpacklo_epi8(
227 _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);
228
229 res = convolve_lo_y(s + 0, coeffs);
230 res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
231 res16 = _mm_packs_epi32(res_round, res_round);
232 res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
233
234 if (w == 2)
235 *(uint16_t *)dst = (uint16_t)res_int;
236 else
237 *(int *)dst = res_int;
238
239 src_ptr += src_stride;
240 dst += dst_stride;
241
242 res = convolve_lo_y(s + 1, coeffs);
243 res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
244 res16 = _mm_packs_epi32(res_round, res_round);
245 res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
246
247 if (w == 2)
248 *(uint16_t *)dst = (uint16_t)res_int;
249 else
250 *(int *)dst = res_int;
251
252 src_ptr += src_stride;
253 dst += dst_stride;
254
255 s[0] = s[2];
256 s[1] = s[3];
257 s[2] = s[4];
258 s[3] = s[5];
259 s[4] = s[6];
260 s[5] = s[7];
261 h -= 2;
262 } while (h);
263 } else {
264 assert(!(w % 8));
265 int j = 0;
266 do {
267 __m128i s[8], src6, res_lo, res_hi;
268 __m128i res_lo_round, res_hi_round, res16, res;
269 const uint8_t *data = &src_ptr[j];
270
271 src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
272 s[0] = _mm_unpacklo_epi8(
273 _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
274 _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
275 s[1] = _mm_unpacklo_epi8(
276 _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
277 _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
278 s[2] = _mm_unpacklo_epi8(
279 _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
280 _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
281 s[3] = _mm_unpacklo_epi8(
282 _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
283 _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
284 s[4] = _mm_unpacklo_epi8(
285 _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
286 _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
287 s[5] = _mm_unpacklo_epi8(
288 _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
289
290 int i = 0;
291 do {
292 data = &src_ptr[i * src_stride + j];
293 s[6] = _mm_unpacklo_epi8(
294 src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
295 src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
296 s[7] = _mm_unpacklo_epi8(
297 _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
298
299 res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
300 res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
301
302 res_lo_round =
303 _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
304 res_hi_round =
305 _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
306
307 res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
308 res = _mm_packus_epi16(res16, res16);
309
310 _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
311 i++;
312
313 res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
314 res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
315
316 res_lo_round =
317 _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
318 res_hi_round =
319 _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
320
321 res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
322 res = _mm_packus_epi16(res16, res16);
323
324 _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
325 i++;
326
327 s[0] = s[2];
328 s[1] = s[3];
329 s[2] = s[4];
330 s[3] = s[5];
331 s[4] = s[6];
332 s[5] = s[7];
333 } while (i < h);
334 j += 8;
335 } while (j < w);
336 }
337 }
338 }
339
// Horizontal single-reference convolution using the 12-tap helpers.
//
// Every iteration of the inner loop loads 16 source bytes, produces 4 output
// pixels and stores them with a 4-byte memcpy. Both loops are do/while, so
// at least one group of 4 pixels is always processed.
//
// Rounding happens in two stages: first by conv_params->round_0, then by
// FILTER_BITS - conv_params->round_0. Each stage adds half of the divisor
// and shifts right arithmetically. The result is packed with signed
// saturation to 16 bits and unsigned saturation to 8 bits.
void av1_convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  int subpel_x_qn,
                                  ConvolveParams *conv_params) {
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint8_t *const src_left = src - fo_horiz;
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i stage0_offset =
      _mm_set1_epi32((1 << conv_params->round_0) >> 1);
  const __m128i stage1_offset = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i stage0_shift = _mm_cvtsi32_si128(conv_params->round_0);
  const __m128i stage1_shift = _mm_cvtsi32_si128(bits);
  const __m128i zero = _mm_setzero_si128();
  __m128i coeffs[6];

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);

  int row = 0;
  do {
    int col = 0;
    do {
      const __m128i px =
          _mm_loadu_si128((const __m128i *)(src_left + (row * src_stride + col)));
      __m128i s[4];

      // s[k] interleaves the 16-bit words of `px` shifted right by 2k bytes
      // with those of `px` shifted right by 2k + 1 bytes.
      s[0] = _mm_unpacklo_epi16(px, _mm_srli_si128(px, 1));
      s[1] = _mm_unpacklo_epi16(_mm_srli_si128(px, 2), _mm_srli_si128(px, 3));
      s[2] = _mm_unpacklo_epi16(_mm_srli_si128(px, 4), _mm_srli_si128(px, 5));
      s[3] = _mm_unpacklo_epi16(_mm_srli_si128(px, 6), _mm_srli_si128(px, 7));

      const __m128i sum = convolve_lo_x_12tap(s, coeffs, zero);

      const __m128i stage0 =
          _mm_sra_epi32(_mm_add_epi32(sum, stage0_offset), stage0_shift);
      const __m128i stage1 =
          _mm_sra_epi32(_mm_add_epi32(stage0, stage1_offset), stage1_shift);

      const __m128i out16 = _mm_packs_epi32(stage1, zero);
      const __m128i out8 = _mm_packus_epi16(out16, zero);

      // Only the low 4 bytes carry pixels; copy them out alignment-safely.
      const int packed = _mm_cvtsi128_si32(out8);
      memcpy(dst + row * dst_stride + col, &packed, sizeof(packed));
      col += 4;
    } while (col < w);
  } while (++row < h);
}
394
// Horizontal single-reference convolution.
//
// Filters with more than 8 taps are forwarded: to av1_convolve_x_sr_c() when
// w < 4, otherwise to av1_convolve_x_sr_12tap_sse2().
//
// For the remaining filters two paths exist:
//   * w <= 4: one 16-byte load per row yields 4 pixels; 2 bytes are stored
//     when w == 2 and 4 bytes otherwise.
//   * w > 4:  w must be a multiple of 8 (asserted); 8 pixels per 16-byte
//     load, computed as separate even/odd results and re-interleaved.
// Both paths always process at least one row.
//
// Rounding happens in two stages: first by conv_params->round_0, then by
// FILTER_BITS - conv_params->round_0. Each stage adds half of the divisor
// and shifts right arithmetically. The result is packed with signed
// saturation to 16 bits and unsigned saturation to 8 bits.
void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const int subpel_x_qn,
                            ConvolveParams *conv_params) {
  if (filter_params_x->taps > 8) {
    if (w < 4) {
      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_x, subpel_x_qn, conv_params);
    } else {
      av1_convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params);
    }
    return;
  }

  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - fo_horiz;
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_0_const =
      _mm_set1_epi32((1 << conv_params->round_0) >> 1);
  const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
  __m128i coeffs[4];

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);

  if (w <= 4) {
    do {
      const __m128i data = _mm_loadu_si128((const __m128i *)src_ptr);
      __m128i s[4];

      // s[k] interleaves the bytes of `data` shifted right by 2k bytes with
      // those of `data` shifted right by 2k + 1 bytes.
      s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
      s[1] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
      s[2] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
      s[3] =
          _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
      const __m128i res_lo = convolve_lo_x(s, coeffs);
      __m128i res_lo_round =
          _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
      res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                   round_shift);

      const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
      const __m128i res = _mm_packus_epi16(res16, res16);

      // Store through memcpy: `dst` is a byte pointer with no alignment
      // guarantee, so writing through a cast to uint16_t * or int * would be
      // a misaligned, type-punned access.
      const int r = _mm_cvtsi128_si32(res);
      if (w == 2) {
        const uint16_t r16 = (uint16_t)r;
        memcpy(dst, &r16, sizeof(r16));
      } else {
        memcpy(dst, &r, sizeof(r));
      }

      src_ptr += src_stride;
      dst += dst_stride;
      // `> 0` rather than `!= 0` so a non-positive h stops after the first
      // row, as the other do/while loops in this file do.
    } while (--h > 0);
  } else {
    assert(!(w % 8));
    int i = 0;
    do {
      int j = 0;
      do {
        const __m128i data =
            _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j]);
        __m128i s[4];

        // Filter even-index pixels
        s[0] = data;
        s[1] = _mm_srli_si128(data, 2);
        s[2] = _mm_srli_si128(data, 4);
        s[3] = _mm_srli_si128(data, 6);
        const __m128i res_even = convolve_lo_x(s, coeffs);

        // Filter odd-index pixels
        s[0] = _mm_srli_si128(data, 1);
        s[1] = _mm_srli_si128(data, 3);
        s[2] = _mm_srli_si128(data, 5);
        s[3] = _mm_srli_si128(data, 7);
        const __m128i res_odd = convolve_lo_x(s, coeffs);

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
        __m128i res_lo_round = _mm_sra_epi32(
            _mm_add_epi32(res_lo, round_0_const), round_0_shift);
        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                     round_shift);
        __m128i res_hi_round = _mm_sra_epi32(
            _mm_add_epi32(res_hi, round_0_const), round_0_shift);
        res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
                                     round_shift);

        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
        const __m128i res = _mm_packus_epi16(res16, res16);

        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
        j += 8;
      } while (j < w);
    } while (++i < h);
  }
}
501