• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <emmintrin.h>
13 
14 #include "config/av1_rtcd.h"
15 
16 #include "aom_dsp/aom_dsp_common.h"
17 #include "aom_dsp/aom_filter.h"
18 #include "aom_dsp/x86/convolve_common_intrin.h"
19 #include "av1/common/convolve.h"
20 
prepare_coeffs(const InterpFilterParams * const filter_params,const int subpel_q4,__m128i * const coeffs)21 static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
22                                   const int subpel_q4,
23                                   __m128i *const coeffs /* [4] */) {
24   const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
25       filter_params, subpel_q4 & SUBPEL_MASK);
26   const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
27   // coeffs 0 1 0 1 2 3 2 3
28   const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
29   // coeffs 4 5 4 5 6 7 6 7
30   const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
31 
32   coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0);  // coeffs 0 1 0 1 0 1 0 1
33   coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
34   coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1);  // coeffs 4 5 4 5 4 5 4 5
35   coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1);  // coeffs 6 7 6 7 6 7 6 7
36 }
37 
convolve(const __m128i * const s,const __m128i * const coeffs)38 static INLINE __m128i convolve(const __m128i *const s,
39                                const __m128i *const coeffs) {
40   const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]);
41   const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]);
42   const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]);
43   const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]);
44   const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3));
45   return d;
46 }
47 
convolve_lo_x(const __m128i * const s,const __m128i * const coeffs)48 static INLINE __m128i convolve_lo_x(const __m128i *const s,
49                                     const __m128i *const coeffs) {
50   __m128i ss[4];
51   ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
52   ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
53   ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
54   ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
55   return convolve(ss, coeffs);
56 }
57 
convolve_lo_y(const __m128i * const s,const __m128i * const coeffs)58 static INLINE __m128i convolve_lo_y(const __m128i *const s,
59                                     const __m128i *const coeffs) {
60   __m128i ss[4];
61   ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
62   ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
63   ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
64   ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
65   return convolve(ss, coeffs);
66 }
67 
convolve_hi_y(const __m128i * const s,const __m128i * const coeffs)68 static INLINE __m128i convolve_hi_y(const __m128i *const s,
69                                     const __m128i *const coeffs) {
70   __m128i ss[4];
71   ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
72   ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
73   ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
74   ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
75   return convolve(ss, coeffs);
76 }
77 
// Vertical-only 12-tap single-reference convolution, 8 pixels per column
// strip. Keeps a sliding window of interleaved row pairs in s[]: s[k] holds
// rows (k, k+1) unpacked byte-wise so _mm_madd_epi16 can consume them.
// NOTE(review): the loops advance i by 2 and j by 8 per iteration, so this
// appears to assume h is even and w is a multiple of 8 — confirm the caller
// (av1_convolve_y_sr_sse2 routes w < 8 to the C path) guarantees this.
void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
                                  const InterpFilterParams *filter_params_y,
                                  int subpel_y_qn) {
  // Center the 12-tap window: start taps/2 - 1 rows above the output row.
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint8_t *src_ptr = src - fo_vert * src_stride;
  // Rounding: add half of 2^FILTER_BITS, then arithmetic-shift right.
  const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
  const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
  __m128i coeffs[6];  // 12 taps -> 6 registers of duplicated tap pairs

  prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);

  int j = 0;
  do {
    __m128i s[12], src10, res_lo, res_hi;
    __m128i res_lo_round, res_hi_round, res16, res;
    const uint8_t *data = &src_ptr[j];

    // Preload rows 0..10 for this 8-wide column strip; src10 carries the
    // bottom row of the window across loop iterations.
    src10 = _mm_loadl_epi64((__m128i *)(data + 10 * src_stride));
    s[0] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
    s[1] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
    s[2] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
    s[3] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
    s[4] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
    s[5] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 5 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)));
    s[6] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 6 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
    s[7] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 7 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)));
    s[8] =
        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 8 * src_stride)),
                          _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)));
    s[9] = _mm_unpacklo_epi8(
        _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)), src10);

    int i = 0;
    do {
      data = &src_ptr[i * src_stride + j];
      // Complete the window with rows 10..12 for the two output rows
      // produced this iteration; src10 is reloaded with row 12 so the next
      // iteration reuses it as its "row 10".
      s[10] = _mm_unpacklo_epi8(
          src10, _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)));
      src10 = _mm_loadl_epi64((__m128i *)(data + 12 * src_stride));
      s[11] = _mm_unpacklo_epi8(
          _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)), src10);

      res_lo = convolve_lo_y_12tap(s, coeffs);  // Filter low index pixels
      res_hi = convolve_hi_y_12tap(s, coeffs);  // Filter high index pixels

      res_lo_round =
          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
      res_hi_round =
          _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

      // Narrow 32 -> 16 with signed saturation, then 16 -> 8 unsigned.
      res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
      res = _mm_packus_epi16(res16, res16);

      _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
      i++;

      // Second output row of the pair: same window shifted down one row
      // (s + 1 starts the tap sequence at rows (1, 2)).
      res_lo = convolve_lo_y_12tap(s + 1, coeffs);  // Filter low index pixels
      res_hi = convolve_hi_y_12tap(s + 1, coeffs);  // Filter high index pixels

      res_lo_round =
          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
      res_hi_round =
          _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

      res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
      res = _mm_packus_epi16(res16, res16);

      _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
      i++;

      // Slide the window down two rows.
      s[0] = s[2];
      s[1] = s[3];
      s[2] = s[4];
      s[3] = s[5];
      s[4] = s[6];
      s[5] = s[7];
      s[6] = s[8];
      s[7] = s[9];
      s[8] = s[10];
      s[9] = s[11];
    } while (i < h);
    j += 8;
  } while (j < w);
}
178 
// Vertical-only single-reference convolution dispatcher. Filters longer than
// 8 taps go to the 12-tap SSE2 kernel (or the C fallback when w < 8); the
// <= 8-tap case is handled inline with a 4-wide path and an 8-wide path,
// both using a sliding window of byte-interleaved row pairs in s[].
// NOTE(review): both inline paths consume two rows per iteration, so they
// appear to assume h is even — confirm the callers guarantee this.
void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_y_qn) {
  if (filter_params_y->taps > 8) {
    if (w < 8) {
      // 12-tap kernel needs 8-wide strips; narrow blocks use the C version.
      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_y, subpel_y_qn);
    } else {
      av1_convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn);
    }
  } else {
    // Center the filter window taps/2 - 1 rows above the output row.
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *src_ptr = src - fo_vert * src_stride;
    // Rounding: add half of 2^FILTER_BITS, then arithmetic-shift right.
    const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
    const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
    __m128i coeffs[4];

    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);

    if (w <= 4) {
      // Narrow path: 4-byte (or 2-byte) loads/stores per row.
      __m128i s[8], src6, res, res_round, res16;
      int res_int;
      // src6 carries the bottom row of the window across iterations.
      src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride));
      s[0] = _mm_unpacklo_epi8(
          _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)),
          _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)));
      s[1] = _mm_unpacklo_epi8(
          _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)),
          _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)));
      s[2] = _mm_unpacklo_epi8(
          _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)),
          _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)));
      s[3] = _mm_unpacklo_epi8(
          _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)),
          _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)));
      s[4] = _mm_unpacklo_epi8(
          _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)),
          _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)));
      s[5] = _mm_unpacklo_epi8(
          _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6);

      do {
        // Rows 6..8 complete the window for the two rows written below.
        s[6] = _mm_unpacklo_epi8(
            src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)));
        src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride));
        s[7] = _mm_unpacklo_epi8(
            _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6);

        res = convolve_lo_y(s + 0, coeffs);
        res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
        res16 = _mm_packs_epi32(res_round, res_round);
        res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));

        if (w == 2)
          *(uint16_t *)dst = (uint16_t)res_int;
        else
          *(int *)dst = res_int;

        src_ptr += src_stride;
        dst += dst_stride;

        // Second row of the pair: window shifted down one row (s + 1).
        res = convolve_lo_y(s + 1, coeffs);
        res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
        res16 = _mm_packs_epi32(res_round, res_round);
        res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));

        if (w == 2)
          *(uint16_t *)dst = (uint16_t)res_int;
        else
          *(int *)dst = res_int;

        src_ptr += src_stride;
        dst += dst_stride;

        // Slide the window down two rows.
        s[0] = s[2];
        s[1] = s[3];
        s[2] = s[4];
        s[3] = s[5];
        s[4] = s[6];
        s[5] = s[7];
        h -= 2;
      } while (h);
    } else {
      assert(!(w % 8));
      int j = 0;
      do {
        // 8-wide column strip; same window scheme with 8-byte loads.
        __m128i s[8], src6, res_lo, res_hi;
        __m128i res_lo_round, res_hi_round, res16, res;
        const uint8_t *data = &src_ptr[j];

        src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
        s[0] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
        s[1] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
        s[2] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
        s[3] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
        s[4] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
            _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
        s[5] = _mm_unpacklo_epi8(
            _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);

        int i = 0;
        do {
          data = &src_ptr[i * src_stride + j];
          s[6] = _mm_unpacklo_epi8(
              src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
          src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
          s[7] = _mm_unpacklo_epi8(
              _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);

          res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
          res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels

          res_lo_round =
              _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
          res_hi_round =
              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

          // Narrow 32 -> 16 with signed saturation, then 16 -> 8 unsigned.
          res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
          res = _mm_packus_epi16(res16, res16);

          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
          i++;

          res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
          res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels

          res_lo_round =
              _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
          res_hi_round =
              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);

          res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
          res = _mm_packus_epi16(res16, res16);

          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
          i++;

          // Slide the window down two rows.
          s[0] = s[2];
          s[1] = s[3];
          s[2] = s[4];
          s[3] = s[5];
          s[4] = s[6];
          s[5] = s[7];
        } while (i < h);
        j += 8;
      } while (j < w);
    }
  }
}
339 
// Horizontal-only 12-tap single-reference convolution, 4 output pixels per
// inner iteration. The 16-byte load plus byte-shifted unpacks arrange
// neighboring pixel pairs for convolve_lo_x_12tap, which performs the
// widening internally (hence the extra `zero` argument).
// NOTE(review): the inner loop steps j by 4, so w is assumed to be a
// multiple of 4 — the dispatcher routes w < 4 to the C path; confirm no
// other widths reach here.
void av1_convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  int subpel_x_qn,
                                  ConvolveParams *conv_params) {
  // Center the 12-tap window taps/2 - 1 pixels to the left.
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint8_t *src_ptr = src - fo_horiz;
  // Two-stage rounding: round_0 first, then the remaining FILTER_BITS.
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_0_const =
      _mm_set1_epi32((1 << conv_params->round_0) >> 1);
  const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
  const __m128i zero = _mm_setzero_si128();
  __m128i coeffs[6];  // 12 taps -> 6 registers of duplicated tap pairs

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);

  int i = 0;
  do {
    int j = 0;
    do {
      const __m128i data =
          _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
      __m128i s[4];

      // Interleave bytes at offsets (0,1), (2,3), (4,5), (6,7); the helper
      // widens and applies the remaining taps.
      s[0] = _mm_unpacklo_epi16(data, _mm_srli_si128(data, 1));
      s[1] =
          _mm_unpacklo_epi16(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
      s[2] =
          _mm_unpacklo_epi16(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
      s[3] =
          _mm_unpacklo_epi16(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));

      const __m128i res32 = convolve_lo_x_12tap(s, coeffs, zero);

      __m128i res32_round =
          _mm_sra_epi32(_mm_add_epi32(res32, round_0_const), round_0_shift);
      res32_round =
          _mm_sra_epi32(_mm_add_epi32(res32_round, round_const), round_shift);

      // Narrow 32 -> 16 with signed saturation, then 16 -> 8 unsigned.
      const __m128i res16 = _mm_packs_epi32(res32_round, zero);
      const __m128i res = _mm_packus_epi16(res16, zero);

      // memcpy avoids an unaligned/aliasing-unsafe 4-byte store.
      const int val = _mm_cvtsi128_si32(res);
      memcpy((dst + i * dst_stride + j), &val, sizeof(val));
      j += 4;
    } while (j < w);
  } while (++i < h);
}
394 
// Horizontal-only single-reference convolution dispatcher. Filters longer
// than 8 taps go to the 12-tap SSE2 kernel (or the C fallback when w < 4);
// the <= 8-tap case is handled inline with a 4-wide path and an 8-wide
// even/odd split path.
void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const int subpel_x_qn,
                            ConvolveParams *conv_params) {
  if (filter_params_x->taps > 8) {
    if (w < 4) {
      av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
                          filter_params_x, subpel_x_qn, conv_params);
    } else {
      av1_convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params);
    }
  } else {
    // Center the filter window taps/2 - 1 pixels to the left.
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *src_ptr = src - fo_horiz;
    // Two-stage rounding: round_0 first, then the remaining FILTER_BITS.
    const int bits = FILTER_BITS - conv_params->round_0;
    const __m128i round_0_const =
        _mm_set1_epi32((1 << conv_params->round_0) >> 1);
    const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
    const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
    const __m128i round_shift = _mm_cvtsi32_si128(bits);
    __m128i coeffs[4];

    assert(bits >= 0);
    assert((FILTER_BITS - conv_params->round_1) >= 0 ||
           ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

    prepare_coeffs(filter_params_x, subpel_x_qn, coeffs);

    if (w <= 4) {
      // Narrow path: one row of up to 4 pixels per iteration.
      do {
        const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
        __m128i s[4];

        // Interleave bytes at offsets (0,1), (2,3), (4,5), (6,7) so each
        // 16-bit lane pairs the two samples a tap pair multiplies.
        s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
        s[1] =
            _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
        s[2] =
            _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
        s[3] =
            _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
        const __m128i res_lo = convolve_lo_x(s, coeffs);
        __m128i res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                     round_shift);

        // Narrow 32 -> 16 with signed saturation, then 16 -> 8 unsigned.
        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
        const __m128i res = _mm_packus_epi16(res16, res16);

        int r = _mm_cvtsi128_si32(res);
        if (w == 2)
          *(uint16_t *)dst = (uint16_t)r;
        else
          *(int *)dst = r;

        src_ptr += src_stride;
        dst += dst_stride;
      } while (--h);
    } else {
      assert(!(w % 8));
      int i = 0;
      do {
        int j = 0;
        do {
          const __m128i data =
              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
          __m128i s[4];

          // Filter even-index pixels
          s[0] = data;
          s[1] = _mm_srli_si128(data, 2);
          s[2] = _mm_srli_si128(data, 4);
          s[3] = _mm_srli_si128(data, 6);
          const __m128i res_even = convolve_lo_x(s, coeffs);

          // Filter odd-index pixels
          s[0] = _mm_srli_si128(data, 1);
          s[1] = _mm_srli_si128(data, 3);
          s[2] = _mm_srli_si128(data, 5);
          s[3] = _mm_srli_si128(data, 7);
          const __m128i res_odd = convolve_lo_x(s, coeffs);

          // Rearrange pixels back into the order 0 ... 7
          const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
          const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
          __m128i res_lo_round = _mm_sra_epi32(
              _mm_add_epi32(res_lo, round_0_const), round_0_shift);
          res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                       round_shift);
          __m128i res_hi_round = _mm_sra_epi32(
              _mm_add_epi32(res_hi, round_0_const), round_0_shift);
          res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
                                       round_shift);

          // Narrow 32 -> 16 with signed saturation, then 16 -> 8 unsigned.
          const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
          const __m128i res = _mm_packus_epi16(res16, res16);

          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
          j += 8;
        } while (j < w);
      } while (++i < h);
    }
  }
}
501