/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/mem_neon.h"

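// The 8-tap convolutions below use the SVE SDOT instruction (through the
// aom_sdotq_s16 / aom_svdot_lane_s16 bridge helpers) to accumulate groups of
// four 16-bit products directly into 64-bit lanes, avoiding intermediate
// 32-bit widening steps.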
static INLINE uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter,
                                              uint16x4_t max) {
  int64x2_t sum[4];

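  // Each aom_sdotq_s16 call yields two 64-bit partial sums: lane 0 holds the
  // dot product of the first four taps and lane 1 that of the last four.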
  sum[0] = aom_sdotq_s16(vdupq_n_s64(0), s[0], filter);
  sum[1] = aom_sdotq_s16(vdupq_n_s64(0), s[1], filter);
  sum[2] = aom_sdotq_s16(vdupq_n_s64(0), s[2], filter);
  sum[3] = aom_sdotq_s16(vdupq_n_s64(0), s[3], filter);

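  // Pairwise-add so that each 64-bit lane holds one pixel's complete 8-tap
  // sum, then narrow to 32 bits.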
  int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]);
  int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));

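  // Rounding right shift by FILTER_BITS with unsigned saturation, then clamp
  // to the bit-depth maximum.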
  uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
  return vmin_u16(res, max);
}

static INLINE uint16x8_t highbd_convolve8_8_h(int16x8_t s[8], int16x8_t filter,
                                              uint16x8_t max) {
  int64x2_t sum[8];

  sum[0] = aom_sdotq_s16(vdupq_n_s64(0), s[0], filter);
  sum[1] = aom_sdotq_s16(vdupq_n_s64(0), s[1], filter);
  sum[2] = aom_sdotq_s16(vdupq_n_s64(0), s[2], filter);
  sum[3] = aom_sdotq_s16(vdupq_n_s64(0), s[3], filter);
  sum[4] = aom_sdotq_s16(vdupq_n_s64(0), s[4], filter);
  sum[5] = aom_sdotq_s16(vdupq_n_s64(0), s[5], filter);
  sum[6] = aom_sdotq_s16(vdupq_n_s64(0), s[6], filter);
  sum[7] = aom_sdotq_s16(vdupq_n_s64(0), s[7], filter);

  int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]);
  int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]);
  int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]);
  int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
                                vqrshrun_n_s32(sum4567, FILTER_BITS));
  return vminq_u16(res, max);
}

void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride,
                                    uint8_t *dst8, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int width, int height, int bd) {
  assert(x_step_q4 == 16);
  assert(width >= 4 && height >= 4);
  (void)filter_y;
  (void)x_step_q4;
  (void)y_step_q4;

  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

  src -= SUBPEL_TAPS / 2 - 1;

  const int16x8_t filter = vld1q_s16(filter_x);

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    do {
      int16x8_t s0[4], s1[4], s2[4], s3[4];
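      // Load four overlapping 8-sample windows per row, one per output pixel.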
      load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_convolve8_4_h(s0, filter, max);
      uint16x4_t d1 = highbd_convolve8_4_h(s1, filter, max);
      uint16x4_t d2 = highbd_convolve8_4_h(s2, filter, max);
      uint16x4_t d3 = highbd_convolve8_4_h(s3, filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int w = width;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 = highbd_convolve8_8_h(s0, filter, max);
        uint16x8_t d1 = highbd_convolve8_8_h(s1, filter, max);
        uint16x8_t d2 = highbd_convolve8_8_h(s2, filter, max);
        uint16x8_t d3 = highbd_convolve8_8_h(s3, filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  }
}

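// Byte-index table for TBL that transposes a 4x4 block of 16-bit elements
// and concatenates the result rows (see transpose_concat_4x4 below).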
DECLARE_ALIGNED(16, static const uint8_t, kDotProdTranConcatTbl[32]) = {
  0, 1, 8,  9,  16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27,
  4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31
};

DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
  // Shift left and insert new last column in transposed 4x4 block.
  2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 24, 25,
  // Shift left and insert two new columns in transposed 4x4 block.
  4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15, 24, 25, 26, 27,
  // Shift left and insert three new columns in transposed 4x4 block.
  6, 7, 16, 17, 18, 19, 20, 21, 14, 15, 24, 25, 26, 27, 28, 29
};

static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1,
                                        int16x4_t s2, int16x4_t s3,
                                        int16x8_t res[2],
                                        uint8x16_t permute_tbl[2]) {
  // Transpose 16-bit elements and concatenate result rows as follows:
  // s0: 00, 01, 02, 03
  // s1: 10, 11, 12, 13
  // s2: 20, 21, 22, 23
  // s3: 30, 31, 32, 33
  //
  // res[0]: 00 10 20 30 01 11 21 31
  // res[1]: 02 12 22 32 03 13 23 33
  //
  // The 'permute_tbl' is always 'kDotProdTranConcatTbl' above. Passing it
  // as an argument is preferable to loading it directly from memory as this
  // inline helper is called many times from the same parent function.

  int8x16x2_t samples = { vreinterpretq_s8_s16(vcombine_s16(s0, s1)),
                          vreinterpretq_s8_s16(vcombine_s16(s2, s3)) };

  res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples, permute_tbl[0]));
  res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples, permute_tbl[1]));
}

static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1,
                                        int16x8_t s2, int16x8_t s3,
                                        int16x8_t res[4],
                                        uint8x16_t permute_tbl[2]) {
  // Transpose 16-bit elements and concatenate result rows as follows:
  // s0: 00, 01, 02, 03, 04, 05, 06, 07
  // s1: 10, 11, 12, 13, 14, 15, 16, 17
  // s2: 20, 21, 22, 23, 24, 25, 26, 27
  // s3: 30, 31, 32, 33, 34, 35, 36, 37
  //
  // res[0]: 00 10 20 30 01 11 21 31
  // res[1]: 02 12 22 32 03 13 23 33
  // res[2]: 04 14 24 34 05 15 25 35
  // res[3]: 06 16 26 36 07 17 27 37
  //
  // The 'permute_tbl' is always 'kDotProdTranConcatTbl' above. Passing it
  // as an argument is preferable to loading it directly from memory as this
  // inline helper is called many times from the same parent function.

  int8x16x2_t samples_lo = {
    vreinterpretq_s8_s16(vcombine_s16(vget_low_s16(s0), vget_low_s16(s1))),
    vreinterpretq_s8_s16(vcombine_s16(vget_low_s16(s2), vget_low_s16(s3)))
  };

  res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_lo, permute_tbl[0]));
  res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_lo, permute_tbl[1]));

  int8x16x2_t samples_hi = {
    vreinterpretq_s8_s16(vcombine_s16(vget_high_s16(s0), vget_high_s16(s1))),
    vreinterpretq_s8_s16(vcombine_s16(vget_high_s16(s2), vget_high_s16(s3)))
  };

  res[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_hi, permute_tbl[0]));
  res[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_hi, permute_tbl[1]));
}

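// Helpers that apply the same TBL permute to each element of a pair of
// vector arrays; used with kDotProdMergeBlockTbl to shift the previously
// transposed columns left and merge in the newly loaded ones.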
static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4],
                                  uint8x16_t tbl, int16x8_t res[4]) {
  int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]),
                           vreinterpretq_s8_s16(t1[0]) };
  int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]),
                           vreinterpretq_s8_s16(t1[1]) };
  int8x16x2_t samples2 = { vreinterpretq_s8_s16(t0[2]),
                           vreinterpretq_s8_s16(t1[2]) };
  int8x16x2_t samples3 = { vreinterpretq_s8_s16(t0[3]),
                           vreinterpretq_s8_s16(t1[3]) };

  res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl));
  res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl));
  res[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples2, tbl));
  res[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples3, tbl));
}

static INLINE void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2],
                                  uint8x16_t tbl, int16x8_t res[2]) {
  int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]),
                           vreinterpretq_s8_s16(t1[0]) };
  int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]),
                           vreinterpretq_s8_s16(t1[1]) };

  res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl));
  res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl));
}

static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t samples_lo[2],
                                              int16x8_t samples_hi[2],
                                              int16x8_t filter,
                                              uint16x4_t max) {
  int64x2_t sum[2];

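  // Accumulate taps 0-3 (filter lane 0) against the first four transposed
  // rows, then taps 4-7 (lane 1) against the last four.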
  sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
  sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], filter, 1);

  sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
  sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1);

  int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1]));

  uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS);

  return vmin_u16(res, max);
}

static INLINE uint16x8_t highbd_convolve8_8_v(int16x8_t samples_lo[4],
                                              int16x8_t samples_hi[4],
                                              int16x8_t filter,
                                              uint16x8_t max) {
  int64x2_t sum[4];

  sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
  sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], filter, 1);

  sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
  sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1);

  sum[2] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0);
  sum[2] = aom_svdot_lane_s16(sum[2], samples_hi[2], filter, 1);

  sum[3] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0);
  sum[3] = aom_svdot_lane_s16(sum[3], samples_hi[3], filter, 1);

  int32x4_t res0 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1]));
  int32x4_t res1 = vcombine_s32(vmovn_s64(sum[2]), vmovn_s64(sum[3]));

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
                                vqrshrun_n_s32(res1, FILTER_BITS));

  return vminq_u16(res, max);
}

void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride,
                                   uint8_t *dst8, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int width, int height, int bd) {
  assert(y_step_q4 == 16);
  assert(width >= 4 && height >= 4);
  (void)filter_x;
  (void)y_step_q4;
  (void)x_step_q4;

  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

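  // Back src up (SUBPEL_TAPS / 2 - 1) rows so the filter window is centered
  // on each output row.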
  src -= (SUBPEL_TAPS / 2 - 1) * src_stride;

  const int16x8_t y_filter = vld1q_s16(filter_y);

  uint8x16_t tran_concat_tbl[2];
  tran_concat_tbl[0] = vld1q_u8(kDotProdTranConcatTbl);
  tran_concat_tbl[1] = vld1q_u8(kDotProdTranConcatTbl + 16);
  uint8x16_t merge_block_tbl[3];
  merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl);
  merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16);
  merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32);

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    int16_t *s = (int16_t *)src;

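    // Load the seven rows needed to prime the 8-tap filter; each loop
    // iteration then loads four new rows and produces four output rows.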
    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    // This operation combines a conventional transpose and the sample permute
    // required before computing the dot product.
    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
    transpose_concat_4x4(s0, s1, s2, s3, s0123, tran_concat_tbl);
    transpose_concat_4x4(s1, s2, s3, s4, s1234, tran_concat_tbl);
    transpose_concat_4x4(s2, s3, s4, s5, s2345, tran_concat_tbl);
    transpose_concat_4x4(s3, s4, s5, s6, s3456, tran_concat_tbl);

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      int16x8_t s4567[2], s5678[2], s6789[2], s78910[2];

      // Transpose and shuffle the 4 lines that were loaded.
      transpose_concat_4x4(s7, s8, s9, s10, s78910, tran_concat_tbl);

      // Merge new data into block from previous iteration.
      aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[0], s4567);
      aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[1], s5678);
      aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[2], s6789);

      uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, y_filter, max);
      uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, y_filter, max);
      uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, y_filter, max);
      uint16x4_t d3 = highbd_convolve8_4_v(s3456, s78910, y_filter, max);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123[0] = s4567[0];
      s0123[1] = s4567[1];
      s1234[0] = s5678[0];
      s1234[1] = s5678[1];
      s2345[0] = s6789[0];
      s2345[1] = s6789[1];
      s3456[0] = s78910[0];
      s3456[1] = s78910[1];

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    do {
      int h = height;
      int16_t *s = (int16_t *)src;
      uint16_t *d = dst;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      // This operation combines a conventional transpose and the sample
      // permute required before computing the dot product.
      int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
      transpose_concat_8x4(s0, s1, s2, s3, s0123, tran_concat_tbl);
      transpose_concat_8x4(s1, s2, s3, s4, s1234, tran_concat_tbl);
      transpose_concat_8x4(s2, s3, s4, s5, s2345, tran_concat_tbl);
      transpose_concat_8x4(s3, s4, s5, s6, s3456, tran_concat_tbl);

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        int16x8_t s4567[4], s5678[4], s6789[4], s78910[4];

        // Transpose and shuffle the 4 lines that were loaded.
        transpose_concat_8x4(s7, s8, s9, s10, s78910, tran_concat_tbl);

        // Merge new data into block from previous iteration.
        aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[0], s4567);
        aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[1], s5678);
        aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[2], s6789);

        uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, y_filter, max);
        uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, y_filter, max);
        uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, y_filter, max);
        uint16x8_t d3 = highbd_convolve8_8_v(s3456, s78910, y_filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123[0] = s4567[0];
        s0123[1] = s4567[1];
        s0123[2] = s4567[2];
        s0123[3] = s4567[3];

        s1234[0] = s5678[0];
        s1234[1] = s5678[1];
        s1234[2] = s5678[2];
        s1234[3] = s5678[3];

        s2345[0] = s6789[0];
        s2345[1] = s6789[1];
        s2345[2] = s6789[2];
        s2345[3] = s6789[3];

        s3456[0] = s78910[0];
        s3456[1] = s78910[1];
        s3456[2] = s78910[2];
        s3456[3] = s78910[3];

        s += 4 * src_stride;
        d += 4 * dst_stride;
        h -= 4;
      } while (h != 0);
      src += 8;
      dst += 8;
      width -= 8;
    } while (width != 0);
  }
}