/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/mem_neon.h"

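// Compute four horizontally-filtered output pixels from four overlapping
// 8-tap windows. Each aom_sdotq_s16() call (a Neon-SVE bridge helper,
// expected to map to the SVE SDOT instruction on 16-bit elements) produces
// two 64-bit partial sums per window, which are pairwise-added and narrowed
// below.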
static INLINE uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter,
                                              uint16x4_t max) {
  int64x2_t sum[4];

  sum[0] = aom_sdotq_s16(vdupq_n_s64(0), s[0], filter);
  sum[1] = aom_sdotq_s16(vdupq_n_s64(0), s[1], filter);
  sum[2] = aom_sdotq_s16(vdupq_n_s64(0), s[2], filter);
  sum[3] = aom_sdotq_s16(vdupq_n_s64(0), s[3], filter);

  int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]);
  int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));

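  // Narrow with a rounding right shift by FILTER_BITS and saturate to
  // unsigned 16 bits, then clamp to the maximum pixel value for this bit
  // depth.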
  uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS);
  return vmin_u16(res, max);
}

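// As highbd_convolve8_4_h() above, but computing eight output pixels per
// call.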
static INLINE uint16x8_t highbd_convolve8_8_h(int16x8_t s[8], int16x8_t filter,
                                              uint16x8_t max) {
  int64x2_t sum[8];

  sum[0] = aom_sdotq_s16(vdupq_n_s64(0), s[0], filter);
  sum[1] = aom_sdotq_s16(vdupq_n_s64(0), s[1], filter);
  sum[2] = aom_sdotq_s16(vdupq_n_s64(0), s[2], filter);
  sum[3] = aom_sdotq_s16(vdupq_n_s64(0), s[3], filter);
  sum[4] = aom_sdotq_s16(vdupq_n_s64(0), s[4], filter);
  sum[5] = aom_sdotq_s16(vdupq_n_s64(0), s[5], filter);
  sum[6] = aom_sdotq_s16(vdupq_n_s64(0), s[6], filter);
  sum[7] = aom_sdotq_s16(vdupq_n_s64(0), s[7], filter);

  int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]);
  int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]);
  int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]);
  int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]);

  int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23));
  int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67));

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS),
                                vqrshrun_n_s32(sum4567, FILTER_BITS));
  return vminq_u16(res, max);
}

void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride,
                                    uint8_t *dst8, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int width, int height, int bd) {
  assert(x_step_q4 == 16);
  assert(width >= 4 && height >= 4);
  (void)filter_y;
  (void)x_step_q4;
  (void)y_step_q4;

  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

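  // Back up the source pointer by (SUBPEL_TAPS / 2 - 1) = 3 columns so that
  // the 8-tap filter window is centered on the output pixel.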
  src -= SUBPEL_TAPS / 2 - 1;

  const int16x8_t filter = vld1q_s16(filter_x);

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    const int16_t *s = (const int16_t *)src;
    uint16_t *d = dst;

    do {
      int16x8_t s0[4], s1[4], s2[4], s3[4];
      load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
      load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
      load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
      load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);

      uint16x4_t d0 = highbd_convolve8_4_h(s0, filter, max);
      uint16x4_t d1 = highbd_convolve8_4_h(s1, filter, max);
      uint16x4_t d2 = highbd_convolve8_4_h(s2, filter, max);
      uint16x4_t d3 = highbd_convolve8_4_h(s3, filter, max);

      store_u16_4x4(d, dst_stride, d0, d1, d2, d3);

      s += 4 * src_stride;
      d += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    do {
      const int16_t *s = (const int16_t *)src;
      uint16_t *d = dst;
      int w = width;

      do {
        int16x8_t s0[8], s1[8], s2[8], s3[8];
        load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3],
                     &s0[4], &s0[5], &s0[6], &s0[7]);
        load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3],
                     &s1[4], &s1[5], &s1[6], &s1[7]);
        load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3],
                     &s2[4], &s2[5], &s2[6], &s2[7]);
        load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3],
                     &s3[4], &s3[5], &s3[6], &s3[7]);

        uint16x8_t d0 = highbd_convolve8_8_h(s0, filter, max);
        uint16x8_t d1 = highbd_convolve8_8_h(s1, filter, max);
        uint16x8_t d2 = highbd_convolve8_8_h(s2, filter, max);
        uint16x8_t d3 = highbd_convolve8_8_h(s3, filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        s += 8;
        d += 8;
        w -= 8;
      } while (w != 0);
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height > 0);
  }
}

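// Byte-index table for vqtbl2q_s8(): each pair of consecutive byte indices
// selects one 16-bit element from a pair of 128-bit registers (indices 0-15
// address the first register, 16-31 the second), implementing the
// transpose-and-concatenate described in transpose_concat_4x4() below.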
DECLARE_ALIGNED(16, static const uint8_t, kDotProdTranConcatTbl[32]) = {
  0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27,
  4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31
};

DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
  // Shift left and insert new last column in transposed 4x4 block.
  2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 24, 25,
  // Shift left and insert two new columns in transposed 4x4 block.
  4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15, 24, 25, 26, 27,
  // Shift left and insert three new columns in transposed 4x4 block.
  6, 7, 16, 17, 18, 19, 20, 21, 14, 15, 24, 25, 26, 27, 28, 29
};

static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1,
                                        int16x4_t s2, int16x4_t s3,
                                        int16x8_t res[2],
                                        uint8x16_t permute_tbl[2]) {
  // Transpose 16-bit elements and concatenate result rows as follows:
  // s0: 00, 01, 02, 03
  // s1: 10, 11, 12, 13
  // s2: 20, 21, 22, 23
  // s3: 30, 31, 32, 33
  //
  // res[0]: 00 10 20 30 01 11 21 31
  // res[1]: 02 12 22 32 03 13 23 33
  //
  // The 'permute_tbl' is always 'kDotProdTranConcatTbl' above. Passing it
  // as an argument is preferable to loading it directly from memory as this
  // inline helper is called many times from the same parent function.

  int8x16x2_t samples = { vreinterpretq_s8_s16(vcombine_s16(s0, s1)),
                          vreinterpretq_s8_s16(vcombine_s16(s2, s3)) };

  res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples, permute_tbl[0]));
  res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples, permute_tbl[1]));
}

static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1,
                                        int16x8_t s2, int16x8_t s3,
                                        int16x8_t res[4],
                                        uint8x16_t permute_tbl[2]) {
  // Transpose 16-bit elements and concatenate result rows as follows:
  // s0: 00, 01, 02, 03, 04, 05, 06, 07
  // s1: 10, 11, 12, 13, 14, 15, 16, 17
  // s2: 20, 21, 22, 23, 24, 25, 26, 27
  // s3: 30, 31, 32, 33, 34, 35, 36, 37
  //
  // res[0]: 00 10 20 30 01 11 21 31
  // res[1]: 02 12 22 32 03 13 23 33
  // res[2]: 04 14 24 34 05 15 25 35
  // res[3]: 06 16 26 36 07 17 27 37
  //
  // The 'permute_tbl' is always 'kDotProdTranConcatTbl' above. Passing it
  // as an argument is preferable to loading it directly from memory as this
  // inline helper is called many times from the same parent function.

  int8x16x2_t samples_lo = {
    vreinterpretq_s8_s16(vcombine_s16(vget_low_s16(s0), vget_low_s16(s1))),
    vreinterpretq_s8_s16(vcombine_s16(vget_low_s16(s2), vget_low_s16(s3)))
  };

  res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_lo, permute_tbl[0]));
  res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_lo, permute_tbl[1]));

  int8x16x2_t samples_hi = {
    vreinterpretq_s8_s16(vcombine_s16(vget_high_s16(s0), vget_high_s16(s1))),
    vreinterpretq_s8_s16(vcombine_s16(vget_high_s16(s2), vget_high_s16(s3)))
  };

  res[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_hi, permute_tbl[0]));
  res[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_hi, permute_tbl[1]));
}

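// Apply a two-register TBL permute to each of the four {t0[i], t1[i]}
// register pairs.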
static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4],
                                  uint8x16_t tbl, int16x8_t res[4]) {
  int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]),
                           vreinterpretq_s8_s16(t1[0]) };
  int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]),
                           vreinterpretq_s8_s16(t1[1]) };
  int8x16x2_t samples2 = { vreinterpretq_s8_s16(t0[2]),
                           vreinterpretq_s8_s16(t1[2]) };
  int8x16x2_t samples3 = { vreinterpretq_s8_s16(t0[3]),
                           vreinterpretq_s8_s16(t1[3]) };

  res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl));
  res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl));
  res[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples2, tbl));
  res[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples3, tbl));
}

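// Apply a two-register TBL permute to each of the two {t0[i], t1[i]}
// register pairs.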
static INLINE void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2],
                                  uint8x16_t tbl, int16x8_t res[2]) {
  int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]),
                           vreinterpretq_s8_s16(t1[0]) };
  int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]),
                           vreinterpretq_s8_s16(t1[1]) };

  res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl));
  res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl));
}

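// Compute four vertically-filtered output pixels. aom_svdot_lane_s16() (a
// Neon-SVE bridge helper, expected to map to the indexed form of the SVE
// SDOT instruction) selects one 64-bit segment of 'filter': lane 0 applies
// taps 0-3 to the first four rows, lane 1 applies taps 4-7 to the last four.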
static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t samples_lo[2],
                                              int16x8_t samples_hi[2],
                                              int16x8_t filter,
                                              uint16x4_t max) {
  int64x2_t sum[2];

  sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
  sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], filter, 1);

  sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
  sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1);

  int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1]));

  uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS);

  return vmin_u16(res, max);
}

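// As highbd_convolve8_4_v() above, but computing eight output pixels per
// call.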
static INLINE uint16x8_t highbd_convolve8_8_v(int16x8_t samples_lo[4],
                                              int16x8_t samples_hi[4],
                                              int16x8_t filter,
                                              uint16x8_t max) {
  int64x2_t sum[4];

  sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0);
  sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], filter, 1);

  sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0);
  sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1);

  sum[2] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0);
  sum[2] = aom_svdot_lane_s16(sum[2], samples_hi[2], filter, 1);

  sum[3] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0);
  sum[3] = aom_svdot_lane_s16(sum[3], samples_hi[3], filter, 1);

  int32x4_t res0 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1]));
  int32x4_t res1 = vcombine_s32(vmovn_s64(sum[2]), vmovn_s64(sum[3]));

  uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS),
                                vqrshrun_n_s32(res1, FILTER_BITS));

  return vminq_u16(res, max);
}

void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride,
                                   uint8_t *dst8, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int width, int height, int bd) {
  assert(y_step_q4 == 16);
  assert(width >= 4 && height >= 4);
  (void)filter_x;
  (void)y_step_q4;
  (void)x_step_q4;

  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

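  // Back up the source pointer by (SUBPEL_TAPS / 2 - 1) = 3 rows so that the
  // 8-tap filter window is centered on the output pixel.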
  src -= (SUBPEL_TAPS / 2 - 1) * src_stride;

  const int16x8_t y_filter = vld1q_s16(filter_y);

  uint8x16_t tran_concat_tbl[2];
  tran_concat_tbl[0] = vld1q_u8(kDotProdTranConcatTbl);
  tran_concat_tbl[1] = vld1q_u8(kDotProdTranConcatTbl + 16);
  uint8x16_t merge_block_tbl[3];
  merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl);
  merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16);
  merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32);

  if (width == 4) {
    const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
    int16_t *s = (int16_t *)src;

    int16x4_t s0, s1, s2, s3, s4, s5, s6;
    load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
    s += 7 * src_stride;

    // This operation combines a conventional transpose and the sample permute
    // required before computing the dot product.
    int16x8_t s0123[2], s1234[2], s2345[2], s3456[2];
    transpose_concat_4x4(s0, s1, s2, s3, s0123, tran_concat_tbl);
    transpose_concat_4x4(s1, s2, s3, s4, s1234, tran_concat_tbl);
    transpose_concat_4x4(s2, s3, s4, s5, s2345, tran_concat_tbl);
    transpose_concat_4x4(s3, s4, s5, s6, s3456, tran_concat_tbl);

    do {
      int16x4_t s7, s8, s9, s10;
      load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);

      int16x8_t s4567[2], s5678[2], s6789[2], s78910[2];

      // Transpose and shuffle the 4 lines that were loaded.
      transpose_concat_4x4(s7, s8, s9, s10, s78910, tran_concat_tbl);

      // Merge new data into block from previous iteration.
      aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[0], s4567);
      aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[1], s5678);
      aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[2], s6789);

      uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, y_filter, max);
      uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, y_filter, max);
      uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, y_filter, max);
      uint16x4_t d3 = highbd_convolve8_4_v(s3456, s78910, y_filter, max);

      store_u16_4x4(dst, dst_stride, d0, d1, d2, d3);

      // Prepare block for next iteration - re-using as much as possible.
      // Shuffle everything up four rows.
      s0123[0] = s4567[0];
      s0123[1] = s4567[1];
      s1234[0] = s5678[0];
      s1234[1] = s5678[1];
      s2345[0] = s6789[0];
      s2345[1] = s6789[1];
      s3456[0] = s78910[0];
      s3456[1] = s78910[1];

      s += 4 * src_stride;
      dst += 4 * dst_stride;
      height -= 4;
    } while (height != 0);
  } else {
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    do {
      int h = height;
      int16_t *s = (int16_t *)src;
      uint16_t *d = dst;

      int16x8_t s0, s1, s2, s3, s4, s5, s6;
      load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
      s += 7 * src_stride;

      // This operation combines a conventional transpose and the sample
      // permute required before computing the dot product.
      int16x8_t s0123[4], s1234[4], s2345[4], s3456[4];
      transpose_concat_8x4(s0, s1, s2, s3, s0123, tran_concat_tbl);
      transpose_concat_8x4(s1, s2, s3, s4, s1234, tran_concat_tbl);
      transpose_concat_8x4(s2, s3, s4, s5, s2345, tran_concat_tbl);
      transpose_concat_8x4(s3, s4, s5, s6, s3456, tran_concat_tbl);

      do {
        int16x8_t s7, s8, s9, s10;
        load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);

        int16x8_t s4567[4], s5678[4], s6789[4], s78910[4];

        // Transpose and shuffle the 4 lines that were loaded.
        transpose_concat_8x4(s7, s8, s9, s10, s78910, tran_concat_tbl);

        // Merge new data into block from previous iteration.
        aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[0], s4567);
        aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[1], s5678);
        aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[2], s6789);

        uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, y_filter, max);
        uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, y_filter, max);
        uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, y_filter, max);
        uint16x8_t d3 = highbd_convolve8_8_v(s3456, s78910, y_filter, max);

        store_u16_8x4(d, dst_stride, d0, d1, d2, d3);

        // Prepare block for next iteration - re-using as much as possible.
        // Shuffle everything up four rows.
        s0123[0] = s4567[0];
        s0123[1] = s4567[1];
        s0123[2] = s4567[2];
        s0123[3] = s4567[3];

        s1234[0] = s5678[0];
        s1234[1] = s5678[1];
        s1234[2] = s5678[2];
        s1234[3] = s5678[3];

        s2345[0] = s6789[0];
        s2345[1] = s6789[1];
        s2345[2] = s6789[2];
        s2345[3] = s6789[3];

        s3456[0] = s78910[0];
        s3456[1] = s78910[1];
        s3456[2] = s78910[2];
        s3456[3] = s78910[3];

        s += 4 * src_stride;
        d += 4 * dst_stride;
        h -= 4;
      } while (h != 0);
      src += 8;
      dst += 8;
      width -= 8;
    } while (width != 0);
  }
}