/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
#include "av1/common/common.h"
#include "av1/common/arm/convolve_neon.h"
#include "av1/common/arm/mem_neon.h"
#include "av1/common/arm/transpose_neon.h"

/* Wiener filter 2D
   Apply the horizontal filter and store the result in a temporary buffer.
   Then apply the vertical filter, writing the final pixel values over dst.
*/
void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h,
                                      const ConvolveParams *conv_params) {
  uint16_t *d_tmp;
  uint8_t *d;
  const uint8_t *src_ptr, *s_tmp;
  uint16_t *dst_ptr;
  (void)x_step_q4;
  (void)y_step_q4;

  int width, height;
  const int bd = 8;
  const int intermediate_height = h + SUBPEL_TAPS - 1;
  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
  int16_t filter_x_tmp[7], filter_y_tmp[7];

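  /* The horizontal pass produces 16-bit intermediate pixels for
     intermediate_height (= h + SUBPEL_TAPS - 1) rows, so the scratch buffer
     carries HORIZ_EXTRA_ROWS rows of headroom beyond MAX_SB_SIZE. */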
  DECLARE_ALIGNED(16, uint16_t,
                  temp[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);

  assert(x_step_q4 == 16 && y_step_q4 == 16);
  assert(!(w % 8));

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  assert(filter_x[7] == 0);
  assert(filter_y[7] == 0);

  /* Assumption: the horizontal filtering output will not exceed 15 bits:
     (bd + 1 + FILTER_BITS - conv_params->round_0) <= 15
     16 - conv_params->round_0 <= 15  =>  conv_params->round_0 >= 1
  */
  assert((conv_params->round_0) >= 1);

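  /* The "add src" part of the Wiener filter is folded into the convolution:
     adding (1 << FILTER_BITS) to the centre tap passes the source pixel
     through at unit gain on top of the 7-tap response. Only the first 7 of
     the 8 coefficients are used; tap 7 is asserted to be zero above. */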
  memcpy(&filter_x_tmp[0], filter_x, sizeof(*filter_x) * FILTER_BITS);
  memcpy(&filter_y_tmp[0], filter_y, sizeof(*filter_y) * FILTER_BITS);

  filter_x_tmp[3] += (1 << FILTER_BITS);
  filter_y_tmp[3] += (1 << FILTER_BITS);

  s_tmp = src - center_tap * src_stride - center_tap;
  dst_ptr = temp;
  src_ptr = s_tmp;
  height = intermediate_height;

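  /* Horizontal pass: filter intermediate_height rows of src into the 16-bit
     temp buffer, starting center_tap rows/columns before the block origin. */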
  /* if height is a multiple of 8 */
  if (!(h & 7)) {
    int16x8_t res0, res1, res2, res3;
    uint16x8_t res4;
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
#if defined(__aarch64__)
    uint16x8_t res5, res6, res7, res8, res9, res10, res11;
    uint8x8_t t8, t9, t10, t11, t12, t13, t14;

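    /* Process 8 rows per iteration: load and transpose 8x8 blocks so that
       each vector holds one column across 8 rows, then slide a 7-tap window
       over the transposed columns. */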
    do {
      const uint8_t *s;

      __builtin_prefetch(src_ptr + 0 * src_stride);
      __builtin_prefetch(src_ptr + 1 * src_stride);
      __builtin_prefetch(src_ptr + 2 * src_stride);
      __builtin_prefetch(src_ptr + 3 * src_stride);
      __builtin_prefetch(src_ptr + 4 * src_stride);
      __builtin_prefetch(src_ptr + 5 * src_stride);
      __builtin_prefetch(src_ptr + 6 * src_stride);
      __builtin_prefetch(src_ptr + 7 * src_stride);

      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      s = src_ptr + 7;
      d_tmp = dst_ptr;
      width = w;

      __builtin_prefetch(dst_ptr + 0 * dst_stride);
      __builtin_prefetch(dst_ptr + 1 * dst_stride);
      __builtin_prefetch(dst_ptr + 2 * dst_stride);
      __builtin_prefetch(dst_ptr + 3 * dst_stride);
      __builtin_prefetch(dst_ptr + 4 * dst_stride);
      __builtin_prefetch(dst_ptr + 5 * dst_stride);
      __builtin_prefetch(dst_ptr + 6 * dst_stride);
      __builtin_prefetch(dst_ptr + 7 * dst_stride);

      do {
        load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
        transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);

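        /* The Wiener taps are symmetric (tap k == tap 6 - k), so samples
           sharing a coefficient are pre-added and each 8-wide output needs
           only half the multiplies. */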
        res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t6));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t1, t7));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t6));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t3, t5));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t4));
        res5 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t2, t8));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t3, t7));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t6));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t5));
        res6 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t3, t9));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t4, t8));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t5, t7));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t6));
        res7 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t4, t10));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t5, t9));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t6, t8));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t7));
        res8 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t5, t11));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t6, t10));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t7, t9));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t8));
        res9 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t6, t12));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t7, t11));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t8, t10));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t9));
        res10 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                           bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t7, t13));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t8, t12));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t9, t11));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t10));
        res11 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                           bd, conv_params->round_0);

        transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10,
                          &res11);
        store_u16_8x8(d_tmp, MAX_SB_SIZE, res4, res5, res6, res7, res8, res9,
                      res10, res11);

        t0 = t8;
        t1 = t9;
        t2 = t10;
        t3 = t11;
        t4 = t12;
        t5 = t13;
        t6 = t14;
        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * MAX_SB_SIZE;
      height -= 8;
    } while (height > 0);
#else
    uint8x8_t temp_0;

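    /* On 32-bit ARM, filter one row per iteration; vext_u8 builds the seven
       shifted windows a1..a7 from two adjacent 8-pixel loads. */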
    do {
      const uint8_t *s;

      __builtin_prefetch(src_ptr);

      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
      s = src_ptr + 8;
      d_tmp = dst_ptr;
      width = w;

      __builtin_prefetch(dst_ptr);

      do {
        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
        temp_0 = t0;
        t0 = t7;

        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

        res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        vst1q_u16(d_tmp, res4);

        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += src_stride;
      dst_ptr += MAX_SB_SIZE;
      height--;
    } while (height > 0);
#endif
  } else {
    /* if height is a multiple of 4 */
    const uint8_t *s;
    int16x8_t tt0, tt1, tt2, tt3;
    uint16x8_t d0;
    uint8x8_t t0, t1, t2, t3;

#if defined(__aarch64__)
    uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7;
    uint16x8_t d1, d2, d3;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    int16x4_t s11, s12, s13, s14;
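    /* Process 4 rows per iteration: transpose 8x4 blocks so each 4-lane
       vector holds one column across 4 rows, then filter 4 columns at a
       time. */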
    do {
      __builtin_prefetch(src_ptr + 0 * src_stride);
      __builtin_prefetch(src_ptr + 1 * src_stride);
      __builtin_prefetch(src_ptr + 2 * src_stride);
      __builtin_prefetch(src_ptr + 3 * src_stride);

      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); /* 8x4 */
      transpose_u8_8x4(&t0, &t1, &t2,
                       &t3); /* first 8 pixels of 4 rows transposed -- 4x8 */

      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));

      s0 = vget_low_s16(tt0);  /* pa0 pb0 pc0 pd0 -- pixel_a0 */
      s1 = vget_low_s16(tt1);  /* pa1 pb1 pc1 pd1 */
      s2 = vget_low_s16(tt2);  /* pa2 pb2 pc2 pd2 */
      s3 = vget_low_s16(tt3);  /* pa3 pb3 pc3 pd3 */
      s4 = vget_high_s16(tt0); /* pa4 pb4 pc4 pd4 */
      s5 = vget_high_s16(tt1); /* pa5 pb5 pc5 pd5 */
      s6 = vget_high_s16(tt2); /* pa6 pb6 pc6 pd6 */

      __builtin_prefetch(dst_ptr + 0 * dst_stride);
      __builtin_prefetch(dst_ptr + 1 * dst_stride);
      __builtin_prefetch(dst_ptr + 2 * dst_stride);
      __builtin_prefetch(dst_ptr + 3 * dst_stride);

      s = src_ptr + 7;
      d_tmp = dst_ptr;
      width = w;

      do {
        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); /* 8x4 */
        transpose_u8_8x4(&t0, &t1, &t2, &t3);

        tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));

        s7 = vget_low_s16(tt0); /* pa7 pb7 pc7 pd7 */ /* 4x8 */
        s8 = vget_low_s16(tt1);   /* pa8 pb8 pc8 pd8 */
        s9 = vget_low_s16(tt2);   /* pa9 pb9 pc9 pd9 */
        s10 = vget_low_s16(tt3);  /* pa10 pb10 pc10 pd10 */
        s11 = vget_high_s16(tt0); /* pa11 pb11 pc11 pd11 */
        s12 = vget_high_s16(tt1); /* pa12 pb12 pc12 pd12 */
        s13 = vget_high_s16(tt2); /* pa13 pb13 pc13 pd13 */
        s14 = vget_high_s16(tt3); /* pa14 pb14 pc14 pd14 */

        res0 = wiener_convolve8_horiz_4x8(
            s0, s1, s2, s3, s4, s5, s6, filter_x_tmp, bd, conv_params->round_0);
        res1 = wiener_convolve8_horiz_4x8(
            s1, s2, s3, s4, s5, s6, s7, filter_x_tmp, bd, conv_params->round_0);
        res2 = wiener_convolve8_horiz_4x8(
            s2, s3, s4, s5, s6, s7, s8, filter_x_tmp, bd, conv_params->round_0);
        res3 = wiener_convolve8_horiz_4x8(
            s3, s4, s5, s6, s7, s8, s9, filter_x_tmp, bd, conv_params->round_0);
        res4 = wiener_convolve8_horiz_4x8(
            s4, s5, s6, s7, s8, s9, s10, filter_x_tmp, bd,
            conv_params->round_0);
        res5 = wiener_convolve8_horiz_4x8(
            s5, s6, s7, s8, s9, s10, s11, filter_x_tmp, bd,
            conv_params->round_0);
        res6 = wiener_convolve8_horiz_4x8(
            s6, s7, s8, s9, s10, s11, s12, filter_x_tmp, bd,
            conv_params->round_0);
        res7 = wiener_convolve8_horiz_4x8(
            s7, s8, s9, s10, s11, s12, s13, filter_x_tmp, bd,
            conv_params->round_0);

        transpose_u16_4x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
                          &res7, &d0, &d1, &d2, &d3);

        store_u16_8x4(d_tmp, MAX_SB_SIZE, d0, d1, d2, d3);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * MAX_SB_SIZE;
      height -= 4;
    } while (height > 0);
#else
    uint8x8_t temp_0, t4, t5, t6, t7;

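    /* As in the multiple-of-8 case, 32-bit ARM filters one row at a time
       using vext_u8 to build the shifted windows. */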
    do {
      __builtin_prefetch(src_ptr);

      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7

      __builtin_prefetch(dst_ptr);

      s = src_ptr + 8;
      d_tmp = dst_ptr;
      width = w;

      do {
        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
        temp_0 = t0;
        t0 = t7;

        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

        tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
        tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
        tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
        tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd,
                                        conv_params->round_0);

        vst1q_u16(d_tmp, d0);

        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);

      src_ptr += src_stride;
      dst_ptr += MAX_SB_SIZE;
      height -= 1;
    } while (height > 0);
#endif
  }

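  /* Vertical pass: filter the 16-bit intermediate in temp down 8-pixel-wide
     columns and write the final 8-bit pixels to dst. */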
  {
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
    uint8x8_t t0;
#if defined(__aarch64__)
    int16x8_t s8, s9, s10;
    uint8x8_t t1, t2, t3;
#endif
    int16_t *src_tmp_ptr, *s;
    uint8_t *dst_tmp_ptr;
    height = h;
    width = w;
    src_tmp_ptr = (int16_t *)temp;
    dst_tmp_ptr = dst;
    src_stride = MAX_SB_SIZE;

    do {
      s = src_tmp_ptr;
      s0 = vld1q_s16(s);
      s += src_stride;
      s1 = vld1q_s16(s);
      s += src_stride;
      s2 = vld1q_s16(s);
      s += src_stride;
      s3 = vld1q_s16(s);
      s += src_stride;
      s4 = vld1q_s16(s);
      s += src_stride;
      s5 = vld1q_s16(s);
      s += src_stride;
      s6 = vld1q_s16(s);
      s += src_stride;
      d = dst_tmp_ptr;
      height = h;

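      /* Compute 4 output rows per iteration from rows s0..s9 (s10 is
         preloaded for the next step), then slide the 7-row window down by
         4. */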
#if defined(__aarch64__)
      do {
        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
        __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
        __builtin_prefetch(dst_tmp_ptr + 2 * dst_stride);
        __builtin_prefetch(dst_tmp_ptr + 3 * dst_stride);

        s7 = vld1q_s16(s);
        s += src_stride;
        s8 = vld1q_s16(s);
        s += src_stride;
        s9 = vld1q_s16(s);
        s += src_stride;
        s10 = vld1q_s16(s);
        s += src_stride;

        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
                                       bd, conv_params->round_1);
        t1 = wiener_convolve8_vert_4x8(s1, s2, s3, s4, s5, s6, s7, filter_y_tmp,
                                       bd, conv_params->round_1);
        t2 = wiener_convolve8_vert_4x8(s2, s3, s4, s5, s6, s7, s8, filter_y_tmp,
                                       bd, conv_params->round_1);
        t3 = wiener_convolve8_vert_4x8(s3, s4, s5, s6, s7, s8, s9, filter_y_tmp,
                                       bd, conv_params->round_1);

        vst1_u8(d, t0);
        d += dst_stride;
        vst1_u8(d, t1);
        d += dst_stride;
        vst1_u8(d, t2);
        d += dst_stride;
        vst1_u8(d, t3);
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 3);

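      /* Filter any remaining (h % 4) rows one at a time. */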
      if (height != 0) {
        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
        __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);

        do {
          s7 = vld1q_s16(s);
          s += src_stride;

          t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6,
                                         filter_y_tmp, bd,
                                         conv_params->round_1);
          vst1_u8(d, t0);
          d += dst_stride;

          s0 = s1;
          s1 = s2;
          s2 = s3;
          s3 = s4;
          s4 = s5;
          s5 = s6;
          s6 = s7;
          height -= 1;
        } while (height > 0);
      }

      src_tmp_ptr += 8;
      dst_tmp_ptr += 8;

      w -= 8;
    } while (w > 0);
#else
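      /* 32-bit ARM: one output row per iteration. */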
      do {
        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);

        s7 = vld1q_s16(s);
        s += src_stride;

        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6,
                                       filter_y_tmp, bd, conv_params->round_1);

        vst1_u8(d, t0);
        d += dst_stride;

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        height -= 1;
      } while (height > 0);

      src_tmp_ptr += 8;
      dst_tmp_ptr += 8;

      w -= 8;
    } while (w > 0);
#endif
  }
}