/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/txfm_common.h"
#include "aom_ports/mem.h"
#include "av1/common/common.h"
#include "av1/common/arm/convolve_neon.h"
#include "av1/common/arm/mem_neon.h"
#include "av1/common/arm/transpose_neon.h"

/* Wiener filter 2D
   Apply horizontal filter and store in a temporary buffer. When applying
   vertical filter, overwrite the original pixel values.
 */
void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h,
                                      const ConvolveParams *conv_params) {
  uint16_t *d_tmp;
  uint8_t *d;
  const uint8_t *src_ptr, *s_tmp;
  uint16_t *dst_ptr;
  (void)x_step_q4;
  (void)y_step_q4;

  int width, height;
  const int bd = 8;
  const int intermediate_height = h + SUBPEL_TAPS - 1;
  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
  int16_t filter_x_tmp[7], filter_y_tmp[7];

  DECLARE_ALIGNED(16, uint16_t,
                  temp[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);

  assert(x_step_q4 == 16 && y_step_q4 == 16);
  assert(!(w % 8));

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  assert(filter_x[7] == 0);
  assert(filter_y[7] == 0);

  /* Assumption: the horizontal filtering output will not exceed 15 bits:
     ((bd) + 1 + FILTER_BITS - conv_params->round_0) <= 15
     16 - conv_params->round_0 <= 15  =>  (conv_params->round_0) >= 1
   */
  assert((conv_params->round_0) >= 1);

  memcpy(&filter_x_tmp[0], filter_x, sizeof(*filter_x) * FILTER_BITS);
  memcpy(&filter_y_tmp[0], filter_y, sizeof(*filter_y) * FILTER_BITS);
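
  /* This is an "add src" convolution: adding 1 << FILTER_BITS to the centre
     tap makes the multiply-accumulate include the source pixel itself. */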
  filter_x_tmp[3] += (1 << FILTER_BITS);
  filter_y_tmp[3] += (1 << FILTER_BITS);
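
  /* Start reading center_tap rows above and center_tap columns to the left
     of the first output pixel so the full 7-tap support is available. */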
  s_tmp = src - center_tap * src_stride - center_tap;
  dst_ptr = temp;
  src_ptr = s_tmp;
  height = intermediate_height;

  /* if height is a multiple of 8 */
  if (!(h & 7)) {
    int16x8_t res0, res1, res2, res3;
    uint16x8_t res4;
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
#if defined(__aarch64__)
    uint16x8_t res5, res6, res7, res8, res9, res10, res11;
    uint8x8_t t8, t9, t10, t11, t12, t13, t14;
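
    /* Process 8 rows at a time: the 8x8 transposes put the same horizontal
       offset of all 8 rows into one register, and because the Wiener filter
       is symmetric, pixels sharing a coefficient are pre-added pairwise
       (t0+t6, t1+t5, t2+t4) before the multiplies. */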
    do {
      const uint8_t *s;

      __builtin_prefetch(src_ptr + 0 * src_stride);
      __builtin_prefetch(src_ptr + 1 * src_stride);
      __builtin_prefetch(src_ptr + 2 * src_stride);
      __builtin_prefetch(src_ptr + 3 * src_stride);
      __builtin_prefetch(src_ptr + 4 * src_stride);
      __builtin_prefetch(src_ptr + 5 * src_stride);
      __builtin_prefetch(src_ptr + 6 * src_stride);
      __builtin_prefetch(src_ptr + 7 * src_stride);

      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);

      s = src_ptr + 7;
      d_tmp = dst_ptr;
      width = w;

      __builtin_prefetch(dst_ptr + 0 * dst_stride);
      __builtin_prefetch(dst_ptr + 1 * dst_stride);
      __builtin_prefetch(dst_ptr + 2 * dst_stride);
      __builtin_prefetch(dst_ptr + 3 * dst_stride);
      __builtin_prefetch(dst_ptr + 4 * dst_stride);
      __builtin_prefetch(dst_ptr + 5 * dst_stride);
      __builtin_prefetch(dst_ptr + 6 * dst_stride);
      __builtin_prefetch(dst_ptr + 7 * dst_stride);

      do {
        load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
        transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t6));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t1, t7));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t6));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t3, t5));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t4));
        res5 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t2, t8));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t3, t7));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t6));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t5));
        res6 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t3, t9));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t4, t8));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t5, t7));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t6));
        res7 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t4, t10));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t5, t9));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t6, t8));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t7));
        res8 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t5, t11));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t6, t10));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t7, t9));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t8));
        res9 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t6, t12));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t7, t11));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t8, t10));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t9));
        res10 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                           bd, conv_params->round_0);

        res0 = vreinterpretq_s16_u16(vaddl_u8(t7, t13));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t8, t12));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t9, t11));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t10));
        res11 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                           bd, conv_params->round_0);

        transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10,
                          &res11);
        store_u16_8x8(d_tmp, MAX_SB_SIZE, res4, res5, res6, res7, res8, res9,
                      res10, res11);

        t0 = t8;
        t1 = t9;
        t2 = t10;
        t3 = t11;
        t4 = t12;
        t5 = t13;
        t6 = t14;
        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += 8 * src_stride;
      dst_ptr += 8 * MAX_SB_SIZE;
      height -= 8;
    } while (height > 0);
#else
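    /* Non-AArch64 path: process one row per iteration, forming the shifted
       neighbour vectors with vext_u8 instead of a transpose. */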
    uint8x8_t temp_0;

    do {
      const uint8_t *s;

      __builtin_prefetch(src_ptr);

      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
      s = src_ptr + 8;
      d_tmp = dst_ptr;
      width = w;

      __builtin_prefetch(dst_ptr);

      do {
        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
        temp_0 = t0;
        t0 = t7;

        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

        res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
        res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
        res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
        res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
                                          bd, conv_params->round_0);

        vst1q_u16(d_tmp, res4);

        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);
      src_ptr += src_stride;
      dst_ptr += MAX_SB_SIZE;
      height--;
    } while (height > 0);
#endif
  } else {
    /* if height is a multiple of 4 */
    const uint8_t *s;
    int16x8_t tt0, tt1, tt2, tt3;
    uint16x8_t d0;
    uint8x8_t t0, t1, t2, t3;

#if defined(__aarch64__)
    uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7;
    uint16x8_t d1, d2, d3;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    int16x4_t s11, s12, s13, s14;
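    /* Process 4 rows at a time: each 8x4 load is transposed so that every
       4-lane vector holds one horizontal offset across the 4 rows. */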
    do {
      __builtin_prefetch(src_ptr + 0 * src_stride);
      __builtin_prefetch(src_ptr + 1 * src_stride);
      __builtin_prefetch(src_ptr + 2 * src_stride);
      __builtin_prefetch(src_ptr + 3 * src_stride);

      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
      transpose_u8_8x4(&t0, &t1, &t2,
                       &t3); /*first 8 pixels of 4 rows transposed-- 4x8*/

      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));

      s0 = vget_low_s16(tt0);  /*pa0 pb0 pc0 pd0 -- pixel_a0*/
      s1 = vget_low_s16(tt1);  /*pa1 pb1 pc1 pd1 */
      s2 = vget_low_s16(tt2);  /*pa2 pb2 pc2 pd2 */
      s3 = vget_low_s16(tt3);  /*pa3 pb3 pc3 pd3 */
      s4 = vget_high_s16(tt0); /*pa4 pb4 pc4 pd4 */
      s5 = vget_high_s16(tt1); /*pa5 pb5 pc5 pd5 */
      s6 = vget_high_s16(tt2); /*pa6 pb6 pc6 pd6 */

      __builtin_prefetch(dst_ptr + 0 * dst_stride);
      __builtin_prefetch(dst_ptr + 1 * dst_stride);
      __builtin_prefetch(dst_ptr + 2 * dst_stride);
      __builtin_prefetch(dst_ptr + 3 * dst_stride);

      s = src_ptr + 7;
      d_tmp = dst_ptr;
      width = w;

      do {
        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
        transpose_u8_8x4(&t0, &t1, &t2, &t3);

        tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));

        s7 = vget_low_s16(tt0); /*pa7  pb7  pc7  pd7  */ /*4x8*/
        s8 = vget_low_s16(tt1);   /*pa8  pb8  pc8  pd8  */
        s9 = vget_low_s16(tt2);   /*pa9  pb9  pc9  pd9  */
        s10 = vget_low_s16(tt3);  /*pa10 pb10 pc10 pd10 */
        s11 = vget_high_s16(tt0); /*pa11 pb11 pc11 pd11 */
        s12 = vget_high_s16(tt1); /*pa12 pb12 pc12 pd12 */
        s13 = vget_high_s16(tt2); /*pa13 pb13 pc13 pd13 */
        s14 = vget_high_s16(tt3); /*pa14 pb14 pc14 pd14 */

        res0 = wiener_convolve8_horiz_4x8(
            s0, s1, s2, s3, s4, s5, s6, filter_x_tmp, bd, conv_params->round_0);
        res1 = wiener_convolve8_horiz_4x8(
            s1, s2, s3, s4, s5, s6, s7, filter_x_tmp, bd, conv_params->round_0);
        res2 = wiener_convolve8_horiz_4x8(
            s2, s3, s4, s5, s6, s7, s8, filter_x_tmp, bd, conv_params->round_0);
        res3 = wiener_convolve8_horiz_4x8(
            s3, s4, s5, s6, s7, s8, s9, filter_x_tmp, bd, conv_params->round_0);
        res4 =
            wiener_convolve8_horiz_4x8(s4, s5, s6, s7, s8, s9, s10,
                                       filter_x_tmp, bd, conv_params->round_0);
        res5 =
            wiener_convolve8_horiz_4x8(s5, s6, s7, s8, s9, s10, s11,
                                       filter_x_tmp, bd, conv_params->round_0);
        res6 =
            wiener_convolve8_horiz_4x8(s6, s7, s8, s9, s10, s11, s12,
                                       filter_x_tmp, bd, conv_params->round_0);
        res7 =
            wiener_convolve8_horiz_4x8(s7, s8, s9, s10, s11, s12, s13,
                                       filter_x_tmp, bd, conv_params->round_0);

        transpose_u16_4x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
                          &res7, &d0, &d1, &d2, &d3);

        store_u16_8x4(d_tmp, MAX_SB_SIZE, d0, d1, d2, d3);

        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);

      src_ptr += 4 * src_stride;
      dst_ptr += 4 * MAX_SB_SIZE;
      height -= 4;
    } while (height > 0);
#else
    uint8x8_t temp_0, t4, t5, t6, t7;

    do {
      __builtin_prefetch(src_ptr);

      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7

      __builtin_prefetch(dst_ptr);

      s = src_ptr + 8;
      d_tmp = dst_ptr;
      width = w;

      do {
        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
        temp_0 = t0;
        t0 = t7;

        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14

        tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
        tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
        tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
        tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd,
                                        conv_params->round_0);

        vst1q_u16(d_tmp, d0);

        s += 8;
        d_tmp += 8;
        width -= 8;
      } while (width > 0);

      src_ptr += src_stride;
      dst_ptr += MAX_SB_SIZE;
      height -= 1;
    } while (height > 0);
#endif
  }

  {
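    /* Vertical pass: filter the 16-bit intermediate rows in temp
       (row stride MAX_SB_SIZE) down to 8-bit output in dst, working on
       8-pixel-wide columns at a time. */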
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
    uint8x8_t t0;
#if defined(__aarch64__)
    int16x8_t s8, s9, s10;
    uint8x8_t t1, t2, t3;
#endif
    int16_t *src_tmp_ptr, *s;
    uint8_t *dst_tmp_ptr;
    height = h;
    width = w;
    src_tmp_ptr = (int16_t *)temp;
    dst_tmp_ptr = dst;
    src_stride = MAX_SB_SIZE;

    do {
      s = src_tmp_ptr;
      s0 = vld1q_s16(s);
      s += src_stride;
      s1 = vld1q_s16(s);
      s += src_stride;
      s2 = vld1q_s16(s);
      s += src_stride;
      s3 = vld1q_s16(s);
      s += src_stride;
      s4 = vld1q_s16(s);
      s += src_stride;
      s5 = vld1q_s16(s);
      s += src_stride;
      s6 = vld1q_s16(s);
      s += src_stride;
      d = dst_tmp_ptr;
      height = h;

#if defined(__aarch64__)
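      /* Produce 4 output rows per iteration, sliding the window of 7
         intermediate rows down by 4 rows each time. */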
      do {
        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
        __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
        __builtin_prefetch(dst_tmp_ptr + 2 * dst_stride);
        __builtin_prefetch(dst_tmp_ptr + 3 * dst_stride);

        s7 = vld1q_s16(s);
        s += src_stride;
        s8 = vld1q_s16(s);
        s += src_stride;
        s9 = vld1q_s16(s);
        s += src_stride;
        s10 = vld1q_s16(s);
        s += src_stride;

        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
                                       bd, conv_params->round_1);
        t1 = wiener_convolve8_vert_4x8(s1, s2, s3, s4, s5, s6, s7, filter_y_tmp,
                                       bd, conv_params->round_1);
        t2 = wiener_convolve8_vert_4x8(s2, s3, s4, s5, s6, s7, s8, filter_y_tmp,
                                       bd, conv_params->round_1);
        t3 = wiener_convolve8_vert_4x8(s3, s4, s5, s6, s7, s8, s9, filter_y_tmp,
                                       bd, conv_params->round_1);

        vst1_u8(d, t0);
        d += dst_stride;
        vst1_u8(d, t1);
        d += dst_stride;
        vst1_u8(d, t2);
        d += dst_stride;
        vst1_u8(d, t3);
        d += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
      } while (height > 3);

      if (height != 0) {
        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
        __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);

        do {
          s7 = vld1q_s16(s);
          s += src_stride;

          t0 =
              wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6,
                                        filter_y_tmp, bd, conv_params->round_1);
          vst1_u8(d, t0);
          d += dst_stride;

          s0 = s1;
          s1 = s2;
          s2 = s3;
          s3 = s4;
          s4 = s5;
          s5 = s6;
          s6 = s7;
          height -= 1;
        } while (height > 0);
      }

      src_tmp_ptr += 8;
      dst_tmp_ptr += 8;

      w -= 8;
    } while (w > 0);
#else
      do {
        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);

        s7 = vld1q_s16(s);
        s += src_stride;

        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
                                       bd, conv_params->round_1);

        vst1_u8(d, t0);
        d += dst_stride;

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;
        s5 = s6;
        s6 = s7;
        height -= 1;
      } while (height > 0);

      src_tmp_ptr += 8;
      dst_tmp_ptr += 8;

      w -= 8;
    } while (w > 0);
#endif
  }
}