1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12 #include <string.h>
13 #include "./vpx_config.h"
14 #include "./vp8_rtcd.h"
15 #include "vpx_dsp/arm/mem_neon.h"
16 #include "vpx_ports/mem.h"
17
18 static const int8_t vp8_sub_pel_filters[8][8] = {
19 { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */
20 { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */
21 { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
22 { 0, -9, 93, 50, -6, 0, 0, 0 },
23 { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
24 { 0, -6, 50, 93, -9, 0, 0, 0 },
25 { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
26 { 0, -1, 12, 123, -6, 0, 0, 0 },
27 };
28
29 // This table is derived from vp8/common/filter.c:vp8_sub_pel_filters.
30 // Apply abs() to all the values. Elements 0, 2, 3, and 5 are always positive.
31 // Elements 1 and 4 are either 0 or negative. The code accounts for this with
32 // multiply/accumulates which either add or subtract as needed. The other
33 // functions will be updated to use this table later.
34 // It is also expanded to 8 elements to allow loading into 64 bit neon
35 // registers.
36 static const uint8_t abs_filters[8][8] = {
37 { 0, 0, 128, 0, 0, 0, 0, 0 }, { 0, 6, 123, 12, 1, 0, 0, 0 },
38 { 2, 11, 108, 36, 8, 1, 0, 0 }, { 0, 9, 93, 50, 6, 0, 0, 0 },
39 { 3, 16, 77, 77, 16, 3, 0, 0 }, { 0, 6, 50, 93, 9, 0, 0, 0 },
40 { 1, 8, 36, 108, 11, 2, 0, 0 }, { 0, 1, 12, 123, 6, 0, 0, 0 },
41 };
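
// A minimal scalar sketch (illustrative only; it is not called by the NEON
// paths below): it shows how the unsigned abs_filters[] values reproduce the
// signed vp8_sub_pel_filters[] result when taps 1 and 4 are subtracted rather
// than added, mirroring the vmlal/vmlsl pairing used throughout this file.
// The caller must provide 2 pixels of context before src and 3 after it.
static INLINE int sixtap_reference_pixel(const unsigned char *src,
                                         int filter_offset) {
  const uint8_t *f = abs_filters[filter_offset];
  int sum = f[0] * src[-2] - f[1] * src[-1] + f[2] * src[0] + f[3] * src[1] -
            f[4] * src[2] + f[5] * src[3];
  // Round, shift by the 7-bit filter precision and clamp to 8 bits, matching
  // vqrshrun_n_s16(..., 7).
  sum = (sum + 64) >> 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return sum;
}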
42
43 static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
44 return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
45 }
46
47 static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
48 const uint8x8_t filter, uint16x8_t *c,
49 uint16x8_t *d) {
50 const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
51 vreinterpret_u32_u8(vget_high_u8(a)));
52 const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
53 vreinterpret_u32_u8(vget_high_u8(b)));
54 *c = vmlal_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
55 *d = vmlal_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
56 }
57
58 static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b,
59 const uint8x8_t filter, uint16x8_t *c,
60 uint16x8_t *d) {
61 const uint32x2x2_t a_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a)),
62 vreinterpret_u32_u8(vget_high_u8(a)));
63 const uint32x2x2_t b_shuf = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b)),
64 vreinterpret_u32_u8(vget_high_u8(b)));
65 *c = vmlsl_u8(*c, vreinterpret_u8_u32(a_shuf.val[0]), filter);
66 *d = vmlsl_u8(*d, vreinterpret_u8_u32(b_shuf.val[0]), filter);
67 }
68
69 static INLINE void yonly4x4(const unsigned char *src, int src_stride,
70 int filter_offset, unsigned char *dst,
71 int dst_stride) {
72 uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
73 uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
74 uint16x8_t c0, c1, c2, c3;
75 int16x8_t d0, d1;
76 uint8x8_t e0, e1;
77
78 const uint8x8_t filter = vld1_u8(abs_filters[filter_offset]);
79 const uint8x8_t filter0 = vdup_lane_u8(filter, 0);
80 const uint8x8_t filter1 = vdup_lane_u8(filter, 1);
81 const uint8x8_t filter2 = vdup_lane_u8(filter, 2);
82 const uint8x8_t filter3 = vdup_lane_u8(filter, 3);
83 const uint8x8_t filter4 = vdup_lane_u8(filter, 4);
84 const uint8x8_t filter5 = vdup_lane_u8(filter, 5);
85
86 src -= src_stride * 2;
87 // Shift the even rows to allow using 'vext' to combine the vectors. armv8
88 // has vcopy_lane which would be interesting. This started as just a
89 // horrible workaround for clang adding alignment hints to 32bit loads:
90 // https://llvm.org/bugs/show_bug.cgi?id=24421
91 // But it turns out it is almost identical to casting the loads.
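  // Illustrative layout (assuming the row bytes are r0..r7): vld1_u8 gives
  // [r0 r1 r2 r3 r4 r5 r6 r7]; the 64-bit left shift by 32 moves the low four
  // bytes up, producing [0 0 0 0 r0 r1 r2 r3], so a later
  // vext_u8(even, odd, 4) yields [even r0..r3 | odd r0..r3].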
92 a0 = load_and_shift(src);
93 src += src_stride;
94 a1 = vld1_u8(src);
95 src += src_stride;
96 a2 = load_and_shift(src);
97 src += src_stride;
98 a3 = vld1_u8(src);
99 src += src_stride;
100 a4 = load_and_shift(src);
101 src += src_stride;
102 a5 = vld1_u8(src);
103 src += src_stride;
104 a6 = load_and_shift(src);
105 src += src_stride;
106 a7 = vld1_u8(src);
107 src += src_stride;
108 a8 = vld1_u8(src);
109
110 // Combine the rows so we can operate on 8 at a time.
111 b0 = vext_u8(a0, a1, 4);
112 b2 = vext_u8(a2, a3, 4);
113 b4 = vext_u8(a4, a5, 4);
114 b6 = vext_u8(a6, a7, 4);
115 b8 = a8;
116
117 // To keep with the 8-at-a-time theme, combine *alternate* rows. This
118 // allows combining the odd rows with the even.
119 b1 = vext_u8(b0, b2, 4);
120 b3 = vext_u8(b2, b4, 4);
121 b5 = vext_u8(b4, b6, 4);
122 b7 = vext_u8(b6, b8, 4);
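  // Resulting row pairs (4 pixels each): b0 = rows 0|1, b2 = rows 2|3,
  // b4 = rows 4|5, b6 = rows 6|7, b8 = row 8, and the offset pairs
  // b1 = rows 1|2, b3 = rows 3|4, b5 = rows 5|6, b7 = rows 7|8.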
123
124 // Multiply and expand to 16 bits.
125 c0 = vmull_u8(b0, filter0);
126 c1 = vmull_u8(b2, filter0);
127 c2 = vmull_u8(b5, filter5);
128 c3 = vmull_u8(b7, filter5);
129
130 // Multiply, subtract and accumulate for filters 1 and 4 (the negative
131 // ones).
132 c0 = vmlsl_u8(c0, b4, filter4);
133 c1 = vmlsl_u8(c1, b6, filter4);
134 c2 = vmlsl_u8(c2, b1, filter1);
135 c3 = vmlsl_u8(c3, b3, filter1);
136
137 // Add more positive ones. vmlal should really return a signed type.
138 // It's doing signed math internally, as evidenced by the fact we can do
139 // subtractions followed by more additions. Ideally we could use
140 // vqmlal/sl but that instruction doesn't exist. Might be able to
141 // shoehorn vqdmlal/vqdmlsl in here but it would take some effort.
142 c0 = vmlal_u8(c0, b2, filter2);
143 c1 = vmlal_u8(c1, b4, filter2);
144 c2 = vmlal_u8(c2, b3, filter3);
145 c3 = vmlal_u8(c3, b5, filter3);
146
147 // Use signed saturation math because vmlsl may have left some negative
148 // numbers in there.
149 d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
150 d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
151
152 // Use signed again because numbers like -200 need to be saturated to 0.
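  // vqrshrun_n_s16(x, 7) computes clamp((x + 64) >> 7, 0, 255). Worked
  // examples: x = -200 -> 0, x = 16383 -> 128, x = 32767 -> 255 (saturated).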
153 e0 = vqrshrun_n_s16(d0, 7);
154 e1 = vqrshrun_n_s16(d1, 7);
155
156 store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1));
157 }
158
159 void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
160 int xoffset, int yoffset,
161 unsigned char *dst_ptr, int dst_pitch) {
162 uint8x16_t s0, s1, s2, s3, s4;
163 uint64x2_t s01, s23;
164 // Variables to hold src[] elements for the given filter[]
165 uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
166 uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
167 uint8x16_t s01_f0, s23_f0;
168 uint64x2_t s01_f3, s23_f3;
169 uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
170 // Accumulator variables.
171 uint16x8_t d0123, d4567, d89;
172 uint16x8_t d0123_a, d4567_a, d89_a;
173 int16x8_t e0123, e4567, e89;
174 // Second pass intermediates.
175 uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
176 uint16x8_t c0, c1, c2, c3;
177 int16x8_t d0, d1;
178 uint8x8_t e0, e1;
179 uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;
180
181 if (xoffset == 0) { // Second pass only.
182 yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
183 return;
184 }
185
186 if (yoffset == 0) { // First pass only.
187 src_ptr -= 2;
188 } else { // Add context for the second pass. 2 extra lines on top.
189 src_ptr -= 2 + (src_pixels_per_line * 2);
190 }
191
192 filter = vld1_u8(abs_filters[xoffset]);
193 filter0 = vdup_lane_u8(filter, 0);
194 filter1 = vdup_lane_u8(filter, 1);
195 filter2 = vdup_lane_u8(filter, 2);
196 filter3 = vdup_lane_u8(filter, 3);
197 filter4 = vdup_lane_u8(filter, 4);
198 filter5 = vdup_lane_u8(filter, 5);
199
200 // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
201 // garbage. So much effort for that last single bit.
202 // The low values of each pair are for filter0.
203 s0 = vld1q_u8(src_ptr);
204 src_ptr += src_pixels_per_line;
205 s1 = vld1q_u8(src_ptr);
206 src_ptr += src_pixels_per_line;
207 s2 = vld1q_u8(src_ptr);
208 src_ptr += src_pixels_per_line;
209 s3 = vld1q_u8(src_ptr);
210 src_ptr += src_pixels_per_line;
211
212 // Shift to extract values for filter[5]
213 // If src[] is indexed from 0, this puts:
214 // 3 4 5 6 7 8 9 10 in s0_f5
215 // Can't use vshr.u64 because it crosses the double word boundary.
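  // Illustrative 16-byte load layout (positions relative to the original,
  // un-adjusted src_ptr): 2 bytes of left context (-2, -1), the 4 output
  // positions (0..3), 3 bytes of right context (4..6), then garbage (7..13).
  // vext by 5 leaves src[3..6] in the low half of s*_f5, which is exactly
  // what filter tap 5 needs for output pixels 0..3.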
216 s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
217 s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
218 s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
219 s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
220
221 s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
222 s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
223
224 s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
225 s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
226 d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
227 d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
228
229 // Keep original src data as 64 bits to simplify shifting and extracting.
230 s01 = vreinterpretq_u64_u8(s01_f0);
231 s23 = vreinterpretq_u64_u8(s23_f0);
232
233 // 3 4 5 6 * filter0
234 filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
235
236 // Shift over one to use -1, 0, 1, 2 for filter1
237 // -1 0 1 2 * filter1
238 filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
239 vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
240 &d0123, &d4567);
241
242 // 2 3 4 5 * filter4
243 filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
244 vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
245 &d0123, &d4567);
246
247 // 0 1 2 3 * filter2
248 filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
249 vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
250 &d0123, &d4567);
251
252 // 1 2 3 4 * filter3
253 s01_f3 = vshrq_n_u64(s01, 24);
254 s23_f3 = vshrq_n_u64(s23, 24);
255 s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
256 vreinterpret_u32_u64(vget_high_u64(s01_f3)));
257 s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
258 vreinterpret_u32_u64(vget_high_u64(s23_f3)));
259 // Accumulate into different registers so it can use saturated addition.
260 d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
261 d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
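  // Illustrative headroom check: for xoffset == 2 the positive taps are
  // {2, 108, 36, 1}. Putting all of them in one accumulator could reach
  // (2 + 108 + 36 + 1) * 255 = 37485, which does not fit in an int16 once the
  // register is reinterpreted as signed. Keeping filter3 separate bounds the
  // two accumulators at 28305 and 9180, and vqaddq_s16 below combines them
  // with saturation.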
262
263 e0123 =
264 vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
265 e4567 =
266 vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
267
268 // Shift and narrow.
269 b0 = vqrshrun_n_s16(e0123, 7);
270 b2 = vqrshrun_n_s16(e4567, 7);
271
272 if (yoffset == 0) { // firstpass_filter4x4_only
273 store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
274 return;
275 }
276
277 // Load additional context when doing both filters.
278 s0 = vld1q_u8(src_ptr);
279 src_ptr += src_pixels_per_line;
280 s1 = vld1q_u8(src_ptr);
281 src_ptr += src_pixels_per_line;
282 s2 = vld1q_u8(src_ptr);
283 src_ptr += src_pixels_per_line;
284 s3 = vld1q_u8(src_ptr);
285 src_ptr += src_pixels_per_line;
286 s4 = vld1q_u8(src_ptr);
287
288 s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
289 s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
290 s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
291 s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
292 s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);
293
294 // 3 4 5 6 * filter0
295 s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
296 s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
297
298 s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
299 s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
300 // But this time instead of 16 pixels to filter, there are 20. So an extra
301 // run with a doubleword register.
302 d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
303 d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
304 d89 = vmull_u8(s4_f5, filter5);
305
306 // Save a copy as u64 for shifting.
307 s01 = vreinterpretq_u64_u8(s01_f0);
308 s23 = vreinterpretq_u64_u8(s23_f0);
309
310 filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
311 d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);
312
313 filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
314 vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
315 &d0123, &d4567);
316 s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
317 d89 = vmlsl_u8(d89, s4_f1, filter1);
318
319 filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
320 vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
321 &d0123, &d4567);
322 s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
323 d89 = vmlsl_u8(d89, s4_f4, filter4);
324
325 filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
326 vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
327 &d0123, &d4567);
328 s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
329 d89 = vmlal_u8(d89, s4_f2, filter2);
330
331 s01_f3 = vshrq_n_u64(s01, 24);
332 s23_f3 = vshrq_n_u64(s23, 24);
333 s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
334 vreinterpret_u32_u64(vget_high_u64(s01_f3)));
335 s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
336 vreinterpret_u32_u64(vget_high_u64(s23_f3)));
337 s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
338 d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
339 d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
340 d89_a = vmull_u8(s4_f3, filter3);
341
342 e0123 =
343 vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
344 e4567 =
345 vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
346 e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));
347
348 b4 = vqrshrun_n_s16(e0123, 7);
349 b6 = vqrshrun_n_s16(e4567, 7);
350 b8 = vqrshrun_n_s16(e89, 7);
351
352 // Second pass: 4x4
353 filter = vld1_u8(abs_filters[yoffset]);
354 filter0 = vdup_lane_u8(filter, 0);
355 filter1 = vdup_lane_u8(filter, 1);
356 filter2 = vdup_lane_u8(filter, 2);
357 filter3 = vdup_lane_u8(filter, 3);
358 filter4 = vdup_lane_u8(filter, 4);
359 filter5 = vdup_lane_u8(filter, 5);
360
361 b1 = vext_u8(b0, b2, 4);
362 b3 = vext_u8(b2, b4, 4);
363 b5 = vext_u8(b4, b6, 4);
364 b7 = vext_u8(b6, b8, 4);
365
366 c0 = vmull_u8(b0, filter0);
367 c1 = vmull_u8(b2, filter0);
368 c2 = vmull_u8(b5, filter5);
369 c3 = vmull_u8(b7, filter5);
370
371 c0 = vmlsl_u8(c0, b4, filter4);
372 c1 = vmlsl_u8(c1, b6, filter4);
373 c2 = vmlsl_u8(c2, b1, filter1);
374 c3 = vmlsl_u8(c3, b3, filter1);
375
376 c0 = vmlal_u8(c0, b2, filter2);
377 c1 = vmlal_u8(c1, b4, filter2);
378 c2 = vmlal_u8(c2, b3, filter3);
379 c3 = vmlal_u8(c3, b5, filter3);
380
381 d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
382 d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
383
384 e0 = vqrshrun_n_s16(d0, 7);
385 e1 = vqrshrun_n_s16(d1, 7);
386
387 store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
388 }
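
// A minimal usage sketch (illustrative; the buffer sizes and offsets are
// assumptions, not part of the API): the 4x4 predictor reads up to 2 rows and
// 2 columns of context before src_ptr and several after it (a full 16-byte
// load per row), so the block being predicted is placed at row 2, column 2 of
// a 16x16 source buffer. xoffset and yoffset index abs_filters[] and must be
// in [0, 7].
static INLINE void sixtap_predict4x4_usage(void) {
  unsigned char src[16 * 16];
  unsigned char dst[4 * 4];
  memset(src, 128, sizeof(src));
  memset(dst, 0, sizeof(dst));
  vp8_sixtap_predict4x4_neon(src + 2 * 16 + 2, 16 /* src stride */,
                             2 /* xoffset */, 4 /* yoffset */, dst,
                             4 /* dst pitch */);
}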
389
390 void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
391 int xoffset, int yoffset,
392 unsigned char *dst_ptr, int dst_pitch) {
393 unsigned char *src;
394 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
395 uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
396 uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
397 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
398 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
399 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
400 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
401 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
402 uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
403
404 if (xoffset == 0) { // secondpass_filter8x4_only
405 // load second_pass filter
406 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
407 d0s8 = vdup_lane_s8(dtmps8, 0);
408 d1s8 = vdup_lane_s8(dtmps8, 1);
409 d2s8 = vdup_lane_s8(dtmps8, 2);
410 d3s8 = vdup_lane_s8(dtmps8, 3);
411 d4s8 = vdup_lane_s8(dtmps8, 4);
412 d5s8 = vdup_lane_s8(dtmps8, 5);
413 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
414 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
415 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
416 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
417 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
418 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
419
420 // load src data
421 src = src_ptr - src_pixels_per_line * 2;
422 d22u8 = vld1_u8(src);
423 src += src_pixels_per_line;
424 d23u8 = vld1_u8(src);
425 src += src_pixels_per_line;
426 d24u8 = vld1_u8(src);
427 src += src_pixels_per_line;
428 d25u8 = vld1_u8(src);
429 src += src_pixels_per_line;
430 d26u8 = vld1_u8(src);
431 src += src_pixels_per_line;
432 d27u8 = vld1_u8(src);
433 src += src_pixels_per_line;
434 d28u8 = vld1_u8(src);
435 src += src_pixels_per_line;
436 d29u8 = vld1_u8(src);
437 src += src_pixels_per_line;
438 d30u8 = vld1_u8(src);
439
440 q3u16 = vmull_u8(d22u8, d0u8);
441 q4u16 = vmull_u8(d23u8, d0u8);
442 q5u16 = vmull_u8(d24u8, d0u8);
443 q6u16 = vmull_u8(d25u8, d0u8);
444
445 q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
446 q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
447 q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
448 q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
449
450 q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
451 q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
452 q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
453 q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
454
455 q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
456 q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
457 q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
458 q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
459
460 q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
461 q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
462 q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
463 q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
464
465 q7u16 = vmull_u8(d25u8, d3u8);
466 q8u16 = vmull_u8(d26u8, d3u8);
467 q9u16 = vmull_u8(d27u8, d3u8);
468 q10u16 = vmull_u8(d28u8, d3u8);
469
470 q3s16 = vreinterpretq_s16_u16(q3u16);
471 q4s16 = vreinterpretq_s16_u16(q4u16);
472 q5s16 = vreinterpretq_s16_u16(q5u16);
473 q6s16 = vreinterpretq_s16_u16(q6u16);
474 q7s16 = vreinterpretq_s16_u16(q7u16);
475 q8s16 = vreinterpretq_s16_u16(q8u16);
476 q9s16 = vreinterpretq_s16_u16(q9u16);
477 q10s16 = vreinterpretq_s16_u16(q10u16);
478
479 q7s16 = vqaddq_s16(q7s16, q3s16);
480 q8s16 = vqaddq_s16(q8s16, q4s16);
481 q9s16 = vqaddq_s16(q9s16, q5s16);
482 q10s16 = vqaddq_s16(q10s16, q6s16);
483
484 d6u8 = vqrshrun_n_s16(q7s16, 7);
485 d7u8 = vqrshrun_n_s16(q8s16, 7);
486 d8u8 = vqrshrun_n_s16(q9s16, 7);
487 d9u8 = vqrshrun_n_s16(q10s16, 7);
488
489 vst1_u8(dst_ptr, d6u8);
490 dst_ptr += dst_pitch;
491 vst1_u8(dst_ptr, d7u8);
492 dst_ptr += dst_pitch;
493 vst1_u8(dst_ptr, d8u8);
494 dst_ptr += dst_pitch;
495 vst1_u8(dst_ptr, d9u8);
496 return;
497 }
498
499 // load first_pass filter
500 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
501 d0s8 = vdup_lane_s8(dtmps8, 0);
502 d1s8 = vdup_lane_s8(dtmps8, 1);
503 d2s8 = vdup_lane_s8(dtmps8, 2);
504 d3s8 = vdup_lane_s8(dtmps8, 3);
505 d4s8 = vdup_lane_s8(dtmps8, 4);
506 d5s8 = vdup_lane_s8(dtmps8, 5);
507 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
508 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
509 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
510 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
511 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
512 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
513
514 // First pass: output_height lines x output_width columns (9x8)
515 if (yoffset == 0) // firstpass_filter8x4_only
516 src = src_ptr - 2;
517 else
518 src = src_ptr - 2 - (src_pixels_per_line * 2);
519 q3u8 = vld1q_u8(src);
520 src += src_pixels_per_line;
521 q4u8 = vld1q_u8(src);
522 src += src_pixels_per_line;
523 q5u8 = vld1q_u8(src);
524 src += src_pixels_per_line;
525 q6u8 = vld1q_u8(src);
526
527 q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
528 q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
529 q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
530 q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
531
532 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
533 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
534 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
535 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
536
537 q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
538 q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
539 q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
540 q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
541
542 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
543 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
544 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
545 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
546
547 q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
548 q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
549 q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
550 q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
551
552 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
553 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
554 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
555 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
556
557 q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
558 q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
559 q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
560 q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
561
562 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
563 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
564 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
565 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
566
567 q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
568 q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
569 q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
570 q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
571
572 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
573 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
574 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
575 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
576
577 q3u16 = vmull_u8(d28u8, d3u8);
578 q4u16 = vmull_u8(d29u8, d3u8);
579 q5u16 = vmull_u8(d30u8, d3u8);
580 q6u16 = vmull_u8(d31u8, d3u8);
581
582 q3s16 = vreinterpretq_s16_u16(q3u16);
583 q4s16 = vreinterpretq_s16_u16(q4u16);
584 q5s16 = vreinterpretq_s16_u16(q5u16);
585 q6s16 = vreinterpretq_s16_u16(q6u16);
586 q7s16 = vreinterpretq_s16_u16(q7u16);
587 q8s16 = vreinterpretq_s16_u16(q8u16);
588 q9s16 = vreinterpretq_s16_u16(q9u16);
589 q10s16 = vreinterpretq_s16_u16(q10u16);
590
591 q7s16 = vqaddq_s16(q7s16, q3s16);
592 q8s16 = vqaddq_s16(q8s16, q4s16);
593 q9s16 = vqaddq_s16(q9s16, q5s16);
594 q10s16 = vqaddq_s16(q10s16, q6s16);
595
596 d22u8 = vqrshrun_n_s16(q7s16, 7);
597 d23u8 = vqrshrun_n_s16(q8s16, 7);
598 d24u8 = vqrshrun_n_s16(q9s16, 7);
599 d25u8 = vqrshrun_n_s16(q10s16, 7);
600
601 if (yoffset == 0) { // firstpass_filter8x4_only
602 vst1_u8(dst_ptr, d22u8);
603 dst_ptr += dst_pitch;
604 vst1_u8(dst_ptr, d23u8);
605 dst_ptr += dst_pitch;
606 vst1_u8(dst_ptr, d24u8);
607 dst_ptr += dst_pitch;
608 vst1_u8(dst_ptr, d25u8);
609 return;
610 }
611
612 // First pass on the remaining 5 lines of data
613 src += src_pixels_per_line;
614 q3u8 = vld1q_u8(src);
615 src += src_pixels_per_line;
616 q4u8 = vld1q_u8(src);
617 src += src_pixels_per_line;
618 q5u8 = vld1q_u8(src);
619 src += src_pixels_per_line;
620 q6u8 = vld1q_u8(src);
621 src += src_pixels_per_line;
622 q7u8 = vld1q_u8(src);
623
624 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
625 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
626 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
627 q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
628 q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
629
630 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
631 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
632 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
633 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
634 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
635
636 q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
637 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
638 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
639 q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
640 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
641
642 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
643 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
644 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
645 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
646 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
647
648 q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
649 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
650 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
651 q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
652 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
653
654 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
655 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
656 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
657 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
658 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
659
660 q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
661 q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
662 q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
663 q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
664 q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
665
666 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
667 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
668 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
669 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
670 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
671
672 q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
673 q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
674 q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
675 q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
676 q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
677
678 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
679 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
680 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
681 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
682 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
683
684 q3u16 = vmull_u8(d27u8, d3u8);
685 q4u16 = vmull_u8(d28u8, d3u8);
686 q5u16 = vmull_u8(d29u8, d3u8);
687 q6u16 = vmull_u8(d30u8, d3u8);
688 q7u16 = vmull_u8(d31u8, d3u8);
689
690 q3s16 = vreinterpretq_s16_u16(q3u16);
691 q4s16 = vreinterpretq_s16_u16(q4u16);
692 q5s16 = vreinterpretq_s16_u16(q5u16);
693 q6s16 = vreinterpretq_s16_u16(q6u16);
694 q7s16 = vreinterpretq_s16_u16(q7u16);
695 q8s16 = vreinterpretq_s16_u16(q8u16);
696 q9s16 = vreinterpretq_s16_u16(q9u16);
697 q10s16 = vreinterpretq_s16_u16(q10u16);
698 q11s16 = vreinterpretq_s16_u16(q11u16);
699 q12s16 = vreinterpretq_s16_u16(q12u16);
700
701 q8s16 = vqaddq_s16(q8s16, q3s16);
702 q9s16 = vqaddq_s16(q9s16, q4s16);
703 q10s16 = vqaddq_s16(q10s16, q5s16);
704 q11s16 = vqaddq_s16(q11s16, q6s16);
705 q12s16 = vqaddq_s16(q12s16, q7s16);
706
707 d26u8 = vqrshrun_n_s16(q8s16, 7);
708 d27u8 = vqrshrun_n_s16(q9s16, 7);
709 d28u8 = vqrshrun_n_s16(q10s16, 7);
710 d29u8 = vqrshrun_n_s16(q11s16, 7);
711 d30u8 = vqrshrun_n_s16(q12s16, 7);
712
713 // Second pass: 8x4
714 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
715 d0s8 = vdup_lane_s8(dtmps8, 0);
716 d1s8 = vdup_lane_s8(dtmps8, 1);
717 d2s8 = vdup_lane_s8(dtmps8, 2);
718 d3s8 = vdup_lane_s8(dtmps8, 3);
719 d4s8 = vdup_lane_s8(dtmps8, 4);
720 d5s8 = vdup_lane_s8(dtmps8, 5);
721 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
722 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
723 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
724 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
725 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
726 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
727
728 q3u16 = vmull_u8(d22u8, d0u8);
729 q4u16 = vmull_u8(d23u8, d0u8);
730 q5u16 = vmull_u8(d24u8, d0u8);
731 q6u16 = vmull_u8(d25u8, d0u8);
732
733 q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
734 q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
735 q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
736 q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
737
738 q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
739 q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
740 q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
741 q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
742
743 q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
744 q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
745 q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
746 q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
747
748 q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
749 q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
750 q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
751 q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
752
753 q7u16 = vmull_u8(d25u8, d3u8);
754 q8u16 = vmull_u8(d26u8, d3u8);
755 q9u16 = vmull_u8(d27u8, d3u8);
756 q10u16 = vmull_u8(d28u8, d3u8);
757
758 q3s16 = vreinterpretq_s16_u16(q3u16);
759 q4s16 = vreinterpretq_s16_u16(q4u16);
760 q5s16 = vreinterpretq_s16_u16(q5u16);
761 q6s16 = vreinterpretq_s16_u16(q6u16);
762 q7s16 = vreinterpretq_s16_u16(q7u16);
763 q8s16 = vreinterpretq_s16_u16(q8u16);
764 q9s16 = vreinterpretq_s16_u16(q9u16);
765 q10s16 = vreinterpretq_s16_u16(q10u16);
766
767 q7s16 = vqaddq_s16(q7s16, q3s16);
768 q8s16 = vqaddq_s16(q8s16, q4s16);
769 q9s16 = vqaddq_s16(q9s16, q5s16);
770 q10s16 = vqaddq_s16(q10s16, q6s16);
771
772 d6u8 = vqrshrun_n_s16(q7s16, 7);
773 d7u8 = vqrshrun_n_s16(q8s16, 7);
774 d8u8 = vqrshrun_n_s16(q9s16, 7);
775 d9u8 = vqrshrun_n_s16(q10s16, 7);
776
777 vst1_u8(dst_ptr, d6u8);
778 dst_ptr += dst_pitch;
779 vst1_u8(dst_ptr, d7u8);
780 dst_ptr += dst_pitch;
781 vst1_u8(dst_ptr, d8u8);
782 dst_ptr += dst_pitch;
783 vst1_u8(dst_ptr, d9u8);
784 return;
785 }
786
787 void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
788 int xoffset, int yoffset,
789 unsigned char *dst_ptr, int dst_pitch) {
790 unsigned char *src, *tmpp;
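  // Scratch buffer for the first 8 rows (8 * 8 bytes) of horizontally
  // filtered data; the remaining 5 context rows needed by the second pass are
  // produced afterwards and kept in registers.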
791 unsigned char tmp[64];
792 int i;
793 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
794 uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
795 uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
796 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
797 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
798 uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
799 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
800 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
801 uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
802
803 if (xoffset == 0) { // secondpass_filter8x8_only
804 // load second_pass filter
805 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
806 d0s8 = vdup_lane_s8(dtmps8, 0);
807 d1s8 = vdup_lane_s8(dtmps8, 1);
808 d2s8 = vdup_lane_s8(dtmps8, 2);
809 d3s8 = vdup_lane_s8(dtmps8, 3);
810 d4s8 = vdup_lane_s8(dtmps8, 4);
811 d5s8 = vdup_lane_s8(dtmps8, 5);
812 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
813 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
814 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
815 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
816 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
817 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
818
819 // load src data
820 src = src_ptr - src_pixels_per_line * 2;
821 d18u8 = vld1_u8(src);
822 src += src_pixels_per_line;
823 d19u8 = vld1_u8(src);
824 src += src_pixels_per_line;
825 d20u8 = vld1_u8(src);
826 src += src_pixels_per_line;
827 d21u8 = vld1_u8(src);
828 src += src_pixels_per_line;
829 d22u8 = vld1_u8(src);
830 src += src_pixels_per_line;
831 d23u8 = vld1_u8(src);
832 src += src_pixels_per_line;
833 d24u8 = vld1_u8(src);
834 src += src_pixels_per_line;
835 d25u8 = vld1_u8(src);
836 src += src_pixels_per_line;
837 d26u8 = vld1_u8(src);
838 src += src_pixels_per_line;
839 d27u8 = vld1_u8(src);
840 src += src_pixels_per_line;
841 d28u8 = vld1_u8(src);
842 src += src_pixels_per_line;
843 d29u8 = vld1_u8(src);
844 src += src_pixels_per_line;
845 d30u8 = vld1_u8(src);
846
847 for (i = 2; i > 0; i--) {
848 q3u16 = vmull_u8(d18u8, d0u8);
849 q4u16 = vmull_u8(d19u8, d0u8);
850 q5u16 = vmull_u8(d20u8, d0u8);
851 q6u16 = vmull_u8(d21u8, d0u8);
852
853 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
854 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
855 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
856 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
857
858 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
859 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
860 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
861 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
862
863 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
864 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
865 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
866 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
867
868 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
869 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
870 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
871 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
872
873 q7u16 = vmull_u8(d21u8, d3u8);
874 q8u16 = vmull_u8(d22u8, d3u8);
875 q9u16 = vmull_u8(d23u8, d3u8);
876 q10u16 = vmull_u8(d24u8, d3u8);
877
878 q3s16 = vreinterpretq_s16_u16(q3u16);
879 q4s16 = vreinterpretq_s16_u16(q4u16);
880 q5s16 = vreinterpretq_s16_u16(q5u16);
881 q6s16 = vreinterpretq_s16_u16(q6u16);
882 q7s16 = vreinterpretq_s16_u16(q7u16);
883 q8s16 = vreinterpretq_s16_u16(q8u16);
884 q9s16 = vreinterpretq_s16_u16(q9u16);
885 q10s16 = vreinterpretq_s16_u16(q10u16);
886
887 q7s16 = vqaddq_s16(q7s16, q3s16);
888 q8s16 = vqaddq_s16(q8s16, q4s16);
889 q9s16 = vqaddq_s16(q9s16, q5s16);
890 q10s16 = vqaddq_s16(q10s16, q6s16);
891
892 d6u8 = vqrshrun_n_s16(q7s16, 7);
893 d7u8 = vqrshrun_n_s16(q8s16, 7);
894 d8u8 = vqrshrun_n_s16(q9s16, 7);
895 d9u8 = vqrshrun_n_s16(q10s16, 7);
896
897 d18u8 = d22u8;
898 d19u8 = d23u8;
899 d20u8 = d24u8;
900 d21u8 = d25u8;
901 d22u8 = d26u8;
902 d23u8 = d27u8;
903 d24u8 = d28u8;
904 d25u8 = d29u8;
905 d26u8 = d30u8;
906
907 vst1_u8(dst_ptr, d6u8);
908 dst_ptr += dst_pitch;
909 vst1_u8(dst_ptr, d7u8);
910 dst_ptr += dst_pitch;
911 vst1_u8(dst_ptr, d8u8);
912 dst_ptr += dst_pitch;
913 vst1_u8(dst_ptr, d9u8);
914 dst_ptr += dst_pitch;
915 }
916 return;
917 }
918
919 // load first_pass filter
920 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
921 d0s8 = vdup_lane_s8(dtmps8, 0);
922 d1s8 = vdup_lane_s8(dtmps8, 1);
923 d2s8 = vdup_lane_s8(dtmps8, 2);
924 d3s8 = vdup_lane_s8(dtmps8, 3);
925 d4s8 = vdup_lane_s8(dtmps8, 4);
926 d5s8 = vdup_lane_s8(dtmps8, 5);
927 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
928 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
929 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
930 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
931 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
932 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
933
934 // First pass: output_height lines x output_width columns (13x8)
935 if (yoffset == 0) // firstpass_filter8x8_only
936 src = src_ptr - 2;
937 else
938 src = src_ptr - 2 - (src_pixels_per_line * 2);
939
940 tmpp = tmp;
941 for (i = 2; i > 0; i--) {
942 q3u8 = vld1q_u8(src);
943 src += src_pixels_per_line;
944 q4u8 = vld1q_u8(src);
945 src += src_pixels_per_line;
946 q5u8 = vld1q_u8(src);
947 src += src_pixels_per_line;
948 q6u8 = vld1q_u8(src);
949 src += src_pixels_per_line;
950
951 __builtin_prefetch(src);
952 __builtin_prefetch(src + src_pixels_per_line);
953 __builtin_prefetch(src + src_pixels_per_line * 2);
954
955 q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
956 q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
957 q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
958 q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
959
960 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
961 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
962 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
963 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
964
965 q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
966 q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
967 q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
968 q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
969
970 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
971 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
972 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
973 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
974
975 q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
976 q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
977 q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
978 q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
979
980 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
981 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
982 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
983 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
984
985 q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
986 q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
987 q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
988 q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
989
990 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
991 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
992 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
993 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
994
995 q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
996 q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
997 q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
998 q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
999
1000 d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1001 d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1002 d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1003 d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1004
1005 q3u16 = vmull_u8(d28u8, d3u8);
1006 q4u16 = vmull_u8(d29u8, d3u8);
1007 q5u16 = vmull_u8(d30u8, d3u8);
1008 q6u16 = vmull_u8(d31u8, d3u8);
1009
1010 q3s16 = vreinterpretq_s16_u16(q3u16);
1011 q4s16 = vreinterpretq_s16_u16(q4u16);
1012 q5s16 = vreinterpretq_s16_u16(q5u16);
1013 q6s16 = vreinterpretq_s16_u16(q6u16);
1014 q7s16 = vreinterpretq_s16_u16(q7u16);
1015 q8s16 = vreinterpretq_s16_u16(q8u16);
1016 q9s16 = vreinterpretq_s16_u16(q9u16);
1017 q10s16 = vreinterpretq_s16_u16(q10u16);
1018
1019 q7s16 = vqaddq_s16(q7s16, q3s16);
1020 q8s16 = vqaddq_s16(q8s16, q4s16);
1021 q9s16 = vqaddq_s16(q9s16, q5s16);
1022 q10s16 = vqaddq_s16(q10s16, q6s16);
1023
1024 d22u8 = vqrshrun_n_s16(q7s16, 7);
1025 d23u8 = vqrshrun_n_s16(q8s16, 7);
1026 d24u8 = vqrshrun_n_s16(q9s16, 7);
1027 d25u8 = vqrshrun_n_s16(q10s16, 7);
1028
1029 if (yoffset == 0) { // firstpass_filter8x8_only
1030 vst1_u8(dst_ptr, d22u8);
1031 dst_ptr += dst_pitch;
1032 vst1_u8(dst_ptr, d23u8);
1033 dst_ptr += dst_pitch;
1034 vst1_u8(dst_ptr, d24u8);
1035 dst_ptr += dst_pitch;
1036 vst1_u8(dst_ptr, d25u8);
1037 dst_ptr += dst_pitch;
1038 } else {
1039 vst1_u8(tmpp, d22u8);
1040 tmpp += 8;
1041 vst1_u8(tmpp, d23u8);
1042 tmpp += 8;
1043 vst1_u8(tmpp, d24u8);
1044 tmpp += 8;
1045 vst1_u8(tmpp, d25u8);
1046 tmpp += 8;
1047 }
1048 }
1049 if (yoffset == 0) return;
1050
1051 // First pass on the remaining 5 lines of data
1052 q3u8 = vld1q_u8(src);
1053 src += src_pixels_per_line;
1054 q4u8 = vld1q_u8(src);
1055 src += src_pixels_per_line;
1056 q5u8 = vld1q_u8(src);
1057 src += src_pixels_per_line;
1058 q6u8 = vld1q_u8(src);
1059 src += src_pixels_per_line;
1060 q7u8 = vld1q_u8(src);
1061
1062 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
1063 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
1064 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
1065 q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
1066 q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
1067
1068 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
1069 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
1070 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
1071 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
1072 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
1073
1074 q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
1075 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
1076 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
1077 q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
1078 q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
1079
1080 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
1081 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
1082 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
1083 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
1084 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
1085
1086 q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
1087 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
1088 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
1089 q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
1090 q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
1091
1092 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
1093 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
1094 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
1095 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
1096 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
1097
1098 q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
1099 q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
1100 q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
1101 q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
1102 q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
1103
1104 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
1105 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
1106 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
1107 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
1108 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
1109
1110 q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
1111 q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
1112 q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
1113 q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
1114 q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
1115
1116 d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
1117 d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
1118 d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
1119 d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
1120 d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
1121
1122 q3u16 = vmull_u8(d27u8, d3u8);
1123 q4u16 = vmull_u8(d28u8, d3u8);
1124 q5u16 = vmull_u8(d29u8, d3u8);
1125 q6u16 = vmull_u8(d30u8, d3u8);
1126 q7u16 = vmull_u8(d31u8, d3u8);
1127
1128 q3s16 = vreinterpretq_s16_u16(q3u16);
1129 q4s16 = vreinterpretq_s16_u16(q4u16);
1130 q5s16 = vreinterpretq_s16_u16(q5u16);
1131 q6s16 = vreinterpretq_s16_u16(q6u16);
1132 q7s16 = vreinterpretq_s16_u16(q7u16);
1133 q8s16 = vreinterpretq_s16_u16(q8u16);
1134 q9s16 = vreinterpretq_s16_u16(q9u16);
1135 q10s16 = vreinterpretq_s16_u16(q10u16);
1136 q11s16 = vreinterpretq_s16_u16(q11u16);
1137 q12s16 = vreinterpretq_s16_u16(q12u16);
1138
1139 q8s16 = vqaddq_s16(q8s16, q3s16);
1140 q9s16 = vqaddq_s16(q9s16, q4s16);
1141 q10s16 = vqaddq_s16(q10s16, q5s16);
1142 q11s16 = vqaddq_s16(q11s16, q6s16);
1143 q12s16 = vqaddq_s16(q12s16, q7s16);
1144
1145 d26u8 = vqrshrun_n_s16(q8s16, 7);
1146 d27u8 = vqrshrun_n_s16(q9s16, 7);
1147 d28u8 = vqrshrun_n_s16(q10s16, 7);
1148 d29u8 = vqrshrun_n_s16(q11s16, 7);
1149 d30u8 = vqrshrun_n_s16(q12s16, 7);
1150
1151 // Second pass: 8x8
1152 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1153 d0s8 = vdup_lane_s8(dtmps8, 0);
1154 d1s8 = vdup_lane_s8(dtmps8, 1);
1155 d2s8 = vdup_lane_s8(dtmps8, 2);
1156 d3s8 = vdup_lane_s8(dtmps8, 3);
1157 d4s8 = vdup_lane_s8(dtmps8, 4);
1158 d5s8 = vdup_lane_s8(dtmps8, 5);
1159 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1160 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1161 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1162 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1163 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1164 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1165
1166 tmpp = tmp;
1167 q9u8 = vld1q_u8(tmpp);
1168 tmpp += 16;
1169 q10u8 = vld1q_u8(tmpp);
1170 tmpp += 16;
1171 q11u8 = vld1q_u8(tmpp);
1172 tmpp += 16;
1173 q12u8 = vld1q_u8(tmpp);
1174
1175 d18u8 = vget_low_u8(q9u8);
1176 d19u8 = vget_high_u8(q9u8);
1177 d20u8 = vget_low_u8(q10u8);
1178 d21u8 = vget_high_u8(q10u8);
1179 d22u8 = vget_low_u8(q11u8);
1180 d23u8 = vget_high_u8(q11u8);
1181 d24u8 = vget_low_u8(q12u8);
1182 d25u8 = vget_high_u8(q12u8);
1183
1184 for (i = 2; i > 0; i--) {
1185 q3u16 = vmull_u8(d18u8, d0u8);
1186 q4u16 = vmull_u8(d19u8, d0u8);
1187 q5u16 = vmull_u8(d20u8, d0u8);
1188 q6u16 = vmull_u8(d21u8, d0u8);
1189
1190 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1191 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1192 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1193 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1194
1195 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1196 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1197 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1198 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1199
1200 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1201 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1202 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1203 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1204
1205 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1206 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1207 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1208 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1209
1210 q7u16 = vmull_u8(d21u8, d3u8);
1211 q8u16 = vmull_u8(d22u8, d3u8);
1212 q9u16 = vmull_u8(d23u8, d3u8);
1213 q10u16 = vmull_u8(d24u8, d3u8);
1214
1215 q3s16 = vreinterpretq_s16_u16(q3u16);
1216 q4s16 = vreinterpretq_s16_u16(q4u16);
1217 q5s16 = vreinterpretq_s16_u16(q5u16);
1218 q6s16 = vreinterpretq_s16_u16(q6u16);
1219 q7s16 = vreinterpretq_s16_u16(q7u16);
1220 q8s16 = vreinterpretq_s16_u16(q8u16);
1221 q9s16 = vreinterpretq_s16_u16(q9u16);
1222 q10s16 = vreinterpretq_s16_u16(q10u16);
1223
1224 q7s16 = vqaddq_s16(q7s16, q3s16);
1225 q8s16 = vqaddq_s16(q8s16, q4s16);
1226 q9s16 = vqaddq_s16(q9s16, q5s16);
1227 q10s16 = vqaddq_s16(q10s16, q6s16);
1228
1229 d6u8 = vqrshrun_n_s16(q7s16, 7);
1230 d7u8 = vqrshrun_n_s16(q8s16, 7);
1231 d8u8 = vqrshrun_n_s16(q9s16, 7);
1232 d9u8 = vqrshrun_n_s16(q10s16, 7);
1233
1234 d18u8 = d22u8;
1235 d19u8 = d23u8;
1236 d20u8 = d24u8;
1237 d21u8 = d25u8;
1238 d22u8 = d26u8;
1239 d23u8 = d27u8;
1240 d24u8 = d28u8;
1241 d25u8 = d29u8;
1242 d26u8 = d30u8;
1243
1244 vst1_u8(dst_ptr, d6u8);
1245 dst_ptr += dst_pitch;
1246 vst1_u8(dst_ptr, d7u8);
1247 dst_ptr += dst_pitch;
1248 vst1_u8(dst_ptr, d8u8);
1249 dst_ptr += dst_pitch;
1250 vst1_u8(dst_ptr, d9u8);
1251 dst_ptr += dst_pitch;
1252 }
1253 return;
1254 }
1255
1256 void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
1257 int src_pixels_per_line, int xoffset,
1258 int yoffset, unsigned char *dst_ptr,
1259 int dst_pitch) {
1260 unsigned char *src, *src_tmp, *dst, *tmpp;
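  // The first pass below writes 7 * 3 = 21 rows of 16 filtered bytes into
  // this scratch buffer (21 * 16 = 336): 16 output rows plus 5 rows of
  // vertical context for the second pass.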
1261 unsigned char tmp[336];
1262 int i, j;
1263 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
1264 uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
1265 uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
1266 uint8x8_t d28u8, d29u8, d30u8, d31u8;
1267 int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
1268 uint8x16_t q3u8, q4u8;
1269 uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
1270 uint16x8_t q11u16, q12u16, q13u16, q15u16;
1271 int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
1272 int16x8_t q11s16, q12s16, q13s16, q15s16;
1273
1274 if (xoffset == 0) { // secondpass_filter16x16_only
1275 // load second_pass filter
1276 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1277 d0s8 = vdup_lane_s8(dtmps8, 0);
1278 d1s8 = vdup_lane_s8(dtmps8, 1);
1279 d2s8 = vdup_lane_s8(dtmps8, 2);
1280 d3s8 = vdup_lane_s8(dtmps8, 3);
1281 d4s8 = vdup_lane_s8(dtmps8, 4);
1282 d5s8 = vdup_lane_s8(dtmps8, 5);
1283 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1284 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1285 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1286 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1287 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1288 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1289
1290 // load src data
1291 src_tmp = src_ptr - src_pixels_per_line * 2;
1292 for (i = 0; i < 2; ++i) {
1293 src = src_tmp + i * 8;
1294 dst = dst_ptr + i * 8;
1295 d18u8 = vld1_u8(src);
1296 src += src_pixels_per_line;
1297 d19u8 = vld1_u8(src);
1298 src += src_pixels_per_line;
1299 d20u8 = vld1_u8(src);
1300 src += src_pixels_per_line;
1301 d21u8 = vld1_u8(src);
1302 src += src_pixels_per_line;
1303 d22u8 = vld1_u8(src);
1304 src += src_pixels_per_line;
1305 for (j = 0; j < 4; ++j) {
1306 d23u8 = vld1_u8(src);
1307 src += src_pixels_per_line;
1308 d24u8 = vld1_u8(src);
1309 src += src_pixels_per_line;
1310 d25u8 = vld1_u8(src);
1311 src += src_pixels_per_line;
1312 d26u8 = vld1_u8(src);
1313 src += src_pixels_per_line;
1314
1315 q3u16 = vmull_u8(d18u8, d0u8);
1316 q4u16 = vmull_u8(d19u8, d0u8);
1317 q5u16 = vmull_u8(d20u8, d0u8);
1318 q6u16 = vmull_u8(d21u8, d0u8);
1319
1320 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1321 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1322 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1323 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1324
1325 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1326 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1327 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1328 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1329
1330 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1331 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1332 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1333 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1334
1335 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1336 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1337 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1338 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1339
1340 q7u16 = vmull_u8(d21u8, d3u8);
1341 q8u16 = vmull_u8(d22u8, d3u8);
1342 q9u16 = vmull_u8(d23u8, d3u8);
1343 q10u16 = vmull_u8(d24u8, d3u8);
1344
1345 q3s16 = vreinterpretq_s16_u16(q3u16);
1346 q4s16 = vreinterpretq_s16_u16(q4u16);
1347 q5s16 = vreinterpretq_s16_u16(q5u16);
1348 q6s16 = vreinterpretq_s16_u16(q6u16);
1349 q7s16 = vreinterpretq_s16_u16(q7u16);
1350 q8s16 = vreinterpretq_s16_u16(q8u16);
1351 q9s16 = vreinterpretq_s16_u16(q9u16);
1352 q10s16 = vreinterpretq_s16_u16(q10u16);
1353
1354 q7s16 = vqaddq_s16(q7s16, q3s16);
1355 q8s16 = vqaddq_s16(q8s16, q4s16);
1356 q9s16 = vqaddq_s16(q9s16, q5s16);
1357 q10s16 = vqaddq_s16(q10s16, q6s16);
1358
1359 d6u8 = vqrshrun_n_s16(q7s16, 7);
1360 d7u8 = vqrshrun_n_s16(q8s16, 7);
1361 d8u8 = vqrshrun_n_s16(q9s16, 7);
1362 d9u8 = vqrshrun_n_s16(q10s16, 7);
1363
1364 d18u8 = d22u8;
1365 d19u8 = d23u8;
1366 d20u8 = d24u8;
1367 d21u8 = d25u8;
1368 d22u8 = d26u8;
1369
1370 vst1_u8(dst, d6u8);
1371 dst += dst_pitch;
1372 vst1_u8(dst, d7u8);
1373 dst += dst_pitch;
1374 vst1_u8(dst, d8u8);
1375 dst += dst_pitch;
1376 vst1_u8(dst, d9u8);
1377 dst += dst_pitch;
1378 }
1379 }
1380 return;
1381 }
1382
1383 // load first_pass filter
1384 dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
1385 d0s8 = vdup_lane_s8(dtmps8, 0);
1386 d1s8 = vdup_lane_s8(dtmps8, 1);
1387 d2s8 = vdup_lane_s8(dtmps8, 2);
1388 d3s8 = vdup_lane_s8(dtmps8, 3);
1389 d4s8 = vdup_lane_s8(dtmps8, 4);
1390 d5s8 = vdup_lane_s8(dtmps8, 5);
1391 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1392 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1393 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1394 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1395 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1396 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1397
1398 // First pass: output_height lines x output_width columns (21x16)
1399 if (yoffset == 0) { // firstpass_filter16x16_only
1400 src = src_ptr - 2;
1401 dst = dst_ptr;
1402 for (i = 0; i < 8; ++i) {
1403 d6u8 = vld1_u8(src);
1404 d7u8 = vld1_u8(src + 8);
1405 d8u8 = vld1_u8(src + 16);
1406 src += src_pixels_per_line;
1407 d9u8 = vld1_u8(src);
1408 d10u8 = vld1_u8(src + 8);
1409 d11u8 = vld1_u8(src + 16);
1410 src += src_pixels_per_line;
1411
1412 __builtin_prefetch(src);
1413 __builtin_prefetch(src + src_pixels_per_line);
1414
1415 q6u16 = vmull_u8(d6u8, d0u8);
1416 q7u16 = vmull_u8(d7u8, d0u8);
1417 q8u16 = vmull_u8(d9u8, d0u8);
1418 q9u16 = vmull_u8(d10u8, d0u8);
1419
1420 d20u8 = vext_u8(d6u8, d7u8, 1);
1421 d21u8 = vext_u8(d9u8, d10u8, 1);
1422 d22u8 = vext_u8(d7u8, d8u8, 1);
1423 d23u8 = vext_u8(d10u8, d11u8, 1);
1424 d24u8 = vext_u8(d6u8, d7u8, 4);
1425 d25u8 = vext_u8(d9u8, d10u8, 4);
1426 d26u8 = vext_u8(d7u8, d8u8, 4);
1427 d27u8 = vext_u8(d10u8, d11u8, 4);
1428 d28u8 = vext_u8(d6u8, d7u8, 5);
1429 d29u8 = vext_u8(d9u8, d10u8, 5);
1430
1431 q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
1432 q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
1433 q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
1434 q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
1435 q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
1436 q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
1437 q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
1438 q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
1439 q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
1440 q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
1441
1442 d20u8 = vext_u8(d7u8, d8u8, 5);
1443 d21u8 = vext_u8(d10u8, d11u8, 5);
1444 d22u8 = vext_u8(d6u8, d7u8, 2);
1445 d23u8 = vext_u8(d9u8, d10u8, 2);
1446 d24u8 = vext_u8(d7u8, d8u8, 2);
1447 d25u8 = vext_u8(d10u8, d11u8, 2);
1448 d26u8 = vext_u8(d6u8, d7u8, 3);
1449 d27u8 = vext_u8(d9u8, d10u8, 3);
1450 d28u8 = vext_u8(d7u8, d8u8, 3);
1451 d29u8 = vext_u8(d10u8, d11u8, 3);
1452
1453 q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
1454 q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
1455 q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
1456 q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
1457 q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
1458 q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
1459
1460 q10u16 = vmull_u8(d26u8, d3u8);
1461 q11u16 = vmull_u8(d27u8, d3u8);
1462 q12u16 = vmull_u8(d28u8, d3u8);
1463 q15u16 = vmull_u8(d29u8, d3u8);
1464
1465 q6s16 = vreinterpretq_s16_u16(q6u16);
1466 q7s16 = vreinterpretq_s16_u16(q7u16);
1467 q8s16 = vreinterpretq_s16_u16(q8u16);
1468 q9s16 = vreinterpretq_s16_u16(q9u16);
1469 q10s16 = vreinterpretq_s16_u16(q10u16);
1470 q11s16 = vreinterpretq_s16_u16(q11u16);
1471 q12s16 = vreinterpretq_s16_u16(q12u16);
1472 q15s16 = vreinterpretq_s16_u16(q15u16);
1473
1474 q6s16 = vqaddq_s16(q6s16, q10s16);
1475 q8s16 = vqaddq_s16(q8s16, q11s16);
1476 q7s16 = vqaddq_s16(q7s16, q12s16);
1477 q9s16 = vqaddq_s16(q9s16, q15s16);
1478
1479 d6u8 = vqrshrun_n_s16(q6s16, 7);
1480 d7u8 = vqrshrun_n_s16(q7s16, 7);
1481 d8u8 = vqrshrun_n_s16(q8s16, 7);
1482 d9u8 = vqrshrun_n_s16(q9s16, 7);
1483
1484 q3u8 = vcombine_u8(d6u8, d7u8);
1485 q4u8 = vcombine_u8(d8u8, d9u8);
1486 vst1q_u8(dst, q3u8);
1487 dst += dst_pitch;
1488 vst1q_u8(dst, q4u8);
1489 dst += dst_pitch;
1490 }
1491 return;
1492 }
1493
1494 src = src_ptr - 2 - src_pixels_per_line * 2;
1495 tmpp = tmp;
1496 for (i = 0; i < 7; ++i) {
1497 d6u8 = vld1_u8(src);
1498 d7u8 = vld1_u8(src + 8);
1499 d8u8 = vld1_u8(src + 16);
1500 src += src_pixels_per_line;
1501 d9u8 = vld1_u8(src);
1502 d10u8 = vld1_u8(src + 8);
1503 d11u8 = vld1_u8(src + 16);
1504 src += src_pixels_per_line;
1505 d12u8 = vld1_u8(src);
1506 d13u8 = vld1_u8(src + 8);
1507 d14u8 = vld1_u8(src + 16);
1508 src += src_pixels_per_line;
1509
1510 __builtin_prefetch(src);
1511 __builtin_prefetch(src + src_pixels_per_line);
1512 __builtin_prefetch(src + src_pixels_per_line * 2);
1513
1514 q8u16 = vmull_u8(d6u8, d0u8);
1515 q9u16 = vmull_u8(d7u8, d0u8);
1516 q10u16 = vmull_u8(d9u8, d0u8);
1517 q11u16 = vmull_u8(d10u8, d0u8);
1518 q12u16 = vmull_u8(d12u8, d0u8);
1519 q13u16 = vmull_u8(d13u8, d0u8);
1520
1521 d28u8 = vext_u8(d6u8, d7u8, 1);
1522 d29u8 = vext_u8(d9u8, d10u8, 1);
1523 d30u8 = vext_u8(d12u8, d13u8, 1);
1524 q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
1525 q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
1526 q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
1527 d28u8 = vext_u8(d7u8, d8u8, 1);
1528 d29u8 = vext_u8(d10u8, d11u8, 1);
1529 d30u8 = vext_u8(d13u8, d14u8, 1);
1530 q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
1531 q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
1532 q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
1533
1534 d28u8 = vext_u8(d6u8, d7u8, 4);
1535 d29u8 = vext_u8(d9u8, d10u8, 4);
1536 d30u8 = vext_u8(d12u8, d13u8, 4);
1537 q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
1538 q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
1539 q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
1540 d28u8 = vext_u8(d7u8, d8u8, 4);
1541 d29u8 = vext_u8(d10u8, d11u8, 4);
1542 d30u8 = vext_u8(d13u8, d14u8, 4);
1543 q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
1544 q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
1545 q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
1546
1547 d28u8 = vext_u8(d6u8, d7u8, 5);
1548 d29u8 = vext_u8(d9u8, d10u8, 5);
1549 d30u8 = vext_u8(d12u8, d13u8, 5);
1550 q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
1551 q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
1552 q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
1553 d28u8 = vext_u8(d7u8, d8u8, 5);
1554 d29u8 = vext_u8(d10u8, d11u8, 5);
1555 d30u8 = vext_u8(d13u8, d14u8, 5);
1556 q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
1557 q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
1558 q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
1559
1560 d28u8 = vext_u8(d6u8, d7u8, 2);
1561 d29u8 = vext_u8(d9u8, d10u8, 2);
1562 d30u8 = vext_u8(d12u8, d13u8, 2);
1563 q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
1564 q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
1565 q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
1566 d28u8 = vext_u8(d7u8, d8u8, 2);
1567 d29u8 = vext_u8(d10u8, d11u8, 2);
1568 d30u8 = vext_u8(d13u8, d14u8, 2);
1569 q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
1570 q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
1571 q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
1572
1573 d28u8 = vext_u8(d6u8, d7u8, 3);
1574 d29u8 = vext_u8(d9u8, d10u8, 3);
1575 d30u8 = vext_u8(d12u8, d13u8, 3);
1576 d15u8 = vext_u8(d7u8, d8u8, 3);
1577 d31u8 = vext_u8(d10u8, d11u8, 3);
1578 d6u8 = vext_u8(d13u8, d14u8, 3);
1579 q4u16 = vmull_u8(d28u8, d3u8);
1580 q5u16 = vmull_u8(d29u8, d3u8);
1581 q6u16 = vmull_u8(d30u8, d3u8);
1582 q4s16 = vreinterpretq_s16_u16(q4u16);
1583 q5s16 = vreinterpretq_s16_u16(q5u16);
1584 q6s16 = vreinterpretq_s16_u16(q6u16);
1585 q8s16 = vreinterpretq_s16_u16(q8u16);
1586 q10s16 = vreinterpretq_s16_u16(q10u16);
1587 q12s16 = vreinterpretq_s16_u16(q12u16);
1588 q8s16 = vqaddq_s16(q8s16, q4s16);
1589 q10s16 = vqaddq_s16(q10s16, q5s16);
1590 q12s16 = vqaddq_s16(q12s16, q6s16);
1591
1592 q6u16 = vmull_u8(d15u8, d3u8);
1593 q7u16 = vmull_u8(d31u8, d3u8);
1594 q3u16 = vmull_u8(d6u8, d3u8);
1595 q3s16 = vreinterpretq_s16_u16(q3u16);
1596 q6s16 = vreinterpretq_s16_u16(q6u16);
1597 q7s16 = vreinterpretq_s16_u16(q7u16);
1598 q9s16 = vreinterpretq_s16_u16(q9u16);
1599 q11s16 = vreinterpretq_s16_u16(q11u16);
1600 q13s16 = vreinterpretq_s16_u16(q13u16);
1601 q9s16 = vqaddq_s16(q9s16, q6s16);
1602 q11s16 = vqaddq_s16(q11s16, q7s16);
1603 q13s16 = vqaddq_s16(q13s16, q3s16);
1604
1605 d6u8 = vqrshrun_n_s16(q8s16, 7);
1606 d7u8 = vqrshrun_n_s16(q9s16, 7);
1607 d8u8 = vqrshrun_n_s16(q10s16, 7);
1608 d9u8 = vqrshrun_n_s16(q11s16, 7);
1609 d10u8 = vqrshrun_n_s16(q12s16, 7);
1610 d11u8 = vqrshrun_n_s16(q13s16, 7);
1611
1612 vst1_u8(tmpp, d6u8);
1613 tmpp += 8;
1614 vst1_u8(tmpp, d7u8);
1615 tmpp += 8;
1616 vst1_u8(tmpp, d8u8);
1617 tmpp += 8;
1618 vst1_u8(tmpp, d9u8);
1619 tmpp += 8;
1620 vst1_u8(tmpp, d10u8);
1621 tmpp += 8;
1622 vst1_u8(tmpp, d11u8);
1623 tmpp += 8;
1624 }
1625
1626 // Second pass: 16x16
1627 dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1628 d0s8 = vdup_lane_s8(dtmps8, 0);
1629 d1s8 = vdup_lane_s8(dtmps8, 1);
1630 d2s8 = vdup_lane_s8(dtmps8, 2);
1631 d3s8 = vdup_lane_s8(dtmps8, 3);
1632 d4s8 = vdup_lane_s8(dtmps8, 4);
1633 d5s8 = vdup_lane_s8(dtmps8, 5);
1634 d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1635 d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1636 d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1637 d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1638 d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1639 d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1640
1641 for (i = 0; i < 2; ++i) {
1642 dst = dst_ptr + 8 * i;
1643 tmpp = tmp + 8 * i;
1644 d18u8 = vld1_u8(tmpp);
1645 tmpp += 16;
1646 d19u8 = vld1_u8(tmpp);
1647 tmpp += 16;
1648 d20u8 = vld1_u8(tmpp);
1649 tmpp += 16;
1650 d21u8 = vld1_u8(tmpp);
1651 tmpp += 16;
1652 d22u8 = vld1_u8(tmpp);
1653 tmpp += 16;
1654 for (j = 0; j < 4; ++j) {
1655 d23u8 = vld1_u8(tmpp);
1656 tmpp += 16;
1657 d24u8 = vld1_u8(tmpp);
1658 tmpp += 16;
1659 d25u8 = vld1_u8(tmpp);
1660 tmpp += 16;
1661 d26u8 = vld1_u8(tmpp);
1662 tmpp += 16;
1663
1664 q3u16 = vmull_u8(d18u8, d0u8);
1665 q4u16 = vmull_u8(d19u8, d0u8);
1666 q5u16 = vmull_u8(d20u8, d0u8);
1667 q6u16 = vmull_u8(d21u8, d0u8);
1668
1669 q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1670 q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1671 q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1672 q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1673
1674 q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1675 q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1676 q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1677 q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1678
1679 q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1680 q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1681 q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1682 q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1683
1684 q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1685 q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1686 q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1687 q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1688
1689 q7u16 = vmull_u8(d21u8, d3u8);
1690 q8u16 = vmull_u8(d22u8, d3u8);
1691 q9u16 = vmull_u8(d23u8, d3u8);
1692 q10u16 = vmull_u8(d24u8, d3u8);
1693
1694 q3s16 = vreinterpretq_s16_u16(q3u16);
1695 q4s16 = vreinterpretq_s16_u16(q4u16);
1696 q5s16 = vreinterpretq_s16_u16(q5u16);
1697 q6s16 = vreinterpretq_s16_u16(q6u16);
1698 q7s16 = vreinterpretq_s16_u16(q7u16);
1699 q8s16 = vreinterpretq_s16_u16(q8u16);
1700 q9s16 = vreinterpretq_s16_u16(q9u16);
1701 q10s16 = vreinterpretq_s16_u16(q10u16);
1702
1703 q7s16 = vqaddq_s16(q7s16, q3s16);
1704 q8s16 = vqaddq_s16(q8s16, q4s16);
1705 q9s16 = vqaddq_s16(q9s16, q5s16);
1706 q10s16 = vqaddq_s16(q10s16, q6s16);
1707
1708 d6u8 = vqrshrun_n_s16(q7s16, 7);
1709 d7u8 = vqrshrun_n_s16(q8s16, 7);
1710 d8u8 = vqrshrun_n_s16(q9s16, 7);
1711 d9u8 = vqrshrun_n_s16(q10s16, 7);
1712
1713 d18u8 = d22u8;
1714 d19u8 = d23u8;
1715 d20u8 = d24u8;
1716 d21u8 = d25u8;
1717 d22u8 = d26u8;
1718
1719 vst1_u8(dst, d6u8);
1720 dst += dst_pitch;
1721 vst1_u8(dst, d7u8);
1722 dst += dst_pitch;
1723 vst1_u8(dst, d8u8);
1724 dst += dst_pitch;
1725 vst1_u8(dst, d9u8);
1726 dst += dst_pitch;
1727 }
1728 }
1729 return;
1730 }
1731