/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

// Note:
// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src).
// 2. After refactoring the shared code in the kernel loops into inline
// functions, decoder speed dropped significantly when built with gcc, so those
// parts are left unrefactored for now.
// 3. For the horizontal convolution there is an alternative optimization that
// convolves a single row per loop iteration. For each row, 8 sample banks of 4
// or 8 samples each are either read from memory at src, (src + 1), ...,
// (src + 7), or prepared with vector extract instructions. That approach is
// much faster in the speed unit tests but slowed the whole decoder down by 5%.
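//    A rough sketch of that alternative, for reference only (it is not what is
//    implemented below), assuming 16 readable bytes at src:
//      uint8x16_t row = vld1q_u8(src);
//      uint8x8_t bank0 = vget_low_u8(row);
//      uint8x8_t bank1 = vext_u8(vget_low_u8(row), vget_high_u8(row), 1);
//      ... and so on up to bank7.

// Each output pixel computed below is conceptually an 8-tap filter sum,
// rounded and saturated to 8 bits:
//   dst[x] = clamp(ROUND_POWER_OF_TWO(sum(src[x - 3 + k] * filter[k],
//                                         k = 0..7), FILTER_BITS), 0, 255)
// convolve8_4() and convolve8_8() (declared in vpx_convolve8_neon.h) evaluate
// the filter sum for 4 or 8 pixels at a time; in the 4-pixel paths the
// rounding shift and saturating narrow back to u8 are done here with
// vqrshrun_n_s16(), while the 8-pixel helper already returns narrowed data.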

void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int x0_q4,
                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h) {
  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
  uint8x8_t t0, t1, t2, t3;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

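  // The 8-tap filter spans samples [x - 3, x + 4] for output x, so back the
  // source pointer up by 3 samples.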
  src -= 3;

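  // h == 4: process the whole 4-row strip with 4-lane (64-bit) arithmetic.
  // Rows are loaded and transposed so that each register holds the same sample
  // position from every row, letting the horizontal filter be applied with
  // whole-register operations; the results are transposed back before storing.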
  if (h == 4) {
    uint8x8_t d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);

    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      transpose_u8_4x4(&d01, &d23);

      store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01);
      store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23);

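      // Slide the sample window along: columns 4..10 of this iteration become
      // columns 0..6 of the next.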
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w != 0);
  } else {
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

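    // w == 4: process 8 rows per iteration so that full 8-lane registers are
    // used; wider blocks are handled 8 columns and 8 rows at a time below.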
    if (w == 4) {
      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        transpose_u8_8x4(&d04, &d15, &d26, &d37);

        store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04);
        store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15);
        store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
        store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);

        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    } else {
      uint8_t *d;
      uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
      int16x8_t s11, s12, s13, s14;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

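        // Filter the current 8-row strip 8 output columns at a time.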
        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);

          transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

          store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width != 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h > 0);
    }
  }
}

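// The "avg" variants below are identical to the functions above except that
// the filtered result is averaged with the data already in dst (using a
// rounding halving add, vrhadd_u8) before being stored.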
void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h) {
  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
  uint8x8_t t0, t1, t2, t3;

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  src -= 3;

  if (h == 4) {
    uint8x8_t d01, d23, dd01, dd23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    __builtin_prefetch(src + 0 * src_stride);
    __builtin_prefetch(src + 1 * src_stride);
    __builtin_prefetch(src + 2 * src_stride);
    __builtin_prefetch(src + 3 * src_stride);
    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
    transpose_u8_8x4(&t0, &t1, &t2, &t3);
    s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
    s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
    s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
    s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
    s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));

    __builtin_prefetch(dst + 0 * dst_stride);
    __builtin_prefetch(dst + 1 * dst_stride);
    __builtin_prefetch(dst + 2 * dst_stride);
    __builtin_prefetch(dst + 3 * dst_stride);
    src += 7;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u8_8x4(&t0, &t1, &t2, &t3);
      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      transpose_u8_4x4(&d01, &d23);

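      // Average the filtered result with the pixels already in dst.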
      dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride);
      dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride);

      d01 = vrhadd_u8(d01, dd01);
      d23 = vrhadd_u8(d23, dd23);

      store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01);
      store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w != 0);
  } else {
    int width;
    const uint8_t *s;
    uint8x8_t t4, t5, t6, t7;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    if (w == 4) {
      uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37;

      do {
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
                    &t7);
        src += 8 * src_stride;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);
        transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        transpose_u8_8x4(&d04, &d15, &d26, &d37);

        dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride);
        dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride);
        dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride);
        dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride);

        d04 = vrhadd_u8(d04, dd04);
        d15 = vrhadd_u8(d15, dd15);
        d26 = vrhadd_u8(d26, dd26);
        d37 = vrhadd_u8(d37, dd37);

        store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04);
        store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15);
        store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26);
        store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37);

        dst += 8 * dst_stride;
        h -= 8;
      } while (h != 0);
    } else {
      uint8_t *d;
      uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
      int16x8_t s11, s12, s13, s14;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        __builtin_prefetch(src + 7 * src_stride);
        load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

        width = w;
        s = src + 7;
        d = dst;
        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(dst + 4 * dst_stride);
        __builtin_prefetch(dst + 5 * dst_stride);
        __builtin_prefetch(dst + 6 * dst_stride);
        __builtin_prefetch(dst + 7 * dst_stride);

        do {
          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));

          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
          d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
          d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
          d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
          d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);

          transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

          d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
          d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
          d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride));
          d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride));
          d4 = vrhadd_u8(d4, vld1_u8(d + 4 * dst_stride));
          d5 = vrhadd_u8(d5, vld1_u8(d + 5 * dst_stride));
          d6 = vrhadd_u8(d6, vld1_u8(d + 6 * dst_stride));
          d7 = vrhadd_u8(d7, vld1_u8(d + 7 * dst_stride));

          store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

          s0 = s8;
          s1 = s9;
          s2 = s10;
          s3 = s11;
          s4 = s12;
          s5 = s13;
          s6 = s14;
          s += 8;
          d += 8;
          width -= 8;
        } while (width != 0);
        src += 8 * src_stride;
        dst += 8 * dst_stride;
        h -= 8;
      } while (h != 0);
    }
  }
}

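// The vertical filters need no transposes: successive rows already line up in
// register lanes, so the 8-tap filter is applied directly across the loaded
// rows, one output row per convolve8_* call.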
void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;

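  // w == 4: use 4-lane arithmetic and produce 4 rows per loop iteration;
  // wider blocks are processed 8 columns at a time with 8-lane arithmetic.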
  if (w == 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));

    src += 7 * src_stride;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

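      // Shift the row window down: rows 4..10 become rows 0..6 for the next 4
      // output rows.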
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);

      load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      s = src + 7 * src_stride;
      d = dst;
      height = h;

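      // Walk down the current 8-wide column strip 4 rows at a time, reusing
      // the 7 most recent rows between iterations.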
      do {
        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);

        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        s += 4 * src_stride;
        d += 4 * dst_stride;
        height -= 4;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}

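// Identical to vpx_convolve8_vert_neon() except that the result is averaged
// with the existing dst pixels (vrhadd_u8) before being stored.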
void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  assert(y_step_q4 == 16);

  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;

  src -= 3 * src_stride;

  if (w == 4) {
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;

    load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
    s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
    s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
    s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
    s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));
    s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4)));
    s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5)));
    s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6)));

    src += 7 * src_stride;

    do {
      load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0)));
      s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1)));
      s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2)));
      s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3)));

      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);

      d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
      d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
      d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
      d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
      d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
      d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);

      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);

      d01 = vrhadd_u8(d01, dd01);
      d23 = vrhadd_u8(d23, dd23);

      store_u8(dst + 0 * dst_stride, dst_stride, d01);
      store_u8(dst + 2 * dst_stride, dst_stride, d23);

      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4 * src_stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h != 0);
  } else {
    int height;
    const uint8_t *s;
    uint8_t *d;
    uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

    do {
      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      __builtin_prefetch(src + 4 * src_stride);
      __builtin_prefetch(src + 5 * src_stride);
      __builtin_prefetch(src + 6 * src_stride);

      load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));

      s = src + 7 * src_stride;
      d = dst;
      height = h;

      do {
        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));

        __builtin_prefetch(d + 0 * dst_stride);
        __builtin_prefetch(d + 1 * dst_stride);
        __builtin_prefetch(d + 2 * dst_stride);
        __builtin_prefetch(d + 3 * dst_stride);
        __builtin_prefetch(s + 0 * src_stride);
        __builtin_prefetch(s + 1 * src_stride);
        __builtin_prefetch(s + 2 * src_stride);
        __builtin_prefetch(s + 3 * src_stride);

        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);

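        // Average with the existing dst rows before storing.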
        d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride));
        d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride));
        d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride));
        d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride));

        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        height -= 4;
        s += 4 * src_stride;
        d += 4 * dst_stride;
      } while (height != 0);
      src += 8;
      dst += 8;
      w -= 8;
    } while (w != 0);
  }
}