/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ All public functions in this file have the following signature:
@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
@                             const uint8_t *ref, ptrdiff_t ref_stride,
@                             int h, int mx, int my);

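@ As a rough illustration (a sketch only, not part of the build), the copy/avg
@ helpers below behave like this C, with dst/ref viewed as 16 bit pixel
@ pointers and the strides given in bytes:
@
@   // ff_vp9_avgN_16_neon: average N pixels per row, for h rows
@   for (y = 0; y < h; y++, dst += dst_stride / 2, ref += ref_stride / 2)
@       for (x = 0; x < N; x++)
@           dst[x] = (dst[x] + ref[x] + 1) >> 1;       // vrhadd.u16
@
@   // ff_vp9_copy128_neon instead copies 128 bytes (64 pixels) per row.
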
function ff_vp9_copy128_neon, export=1
        ldr             r12, [sp]
        sub             r1,  r1,  #96
        sub             r3,  r3,  #96
1:
        subs            r12, r12, #1
        vld1.16         {q0,  q1},  [r2]!
        vst1.16         {q0,  q1},  [r0, :128]!
        vld1.16         {q2,  q3},  [r2]!
        vst1.16         {q2,  q3},  [r0, :128]!
        vld1.16         {q8,  q9},  [r2]!
        vst1.16         {q8,  q9},  [r0, :128]!
        vld1.16         {q10, q11}, [r2], r3
        vst1.16         {q10, q11}, [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg64_16_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        sub             r1,  r1,  #96
        sub             r3,  r3,  #96
        mov             lr,  r0
1:
        subs            r12, r12, #1
        vld1.16         {q8,  q9},  [r2]!
        vld1.16         {q0,  q1},  [r0, :128]!
        vld1.16         {q10, q11}, [r2]!
        vrhadd.u16      q0,  q0,  q8
        vld1.16         {q2,  q3},  [r0, :128]!
        vrhadd.u16      q1,  q1,  q9
        vld1.16         {q12, q13}, [r2]!
        vrhadd.u16      q2,  q2,  q10
        vst1.16         {q0,  q1},  [lr, :128]!
        vrhadd.u16      q3,  q3,  q11
        vld1.16         {q8,  q9},  [r0, :128]!
        vst1.16         {q2,  q3},  [lr, :128]!
        vrhadd.u16      q8,  q8,  q12
        vld1.16         {q14, q15}, [r2], r3
        vrhadd.u16      q9,  q9,  q13
        vld1.16         {q10, q11}, [r0, :128], r1
        vrhadd.u16      q10, q10, q14
        vst1.16         {q8,  q9},  [lr, :128]!
        vrhadd.u16      q11, q11, q15
        vst1.16         {q10, q11}, [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_avg32_16_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        sub             r1,  r1,  #32
        sub             r3,  r3,  #32
        mov             lr,  r0
1:
        subs            r12, r12, #1
        vld1.16         {q8,  q9},  [r2]!
        vld1.16         {q0,  q1},  [r0, :128]!
        vld1.16         {q10, q11}, [r2], r3
        vrhadd.u16      q0,  q0,  q8
        vld1.16         {q2,  q3},  [r0, :128], r1
        vrhadd.u16      q1,  q1,  q9
        vrhadd.u16      q2,  q2,  q10
        vst1.16         {q0, q1},  [lr, :128]!
        vrhadd.u16      q3,  q3,  q11
        vst1.16         {q2, q3},  [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_avg16_16_neon, export=1
        ldr             r12, [sp]
1:
        subs            r12, r12, #1
        vld1.16         {q2,  q3},  [r2], r3
        vld1.16         {q0,  q1},  [r0, :128]
        vrhadd.u16      q0,  q0,  q2
        vrhadd.u16      q1,  q1,  q3
        vst1.16         {q0,  q1},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg8_16_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        mov             lr,  r0
1:
        subs            r12, r12, #2
        vld1.16         {q2},  [r2], r3
        vld1.16         {q0},  [r0, :128], r1
        vld1.16         {q3},  [r2], r3
        vrhadd.u16      q0,  q0,  q2
        vld1.16         {q1},  [r0, :128], r1
        vrhadd.u16      q1,  q1,  q3
        vst1.16         {q0},  [lr, :128], r1
        vst1.16         {q1},  [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_avg4_16_neon, export=1
        ldr             r12, [sp]
1:
        subs            r12, r12, #2
        vld1.16         {d2},  [r2], r3
        vld1.16         {d0},  [r0, :64], r1
        vld1.16         {d3},  [r2], r3
        vrhadd.u16      d0,  d0,  d2
        vld1.16         {d1},  [r0, :64]
        sub             r0,  r0,  r1
        vrhadd.u16      d1,  d1,  d3
        vst1.16         {d0},  [r0, :64], r1
        vst1.16         {d1},  [r0, :64], r1
        bne             1b
        bx              lr
endfunc

@ Helper macros for vmull/vmlal with a constant from either d0 or d1 depending on index
.macro vmull_lane dst, src, idx
.if \idx < 4
        vmull.s16       \dst, \src, d0[\idx]
.else
        vmull.s16       \dst, \src, d1[\idx - 4]
.endif
.endm
.macro vmlal_lane dst, src, idx
.if \idx < 4
        vmlal.s16       \dst, \src, d0[\idx]
.else
        vmlal.s16       \dst, \src, d1[\idx - 4]
.endif
.endm
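@ For example, "vmlal_lane q1, d28, 5" expands to "vmlal.s16 q1, d28, d1[1]".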

@ Extract a vector from src1-src2 and src3-src4 and multiply-accumulate
@ into dst1 and dst3 (or dst1-dst2 and dst3-dst4 for size >= 8)
.macro extmlal dst1, dst2, dst3, dst4, src1, src2, src3, src4, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src3, \src4, #(2*\offset)
        vmlal_lane      \dst1,  d28, \offset
        vmlal_lane      \dst3,  d30, \offset
.if \size >= 8
        vmlal_lane      \dst2,  d29, \offset
        vmlal_lane      \dst4,  d31, \offset
.endif
.endm
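@ In other words, each row's accumulators gain the filter tap at lane
@ offset multiplied by the input window shifted by offset 16 bit elements.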


@ Instantiate a horizontal filter function for the given size.
@ This can work on 4 or 8 pixels in parallel; for larger
@ widths it will do 8 pixels at a time and loop horizontally.
@ The actual width (in bytes) is passed in r5, the height in r4 and
@ the filter coefficients in r12.
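@ Roughly, the "put" case computes the following (an illustrative sketch
@ only, with src/dst viewed as 16 bit pixel pointers and filter[] being the
@ eight coefficients loaded from r12):
@
@   for (y = 0; y < h; y++)
@       for (x = 0; x < w; x++) {
@           int sum = 0;
@           for (i = 0; i < 8; i++)
@               sum += src[y][x + i - 3] * filter[i];
@           dst[y][x] = av_clip((sum + 64) >> 7, 0, max_pixel);
@       }                                 // max_pixel = (1 << bpp) - 1, in q3/d6
@
@ The "avg" case additionally rounding-averages the result with the pixel
@ already in dst, as in the avg functions above.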
.macro do_8tap_h type, size
function \type\()_8tap_\size\()h
        sub             r2,  r2,  #6
        add             r6,  r0,  r1
        add             r7,  r2,  r3
        add             r1,  r1,  r1
        add             r3,  r3,  r3
        @ Only size >= 8 loops horizontally and needs
        @ reduced dst stride
.if \size >= 8
        sub             r1,  r1,  r5
.endif
        @ size >= 8 loads two qwords and increments r2,
        @ while size 4 only needs three dwords and no
        @ postincrement
.if \size >= 8
        sub             r3,  r3,  r5
        sub             r3,  r3,  #16
.endif
        @ Load the filter vector
        vld1.16         {q0},  [r12,:128]
1:
.if \size >= 8
        mov             r12, r5
.endif
        @ Load src
.if \size >= 8
        vld1.16         {q8,  q9},  [r2]!
        vld1.16         {q10, q11}, [r7]!
.else
        vld1.16         {d16, d17, d18}, [r2]
        vld1.16         {d20, d21, d22}, [r7]
.endif
2:

        vmull.s16       q1,  d16, d0[0]
        vmull.s16       q12, d20, d0[0]
.if \size >= 8
        vmull.s16       q2,  d17, d0[0]
        vmull.s16       q13, d21, d0[0]
.endif
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 1, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 2, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 3, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 4, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 5, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 6, \size
        extmlal         q1,  q2,  q12, q13, q8,  q9,  q10, q11, 7, \size

        @ Round, shift and saturate.
        @ The vqrshrun takes care of clamping negative values to zero, but
        @ we still need to clamp against the max pixel value with vmin.
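        @ That is: dst = min(saturate_u16((sum + 64) >> 7), max_pixel),
        @ where q3/d6 holds the max pixel value set up by the caller.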
        vqrshrun.s32    d2,  q1,  #7
        vqrshrun.s32    d24, q12, #7
.if \size >= 8
        vqrshrun.s32    d3,  q2,  #7
        vqrshrun.s32    d25, q13, #7
        vmin.u16        q1,  q1,  q3
        vmin.u16        q12, q12, q3
.else
        vmin.u16        d2,  d2,  d6
        vmin.u16        d24, d24, d6
.endif
        @ Average
.ifc \type,avg
.if \size >= 8
        vld1.16         {q14}, [r0,:128]
        vld1.16         {q15}, [r6,:128]
        vrhadd.u16      q1,  q1,  q14
        vrhadd.u16      q12, q12, q15
.else
        vld1.16         {d28}, [r0,:64]
        vld1.16         {d30}, [r6,:64]
        vrhadd.u16      d2,  d2,  d28
        vrhadd.u16      d24, d24, d30
.endif
.endif
        @ Store and loop horizontally (for size >= 8)
.if \size >= 8
        subs            r12, r12, #16
        vst1.16         {q1},  [r0,:128]!
        vst1.16         {q12}, [r6,:128]!
        beq             3f
        vmov            q8,  q9
        vmov            q10, q11
        vld1.16         {q9},  [r2]!
        vld1.16         {q11}, [r7]!
        b               2b
.else @ \size == 4
        vst1.16         {d2},  [r0,:64]
        vst1.16         {d24}, [r6,:64]
.endif
3:
        @ Loop vertically
        add             r0,  r0,  r1
        add             r6,  r6,  r1
        add             r2,  r2,  r3
        add             r7,  r7,  r3
        subs            r4,  r4,  #2
        bne             1b
        pop             {r4-r7}
        bx              lr
endfunc
.endm

.macro do_8tap_h_size size
do_8tap_h put, \size
do_8tap_h avg, \size
.endm

do_8tap_h_size 4
do_8tap_h_size 8

.macro do_8tap_h_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
        push            {r4-r7}
        ldr             r4,  [sp, #16]
        ldr             r5,  [sp, #20]
        vmvn.u16        q3,  #((0xffff << \bpp) & 0xffff)
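        @ q3 = (1 << \bpp) - 1, i.e. the max pixel value, used for clamping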
        movrelx         r12, X(ff_vp9_subpel_filters), r6
        add             r12, r12, 256*\offset
        add             r12, r12, r5, lsl #4
        mov             r5,  #2*\size
.if \size >= 8
        b               \type\()_8tap_8h
.else
        b               \type\()_8tap_4h
.endif
endfunc
.endm

.macro do_8tap_h_filters size, bpp
do_8tap_h_func put, regular, 1, \size, \bpp
do_8tap_h_func avg, regular, 1, \size, \bpp
do_8tap_h_func put, sharp,   2, \size, \bpp
do_8tap_h_func avg, sharp,   2, \size, \bpp
do_8tap_h_func put, smooth,  0, \size, \bpp
do_8tap_h_func avg, smooth,  0, \size, \bpp
.endm

.macro do_8tap_h_filters_bpp bpp
do_8tap_h_filters 64, \bpp
do_8tap_h_filters 32, \bpp
do_8tap_h_filters 16, \bpp
do_8tap_h_filters 8,  \bpp
do_8tap_h_filters 4,  \bpp
.endm

do_8tap_h_filters_bpp 10
do_8tap_h_filters_bpp 12

.ltorg

@ Vertical filters

@ Round, shift and saturate and store qreg1-4
.macro do_store4 qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, minreg, type
        vqrshrun.s32    \dreg1,  \qreg1, #7
        vqrshrun.s32    \dreg2,  \qreg2, #7
        vqrshrun.s32    \dreg3,  \qreg3, #7
        vqrshrun.s32    \dreg4,  \qreg4, #7
.ifc \type,avg
        vld1.16         {\tmp1},  [r6,:64], r1
        vld1.16         {\tmp2},  [r6,:64], r1
        vld1.16         {\tmp3},  [r6,:64], r1
        vld1.16         {\tmp4},  [r6,:64], r1
.endif
        vmin.u16        \dreg1,  \dreg1,  \minreg
        vmin.u16        \dreg2,  \dreg2,  \minreg
        vmin.u16        \dreg3,  \dreg3,  \minreg
        vmin.u16        \dreg4,  \dreg4,  \minreg
.ifc \type,avg
        vrhadd.u16      \dreg1,  \dreg1,  \tmp1
        vrhadd.u16      \dreg2,  \dreg2,  \tmp2
        vrhadd.u16      \dreg3,  \dreg3,  \tmp3
        vrhadd.u16      \dreg4,  \dreg4,  \tmp4
.endif
        vst1.16         {\dreg1}, [r0,:64], r1
        vst1.16         {\dreg2}, [r0,:64], r1
        vst1.16         {\dreg3}, [r0,:64], r1
        vst1.16         {\dreg4}, [r0,:64], r1
.endm

@ Round, shift and saturate and store qreg1-4
@ qreg1-2 belong to one line and qreg3-4 to the second line.
@ dreg1-2 == qreg1, dreg3-4 == qreg2.
.macro do_store8 qreg1, qreg2, qreg3, qreg4, dreg1, dreg2, dreg3, dreg4, minreg, type
        vqrshrun.s32    \dreg1,  \qreg1, #7
        vqrshrun.s32    \dreg2,  \qreg2, #7
        vqrshrun.s32    \dreg3,  \qreg3, #7
        vqrshrun.s32    \dreg4,  \qreg4, #7
.ifc \type,avg
        vld1.16         {\qreg3},  [r6,:128], r1
        vld1.16         {\qreg4},  [r6,:128], r1
.endif
        vmin.u16        \qreg1,  \qreg1,  \minreg
        vmin.u16        \qreg2,  \qreg2,  \minreg
.ifc \type,avg
        vrhadd.u16      \qreg1,  \qreg1,  \qreg3
        vrhadd.u16      \qreg2,  \qreg2,  \qreg4
.endif
        vst1.16         {\qreg1}, [r0,:128], r1
        vst1.16         {\qreg2}, [r0,:128], r1
.endm

@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
@ (src1-src8 into dst1, src2-src9 into dst2).
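@ The taps are split across two accumulator chains (dst and tmp) that are
@ added at the end; each output is sum(src[n + i] * filter[i]) over i = 0..7.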
.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
        vmull.s16       \dst1, \src1, d0[0]
        vmull.s16       \dst2, \src2, d0[0]
        vmull.s16       \tmp1, \src2, d0[1]
        vmull.s16       \tmp2, \src3, d0[1]
        vmlal.s16       \dst1, \src3, d0[2]
        vmlal.s16       \dst2, \src4, d0[2]
        vmlal.s16       \tmp1, \src4, d0[3]
        vmlal.s16       \tmp2, \src5, d0[3]
        vmlal.s16       \dst1, \src5, d1[0]
        vmlal.s16       \dst2, \src6, d1[0]
        vmlal.s16       \tmp1, \src6, d1[1]
        vmlal.s16       \tmp2, \src7, d1[1]
        vmlal.s16       \dst1, \src7, d1[2]
        vmlal.s16       \dst2, \src8, d1[2]
        vmlal.s16       \tmp1, \src8, d1[3]
        vmlal.s16       \tmp2, \src9, d1[3]
        vadd.s32        \dst1, \dst1, \tmp1
        vadd.s32        \dst2, \dst2, \tmp2
.endm

@ Evaluate the filter twice in parallel. This does the same as convolve4 above,
@ but with double width (two input/output registers per row).
.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15, src16, src17, src18
        vmull.s16       \dst1, \src1,  d0[0]
        vmull.s16       \dst2, \src2,  d0[0]
        vmull.s16       \dst3, \src3,  d0[0]
        vmull.s16       \dst4, \src4,  d0[0]
        vmlal.s16       \dst1, \src3,  d0[1]
        vmlal.s16       \dst2, \src4,  d0[1]
        vmlal.s16       \dst3, \src5,  d0[1]
        vmlal.s16       \dst4, \src6,  d0[1]
        vmlal.s16       \dst1, \src5,  d0[2]
        vmlal.s16       \dst2, \src6,  d0[2]
        vmlal.s16       \dst3, \src7,  d0[2]
        vmlal.s16       \dst4, \src8,  d0[2]
        vmlal.s16       \dst1, \src7,  d0[3]
        vmlal.s16       \dst2, \src8,  d0[3]
        vmlal.s16       \dst3, \src9,  d0[3]
        vmlal.s16       \dst4, \src10, d0[3]
        vmlal.s16       \dst1, \src9,  d1[0]
        vmlal.s16       \dst2, \src10, d1[0]
        vmlal.s16       \dst3, \src11, d1[0]
        vmlal.s16       \dst4, \src12, d1[0]
        vmlal.s16       \dst1, \src11, d1[1]
        vmlal.s16       \dst2, \src12, d1[1]
        vmlal.s16       \dst3, \src13, d1[1]
        vmlal.s16       \dst4, \src14, d1[1]
        vmlal.s16       \dst1, \src13, d1[2]
        vmlal.s16       \dst2, \src14, d1[2]
        vmlal.s16       \dst3, \src15, d1[2]
        vmlal.s16       \dst4, \src16, d1[2]
        vmlal.s16       \dst1, \src15, d1[3]
        vmlal.s16       \dst2, \src16, d1[3]
        vmlal.s16       \dst3, \src17, d1[3]
        vmlal.s16       \dst4, \src18, d1[3]
.endm

@ Instantiate a vertical filter function for filtering 8 pixels at a time.
@ The height is passed in r4, the width in r5 and the filter coefficients
@ in r12.
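@ Seven input rows are preloaded into q5-q11; each pass of the inner loop
@ then loads four more rows and produces four output rows, rotating the
@ register window so that no row needs to be reloaded.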
.macro do_8tap_8v type
function \type\()_8tap_8v
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]
1:
.ifc \type,avg
        mov             r6,  r0
.endif
        mov             r12, r4

        vld1.16         {q5},  [r2], r3
        vld1.16         {q6},  [r2], r3
        vld1.16         {q7},  [r2], r3
        vld1.16         {q8},  [r2], r3
        vld1.16         {q9},  [r2], r3
        vld1.16         {q10}, [r2], r3
        vld1.16         {q11}, [r2], r3
2:
        vld1.16         {q12}, [r2], r3
        vld1.16         {q13}, [r2], r3
        vld1.16         {q14}, [r2], r3
        vld1.16         {q15}, [r2], r3
        convolve8       q2,  q3,  q4,  q5,  d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27
        do_store8       q2,  q3,  q4,  q5,  d4,  d5,  d6,  d7,  q1,  \type
        convolve8       q2,  q3,  q4,  q5,  d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
        do_store8       q2,  q3,  q4,  q5,  d4,  d5,  d6,  d7,  q1,  \type

        subs            r12, r12, #4
        beq             8f

        vld1.16         {q4},  [r2], r3
        vld1.16         {q5},  [r2], r3
        vld1.16         {q6},  [r2], r3
        vld1.16         {q7},  [r2], r3
        convolve8       q2,  q3,  q8,  q9,  d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8,  d9,  d10, d11
        do_store8       q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  q1,  \type
        convolve8       q2,  q3,  q8,  q9,  d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8,  d9,  d10, d11, d12, d13, d14, d15
        do_store8       q2,  q3,  q8,  q9,  d4,  d5,  d6,  d7,  q1,  \type

        subs            r12, r12, #4
        beq             8f

        vld1.16         {q8},  [r2], r3
        vld1.16         {q9},  [r2], r3
        vld1.16         {q10}, [r2], r3
        vld1.16         {q11}, [r2], r3
        convolve8       q2,  q3,  q12, q13, d26, d27, d28, d29, d30, d31, d8,  d9,  d10, d11, d12, d13, d14, d15, d16, d17, d18, d19
        do_store8       q2,  q3,  q12, q13, d4,  d5,  d6,  d7,  q1,  \type
        convolve8       q2,  q3,  q12, q13, d30, d31, d8,  d9,  d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23
        do_store8       q2,  q3,  q12, q13, d4,  d5,  d6,  d7,  q1,  \type

        subs            r12, r12, #4
        bne             2b

8:
        subs            r5,  r5,  #8
        beq             9f
        @ r0 -= h * dst_stride
        mls             r0,  r1,  r4, r0
        @ r2 -= h * src_stride
        mls             r2,  r3,  r4, r2
        @ r2 -= 8 * src_stride
        sub             r2,  r2,  r3, lsl #3
        @ r2 += 1 * src_stride
        add             r2,  r2,  r3
        add             r2,  r2,  #16
        add             r0,  r0,  #16
        b               1b
9:
        vpop            {q4-q7}
        pop             {r4-r6}
        bx              lr
endfunc
.endm

do_8tap_8v put
do_8tap_8v avg

@ Instantiate a vertical filter function for filtering a 4 pixel wide
@ slice. This is only designed to work for 4 or 8 output lines.
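@ With at most 8 output rows, all 8 + 7 required input rows fit in d16-d30
@ at once, so no sliding-window loop is needed.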
.macro do_8tap_4v type
function \type\()_8tap_4v
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]
.ifc \type,avg
        mov             r6,  r0
.endif

        vld1.16         {d16}, [r2], r3
        vld1.16         {d17}, [r2], r3
        vld1.16         {d18}, [r2], r3
        vld1.16         {d19}, [r2], r3
        vld1.16         {d20}, [r2], r3
        vld1.16         {d21}, [r2], r3
        vld1.16         {d22}, [r2], r3
        vld1.16         {d23}, [r2], r3
        vld1.16         {d24}, [r2], r3
        vld1.16         {d25}, [r2], r3
        vld1.16         {d26}, [r2], r3
        convolve4       q2,  q3,  d16, d17, d18, d19, d20, d21, d22, d23, d24, q14, q15
        convolve4       q14, q15, d18, d19, d20, d21, d22, d23, d24, d25, d26, q8,  q9
        do_store4       q2,  d4,  q3,  d6,  q14, d28, q15, d30, d5,  d7,  d29, d31, d2,  \type

        subs            r4,  r4,  #4
        beq             9f

        vld1.16         {d27}, [r2], r3
        vld1.16         {d28}, [r2], r3
        vld1.16         {d29}, [r2], r3
        vld1.16         {d30}, [r2], r3
        convolve4       q2,  q3,  d20, d21, d22, d23, d24, d25, d26, d27, d28, q8,  q9
        convolve4       q8,  q9,  d22, d23, d24, d25, d26, d27, d28, d29, d30, q10, q11
        do_store4       q2,  d4,  q3,  d6,  q8,  d16, q9,  d18, d5,  d7,  d17, d19, d2,  \type

9:
        pop             {r4-r6}
        bx              lr
endfunc
.endm

do_8tap_4v put
do_8tap_4v avg

.macro do_8tap_v_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
        push            {r4-r6}
        ldr             r4,  [sp, #12]
        ldr             r5,  [sp, #20]
.if \size >= 8
        vpush           {q4-q7}
.endif
        vmvn.u16        q1,  #((0xffff << \bpp) & 0xffff)
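        @ q1 = (1 << \bpp) - 1, i.e. the max pixel value, used for clamping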
        movrelx         r12, X(ff_vp9_subpel_filters), r6
        add             r12, r12, 256*\offset
        add             r12, r12, r5, lsl #4
        mov             r5,  #\size
.if \size >= 8
        b               \type\()_8tap_8v
.else
        b               \type\()_8tap_4v
.endif
endfunc
.endm

.macro do_8tap_v_filters size, bpp
do_8tap_v_func put, regular, 1, \size, \bpp
do_8tap_v_func avg, regular, 1, \size, \bpp
do_8tap_v_func put, sharp,   2, \size, \bpp
do_8tap_v_func avg, sharp,   2, \size, \bpp
do_8tap_v_func put, smooth,  0, \size, \bpp
do_8tap_v_func avg, smooth,  0, \size, \bpp
.endm

.macro do_8tap_v_filters_bpp bpp
do_8tap_v_filters 64, \bpp
do_8tap_v_filters 32, \bpp
do_8tap_v_filters 16, \bpp
do_8tap_v_filters 8,  \bpp
do_8tap_v_filters 4,  \bpp
.endm

do_8tap_v_filters_bpp 10
do_8tap_v_filters_bpp 12