/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ All public functions in this file have the following signature:
@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
@                              const uint8_t *ref, ptrdiff_t ref_stride,
@                              int h, int mx, int my);
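@ On entry, dst, dst_stride, ref and ref_stride are passed in r0-r3, while
@ h, mx and my live on the stack at [sp], [sp, #4] and [sp, #8] (before any
@ registers have been pushed).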

function ff_vp9_copy64_neon, export=1
        ldr             r12, [sp]
        sub             r1,  r1,  #32
        sub             r3,  r3,  #32
1:
        vld1.8          {q0,  q1},  [r2]!
        vst1.8          {q0,  q1},  [r0, :128]!
        vld1.8          {q2,  q3},  [r2], r3
        subs            r12, r12, #1
        vst1.8          {q2,  q3},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg64_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        sub             r1,  r1,  #32
        sub             r3,  r3,  #32
        mov             lr,  r0
1:
        vld1.8          {q8,  q9},  [r2]!
        vld1.8          {q0,  q1},  [r0, :128]!
        vld1.8          {q10, q11}, [r2], r3
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q2,  q3},  [r0, :128], r1
        vrhadd.u8       q1,  q1,  q9
        vrhadd.u8       q2,  q2,  q10
        vst1.8          {q0,  q1},  [lr, :128]!
        vrhadd.u8       q3,  q3,  q11
        vst1.8          {q2,  q3},  [lr, :128], r1
        subs            r12, r12, #1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_copy32_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q0,  q1},  [r2], r3
        subs            r12, r12, #1
        vst1.8          {q0,  q1},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg32_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q2,  q3},  [r2], r3
        vld1.8          {q0,  q1},  [r0, :128]
        vrhadd.u8       q0,  q0,  q2
        vrhadd.u8       q1,  q1,  q3
        subs            r12, r12, #1
        vst1.8          {q0,  q1},  [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_copy16_neon, export=1
        push            {r4,lr}
        ldr             r12, [sp, #8]
        add             r4,  r0,  r1
        add             lr,  r2,  r3
        add             r1,  r1,  r1
        add             r3,  r3,  r3
1:
        vld1.8          {q0},  [r2], r3
        vld1.8          {q1},  [lr], r3
        subs            r12, r12, #2
        vst1.8          {q0},  [r0, :128], r1
        vst1.8          {q1},  [r4, :128], r1
        bne             1b
        pop             {r4,pc}
endfunc

function ff_vp9_avg16_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        mov             lr,  r0
1:
        vld1.8          {q2},  [r2], r3
        vld1.8          {q0},  [r0, :128], r1
        vld1.8          {q3},  [r2], r3
        vrhadd.u8       q0,  q0,  q2
        vld1.8          {q1},  [r0, :128], r1
        vrhadd.u8       q1,  q1,  q3
        subs            r12, r12, #2
        vst1.8          {q0},  [lr, :128], r1
        vst1.8          {q1},  [lr, :128], r1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_copy8_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {d0},  [r2], r3
        vld1.8          {d1},  [r2], r3
        subs            r12, r12, #2
        vst1.8          {d0},  [r0, :64], r1
        vst1.8          {d1},  [r0, :64], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg8_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {d2},  [r2], r3
        vld1.8          {d0},  [r0, :64], r1
        vld1.8          {d3},  [r2], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d1},  [r0, :64]
        sub             r0,  r0,  r1
        vrhadd.u8       d1,  d1,  d3
        subs            r12, r12, #2
        vst1.8          {d0},  [r0, :64], r1
        vst1.8          {d1},  [r0, :64], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_copy4_neon, export=1
        ldr             r12, [sp]
1:
        vld1.32         {d0[]},   [r2], r3
        vld1.32         {d1[]},   [r2], r3
        vst1.32         {d0[0]},  [r0, :32], r1
        vld1.32         {d2[]},   [r2], r3
        vst1.32         {d1[0]},  [r0, :32], r1
        vld1.32         {d3[]},   [r2], r3
        subs            r12, r12, #4
        vst1.32         {d2[0]},  [r0, :32], r1
        vst1.32         {d3[0]},  [r0, :32], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg4_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        mov             lr,  r0
1:
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d0[]},   [r0, :32], r1
        vld1.32         {d5[]},   [r2], r3
        vrhadd.u8       d0,  d0,  d4
        vld1.32         {d1[]},   [r0, :32], r1
        vld1.32         {d6[]},   [r2], r3
        vrhadd.u8       d1,  d1,  d5
        vld1.32         {d2[]},   [r0, :32], r1
        vld1.32         {d7[]},   [r2], r3
        vrhadd.u8       d2,  d2,  d6
        vld1.32         {d3[]},   [r0, :32], r1
        subs            r12, r12, #4
        vst1.32         {d0[0]},  [lr, :32], r1
        vrhadd.u8       d3,  d3,  d7
        vst1.32         {d1[0]},  [lr, :32], r1
        vst1.32         {d2[0]},  [lr, :32], r1
        vst1.32         {d3[0]},  [lr, :32], r1
        bne             1b
        pop             {pc}
endfunc

@ Helper macros for vmul/vmla with a constant from either d0 or d1 depending on index
.macro vmul_lane dst, src, idx
.if \idx < 4
        vmul.s16        \dst, \src, d0[\idx]
.else
        vmul.s16        \dst, \src, d1[\idx - 4]
.endif
.endm
.macro vmla_lane dst, src, idx
.if \idx < 4
        vmla.s16        \dst, \src, d0[\idx]
.else
        vmla.s16        \dst, \src, d1[\idx - 4]
.endif
.endm
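@ For example, "vmla_lane q1, q14, 5" expands to "vmla.s16 q1, q14, d1[1]".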

@ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
@ for size >= 16), and multiply-accumulate into dst1 and dst3 (or
@ dst1-dst2 and dst3-dst4 for size >= 16)
.macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmla_lane       \dst1,  q14, \offset
        vext.8          q5,  \src2, \src3, #(2*\offset)
        vmla_lane       \dst3,  q15, \offset
        vext.8          q6,  \src5, \src6, #(2*\offset)
        vmla_lane       \dst2,  q5,  \offset
        vmla_lane       \dst4,  q6,  \offset
.elseif \size == 8
        vmla_lane       \dst1,  q14, \offset
        vmla_lane       \dst3,  q15, \offset
.else
        vmla_lane       \dst1d, d28, \offset
        vmla_lane       \dst3d, d30, \offset
.endif
.endm
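@ For example, with size == 8, "extmla q1, q2, q3, q4, d2, d6, q8, q9, q10,
@ q11, q12, q13, 2, 8" extracts the vectors starting two 16-bit elements into
@ q8-q9 and q11-q12 and accumulates them into q1 and q3 with coefficient d0[2].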
@ The same as above, but instead of accumulating straight into the
@ destination, use a temp register and accumulate with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmul_lane       q14, q14, \offset
        vext.8          q5,  \src2, \src3, #(2*\offset)
        vmul_lane       q15, q15, \offset
        vext.8          q6,  \src5, \src6, #(2*\offset)
        vmul_lane       q5,  q5,  \offset
        vmul_lane       q6,  q6,  \offset
.elseif \size == 8
        vmul_lane       q14, q14, \offset
        vmul_lane       q15, q15, \offset
.else
        vmul_lane       d28, d28, \offset
        vmul_lane       d30, d30, \offset
.endif
.if \size == 4
        vqadd.s16       \dst1d, \dst1d, d28
        vqadd.s16       \dst3d, \dst3d, d30
.else
        vqadd.s16       \dst1,  \dst1,  q14
        vqadd.s16       \dst3,  \dst3,  q15
.if \size >= 16
        vqadd.s16       \dst2,  \dst2,  q5
        vqadd.s16       \dst4,  \dst4,  q6
.endif
.endif
.endm


@ Instantiate a horizontal filter function for the given size.
@ This can work on 4, 8 or 16 pixels in parallel; for larger
@ widths it will do 16 pixels at a time and loop horizontally.
@ The actual width is passed in r5, the height in r4 and
@ the filter coefficients in r12. idx2 is the index of the largest
@ filter coefficient (3 or 4) and idx1 is the other one of them.
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        sub             r2,  r2,  #3
        add             r6,  r0,  r1
        add             r7,  r2,  r3
        add             r1,  r1,  r1
        add             r3,  r3,  r3
        @ Only size >= 16 loops horizontally and needs a
        @ reduced dst stride
.if \size >= 16
        sub             r1,  r1,  r5
.endif
        @ size >= 16 loads two qwords and increments r2;
        @ for size 4/8 a single qword with no postincrement
        @ is enough
.if \size >= 16
        sub             r3,  r3,  r5
        sub             r3,  r3,  #8
.endif
        @ Load the filter vector
        vld1.16         {q0},  [r12,:128]
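        @ The eight 16-bit coefficients are now in d0[0]-d0[3] and d1[0]-d1[3].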
1:
.if \size >= 16
        mov             r12, r5
.endif
        @ Load src
.if \size >= 16
        vld1.8          {d18, d19, d20}, [r2]!
        vld1.8          {d24, d25, d26}, [r7]!
.else
        vld1.8          {q9},  [r2]
        vld1.8          {q12}, [r7]
.endif
        vmovl.u8        q8,  d18
        vmovl.u8        q9,  d19
        vmovl.u8        q11, d24
        vmovl.u8        q12, d25
.if \size >= 16
        vmovl.u8        q10, d20
        vmovl.u8        q13, d26
.endif
2:

        @ Accumulate, adding idx2 last with a separate
        @ saturating add. The positive filter coefficients
        @ for all indices except idx2 must add up to less
        @ than 127 for this not to overflow.
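        @ Tap 0 is applied by the plain vmul below, taps 1, 2, idx1, 5, 6
        @ and 7 via extmla, and tap idx2 last via the saturating extmulqadd.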
        vmul.s16        q1,  q8,  d0[0]
        vmul.s16        q3,  q11, d0[0]
.if \size >= 16
        vmul.s16        q2,  q9,  d0[0]
        vmul.s16        q4,  q12, d0[0]
.endif
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 1,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 2,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, \idx1, \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 5,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 6,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 7,     \size
        extmulqadd      q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, \idx2, \size

        @ Round, shift and saturate
        vqrshrun.s16    d2,  q1,  #7
        vqrshrun.s16    d6,  q3,  #7
.if \size >= 16
        vqrshrun.s16    d3,  q2,  #7
        vqrshrun.s16    d7,  q4,  #7
.endif
        @ Average
.ifc \type,avg
.if \size >= 16
        vld1.8          {q14}, [r0,:128]
        vld1.8          {q15}, [r6,:128]
        vrhadd.u8       q1,  q1,  q14
        vrhadd.u8       q3,  q3,  q15
.elseif \size == 8
        vld1.8          {d28}, [r0,:64]
        vld1.8          {d30}, [r6,:64]
        vrhadd.u8       d2,  d2,  d28
        vrhadd.u8       d6,  d6,  d30
.else
        @ We only need d28[0], but [] is faster on some cores
        vld1.32         {d28[]}, [r0,:32]
        vld1.32         {d30[]}, [r6,:32]
        vrhadd.u8       d2,  d2,  d28
        vrhadd.u8       d6,  d6,  d30
.endif
.endif
        @ Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs            r12, r12, #16
        vst1.8          {q1}, [r0,:128]!
        vst1.8          {q3}, [r6,:128]!
        beq             3f
        vmov            q8,  q10
        vmov            q11, q13
        vld1.8          {q10}, [r2]!
        vld1.8          {q13}, [r7]!
        vmovl.u8        q9,  d20
        vmovl.u8        q10, d21
        vmovl.u8        q12, d26
        vmovl.u8        q13, d27
        b               2b
.elseif \size == 8
        vst1.8          {d2}, [r0,:64]
        vst1.8          {d6}, [r6,:64]
.else @ \size == 4
        vst1.32         {d2[0]}, [r0,:32]
        vst1.32         {d6[0]}, [r6,:32]
.endif
3:
        @ Loop vertically
        add             r0,  r0,  r1
        add             r6,  r6,  r1
        add             r2,  r2,  r3
        add             r7,  r7,  r3
        subs            r4,  r4,  #2
        bne             1b
.if \size >= 16
        vpop            {q4-q6}
.endif
        pop             {r4-r7}
        bx              lr
endfunc
.endm

.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        push            {r4-r7}
.if \size >= 16
        vpush           {q4-q6}
        ldr             r4,  [sp, #64]
        ldr             r5,  [sp, #68]
.else
        ldr             r4,  [sp, #16]
        ldr             r5,  [sp, #20]
.endif
        movrelx         r12, X(ff_vp9_subpel_filters), r6
        add             r12, r12, 256*\offset
        cmp             r5,  #8
        add             r12, r12, r5, lsl #4
        mov             r5,  #\size
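        @ The comparison of mx against 8 above selects the variant whose
        @ largest filter coefficient sits at index 4 (_34) or at index 3 (_43).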
.if \size >= 16
        bge             \type\()_8tap_16h_34
        b               \type\()_8tap_16h_43
.else
        bge             \type\()_8tap_\size\()h_34
        b               \type\()_8tap_\size\()h_43
.endif
endfunc
.endm

.macro do_8tap_h_filters size
do_8tap_h_func put, regular, 1, \size
do_8tap_h_func avg, regular, 1, \size
do_8tap_h_func put, sharp,   2, \size
do_8tap_h_func avg, sharp,   2, \size
do_8tap_h_func put, smooth,  0, \size
do_8tap_h_func avg, smooth,  0, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4

.ltorg

@ Vertical filters

@ Round, shift and saturate and store qreg1-2 over 4 lines
.macro do_store4 qreg1, dreg1, qreg2, dreg2, tmp1, tmp2, type
        vqrshrun.s16    \dreg1,  \qreg1, #7
        vqrshrun.s16    \dreg2,  \qreg2, #7
.ifc \type,avg
        vld1.32         {\tmp1[]},   [r0,:32], r1
        vld1.32         {\tmp2[]},   [r0,:32], r1
        vld1.32         {\tmp1[1]},  [r0,:32], r1
        vld1.32         {\tmp2[1]},  [r0,:32], r1
        vrhadd.u8       \dreg1,  \dreg1,  \tmp1
        vrhadd.u8       \dreg2,  \dreg2,  \tmp2
        sub             r0,  r0,  r1, lsl #2
.endif
        vst1.32         {\dreg1[0]}, [r0,:32], r1
        vst1.32         {\dreg2[0]}, [r0,:32], r1
        vst1.32         {\dreg1[1]}, [r0,:32], r1
        vst1.32         {\dreg2[1]}, [r0,:32], r1
.endm

@ Round, shift and saturate and store qreg1-4
.macro do_store qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, type
        vqrshrun.s16    \dreg1,  \qreg1, #7
        vqrshrun.s16    \dreg2,  \qreg2, #7
        vqrshrun.s16    \dreg3,  \qreg3, #7
        vqrshrun.s16    \dreg4,  \qreg4, #7
.ifc \type,avg
        vld1.8          {\tmp1},  [r0,:64], r1
        vld1.8          {\tmp2},  [r0,:64], r1
        vld1.8          {\tmp3},  [r0,:64], r1
        vld1.8          {\tmp4},  [r0,:64], r1
        vrhadd.u8       \dreg1,  \dreg1,  \tmp1
        vrhadd.u8       \dreg2,  \dreg2,  \tmp2
        vrhadd.u8       \dreg3,  \dreg3,  \tmp3
        vrhadd.u8       \dreg4,  \dreg4,  \tmp4
        sub             r0,  r0,  r1, lsl #2
.endif
        vst1.8          {\dreg1}, [r0,:64], r1
        vst1.8          {\dreg2}, [r0,:64], r1
        vst1.8          {\dreg3}, [r0,:64], r1
        vst1.8          {\dreg4}, [r0,:64], r1
.endm

@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
@ (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
@ at the end with saturation. Indices 0 and 7 always have negative or zero
@ coefficients, so they can be accumulated into tmp1-tmp2 together with the
@ largest coefficient.
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        vmul.s16        \dst1, \src2, d0[1]
        vmul.s16        \dst2, \src3, d0[1]
        vmul.s16        \tmp1, \src1, d0[0]
        vmul.s16        \tmp2, \src2, d0[0]
        vmla.s16        \dst1, \src3, d0[2]
        vmla.s16        \dst2, \src4, d0[2]
.if \idx1 == 3
        vmla.s16        \dst1, \src4, d0[3]
        vmla.s16        \dst2, \src5, d0[3]
.else
        vmla.s16        \dst1, \src5, d1[0]
        vmla.s16        \dst2, \src6, d1[0]
.endif
        vmla.s16        \dst1, \src6, d1[1]
        vmla.s16        \dst2, \src7, d1[1]
        vmla.s16        \tmp1, \src8, d1[3]
        vmla.s16        \tmp2, \src9, d1[3]
        vmla.s16        \dst1, \src7, d1[2]
        vmla.s16        \dst2, \src8, d1[2]
.if \idx2 == 3
        vmla.s16        \tmp1, \src4, d0[3]
        vmla.s16        \tmp2, \src5, d0[3]
.else
        vmla.s16        \tmp1, \src5, d1[0]
        vmla.s16        \tmp2, \src6, d1[0]
.endif
        vqadd.s16       \dst1, \dst1, \tmp1
        vqadd.s16       \dst2, \dst2, \tmp2
.endm

@ Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
.ifnb \dst4
        vld1.8          {d5}, [r2], r3
.endif
        vmovl.u8        \dst1, d2
        vmovl.u8        \dst2, d3
        vmovl.u8        \dst3, d4
.ifnb \dst4
        vmovl.u8        \dst4, d5
.endif
.endm

@ Instantiate a vertical filter function for filtering 8 pixels at a time.
@ The height is passed in r4, the width in r5 and the filter coefficients
@ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
@ and idx1 is the other one of them.
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]
1:
        mov             r12, r4

        loadl           q5,  q6,  q7
        loadl           q8,  q9,  q10, q11
2:
        loadl           q12, q13, q14, q15
        convolve        q1,  q2,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12, q13, \idx1, \idx2, q4,  q5
        convolve        q3,  q4,  q7,  q8,  q9,  q10, q11, q12, q13, q14, q15, \idx1, \idx2, q5,  q6
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q4,  d8,  d3,  d5,  d7,  d9,  \type

        subs            r12, r12, #4
        beq             8f

        loadl           q4,  q5,  q6,  q7
        convolve        q1,  q2,  q9,  q10, q11, q12, q13, q14, q15, q4,  q5,  \idx1, \idx2, q8,  q9
        convolve        q3,  q8,  q11, q12, q13, q14, q15, q4,  q5,  q6,  q7,  \idx1, \idx2, q9,  q10
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q8,  d16, d3,  d5,  d7,  d17, \type

        subs            r12, r12, #4
        beq             8f

        loadl           q8,  q9,  q10, q11
        convolve        q1,  q2,  q13, q14, q15, q4,  q5,  q6,  q7,  q8,  q9,  \idx1, \idx2, q12, q13
        convolve        q3,  q12, q15, q4,  q5,  q6,  q7,  q8,  q9,  q10, q11, \idx1, \idx2, q13, q14
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q12, d24, d3,  d5,  d7,  d25, \type

        subs            r12, r12, #4
        bne             2b

8:
        subs            r5,  r5,  #8
        beq             9f
        @ r0 -= h * dst_stride
        mls             r0,  r1,  r4, r0
        @ r2 -= h * src_stride
        mls             r2,  r3,  r4, r2
        @ r2 -= 8 * src_stride
        sub             r2,  r2,  r3, lsl #3
        @ r2 += 1 * src_stride
        add             r2,  r2,  r3
        add             r2,  r2,  #8
        add             r0,  r0,  #8
        b               1b
9:
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3

@ Instantiate a vertical filter function for filtering a 4 pixel wide
@ slice. The first half of each register contains one row, while the second
@ half contains the second-next row (which is also stored in the first half
@ of the register two steps ahead). The convolution produces two outputs
@ at a time; q5-q12 go into one output and q6-q13 into the other.
@ The first half of the first output is the first output row and the first
@ half of the second output is the second output row; the second halves of
@ the outputs are rows 3 and 4.
@ This is only designed to work for 4 or 8 output lines.
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]

        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vext.8          d2,  d2,  d4,  #4
        vld1.32         {d8[]},   [r2], r3
        vext.8          d3,  d3,  d5,  #4
        vld1.32         {d9[]},   [r2], r3
        vmovl.u8        q5,  d2
        vext.8          d4,  d4,  d6,  #4
        vld1.32         {d28[]},  [r2], r3
        vmovl.u8        q6,  d3
        vext.8          d5,  d5,  d7,  #4
        vld1.32         {d29[]},  [r2], r3
        vmovl.u8        q7,  d4
        vext.8          d6,  d6,  d8,  #4
        vld1.32         {d30[]},  [r2], r3
        vmovl.u8        q8,  d5
        vext.8          d7,  d7,  d9,  #4
        vmovl.u8        q9,  d6
        vext.8          d8,  d8,  d28, #4
        vmovl.u8        q10, d7
        vext.8          d9,  d9,  d29, #4
        vmovl.u8        q11, d8
        vext.8          d28, d28, d30, #4
        vmovl.u8        q12, d9
        vmovl.u8        q13, d28

        convolve        q1,  q2,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12, q13, \idx1, \idx2, q4, q3
        do_store4       q1,  d2,  q2,  d4,  d3,  d5,  \type
        subs            r4,  r4,  #4
        beq             9f

        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vext.8          d29, d29, d2,  #4
        vext.8          d30, d30, d3,  #4
        vld1.32         {d2[1]},  [r2], r3
        vmovl.u8        q14, d29
        vld1.32         {d3[1]},  [r2], r3
        vmovl.u8        q15, d30
        vmovl.u8        q5,  d2
        vmovl.u8        q6,  d3

        convolve        q1,  q2,  q9,  q10, q11, q12, q13, q14, q15, q5,  q6,  \idx1, \idx2, q4, q3
        do_store4       q1,  d2,  q2,  d4,  d3,  d5,  \type

9:
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3

.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        push            {r4-r5}
        vpush           {q4-q7}
        ldr             r4,  [sp, #72]
        movrelx         r12, X(ff_vp9_subpel_filters), r5
        ldr             r5,  [sp, #80]
        add             r12, r12, 256*\offset
        add             r12, r12, r5, lsl #4
        cmp             r5,  #8
        mov             r5,  #\size
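        @ As in the horizontal case, my >= 8 selects the _34 variant
        @ (largest coefficient at index 4) and my < 8 the _43 one.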
.if \size >= 8
        bge             \type\()_8tap_8v_34
        b               \type\()_8tap_8v_43
.else
        bge             \type\()_8tap_4v_34
        b               \type\()_8tap_4v_43
.endif
endfunc
.endm

.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp,   2, \size
do_8tap_v_func avg, sharp,   2, \size
do_8tap_v_func put, smooth,  0, \size
do_8tap_v_func avg, smooth,  0, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4