/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
//                            const uint8_t *ref, ptrdiff_t ref_stride,
//                            int h, int mx, int my);
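//
// As a rough scalar reference for the averaging helpers below (an
// illustrative sketch, not the project's C code; the samples are 16 bit
// even though the prototype uses uint8_t pointers, and the strides are
// in bytes):
//
//   for (y = 0; y < h; y++, dst16 += dst_stride / 2, ref16 += ref_stride / 2)
//       for (x = 0; x < w; x++)
//           dst16[x] = (dst16[x] + ref16[x] + 1) >> 1;  // rounding average, as urhadd does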

function ff_vp9_avg64_16_neon, export=1
        mov             x5,  x0
        sub             x1,  x1,  #64
        sub             x3,  x3,  #64
1:
        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], #64
        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
        urhadd          v0.8h,  v0.8h,  v4.8h
        urhadd          v1.8h,  v1.8h,  v5.8h
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
        urhadd          v2.8h,  v2.8h,  v6.8h
        urhadd          v3.8h,  v3.8h,  v7.8h
        subs            w4,  w4,  #1
        urhadd          v16.8h, v16.8h, v20.8h
        urhadd          v17.8h, v17.8h, v21.8h
        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x5], #64
        urhadd          v18.8h, v18.8h, v22.8h
        urhadd          v19.8h, v19.8h, v23.8h
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg32_16_neon, export=1
        mov             x5,  x0
1:
        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], x3
        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], x1
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
        urhadd          v0.8h,  v0.8h,  v4.8h
        urhadd          v1.8h,  v1.8h,  v5.8h
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
        urhadd          v2.8h,  v2.8h,  v6.8h
        urhadd          v3.8h,  v3.8h,  v7.8h
        subs            w4,  w4,  #2
        urhadd          v16.8h, v16.8h, v20.8h
        urhadd          v17.8h, v17.8h, v21.8h
        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x5], x1
        urhadd          v18.8h, v18.8h, v22.8h
        urhadd          v19.8h, v19.8h, v23.8h
        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg16_16_neon, export=1
1:
        ld1             {v2.8h, v3.8h},  [x2], x3
        ld1             {v0.8h, v1.8h},  [x0]
        urhadd          v0.8h,  v0.8h,  v2.8h
        urhadd          v1.8h,  v1.8h,  v3.8h
        subs            w4,  w4,  #1
        st1             {v0.8h, v1.8h},  [x0], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg8_16_neon, export=1
        mov             x5,  x0
1:
        ld1             {v2.8h},  [x2], x3
        ld1             {v0.8h},  [x0], x1
        ld1             {v3.8h},  [x2], x3
        urhadd          v0.8h,  v0.8h,  v2.8h
        ld1             {v1.8h},  [x0], x1
        urhadd          v1.8h,  v1.8h,  v3.8h
        subs            w4,  w4,  #2
        st1             {v0.8h},  [x5], x1
        st1             {v1.8h},  [x5], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg4_16_neon, export=1
        mov             x5,  x0
1:
        ld1             {v2.4h},  [x2], x3
        ld1             {v0.4h},  [x0], x1
        ld1             {v3.4h},  [x2], x3
        urhadd          v0.4h,  v0.4h,  v2.4h
        ld1             {v1.4h},  [x0], x1
        urhadd          v1.4h,  v1.4h,  v3.4h
        subs            w4,  w4,  #2
        st1             {v0.4h},  [x5], x1
        st1             {v1.4h},  [x5], x1
        b.ne            1b
        ret
endfunc


// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst5 (or
// dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8
// for size >= 16)
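//
// In scalar terms (an illustrative sketch only), one extmlal invocation
// adds a single filter tap to the running 32 bit sums, roughly:
//
//   for (i = 0; i < size; i++)
//       sum[i] += src[i + offset] * filter[offset];
//
// where filter[] is the 8-tap coefficient vector held in v0 and the ext
// instructions provide the src[i + offset] window.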
.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
        ext             v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext             v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
        smlal           \dst1\().4s, v20.4h, v0.h[\offset]
        smlal           \dst5\().4s, v22.4h, v0.h[\offset]
.if \size >= 16
        ext             v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
.endif
.if \size >= 8
        smlal2          \dst2\().4s, v20.8h, v0.h[\offset]
        smlal2          \dst6\().4s, v22.8h, v0.h[\offset]
.endif
.if \size >= 16
        smlal           \dst3\().4s, v21.4h, v0.h[\offset]
        smlal           \dst7\().4s, v23.4h, v0.h[\offset]
        smlal2          \dst4\().4s, v21.8h, v0.h[\offset]
        smlal2          \dst8\().4s, v23.8h, v0.h[\offset]
.endif
.endm


// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width (in bytes) is passed in x5, the height in w4 and
// the filter coefficients in x9.
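//
// A rough scalar model of one output pixel of this horizontal pass
// (illustrative sketch, not the project's C implementation), with the
// 8-tap filter f[0..7] and src already pointing 3 pixels to the left:
//
//   sum = 0;
//   for (k = 0; k < 8; k++)
//       sum += src[x + k] * f[k];
//   sum = (sum + 64) >> 7;                        // sqrshrun #7
//   dst[x] = sum < 0 ? 0 : sum > max ? max : sum; // sqrshrun + umin, max = (1 << bpp) - 1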
.macro do_8tap_h type, size
function \type\()_8tap_\size\()h
        sub             x2,  x2,  #6
        add             x6,  x0,  x1
        add             x7,  x2,  x3
        add             x1,  x1,  x1
        add             x3,  x3,  x3
        // Only size >= 16 loops horizontally and needs
        // reduced dst stride
.if \size >= 16
        sub             x1,  x1,  x5
.endif
        // size >= 16 loads three qwords and increments x2,
        // for size 4/8 it's enough with two qwords and no
        // postincrement
.if \size >= 16
        sub             x3,  x3,  x5
        sub             x3,  x3,  #16
.endif
        // Load the filter vector
        ld1             {v0.8h},  [x9]
1:
.if \size >= 16
        mov             x9,  x5
.endif
        // Load src
.if \size >= 16
        ld1             {v5.8h,  v6.8h,  v7.8h},  [x2], #48
        ld1             {v16.8h, v17.8h, v18.8h}, [x7], #48
.else
        ld1             {v5.8h,  v6.8h},  [x2]
        ld1             {v16.8h, v17.8h}, [x7]
.endif
2:

        smull           v1.4s,  v5.4h,  v0.h[0]
        smull           v24.4s, v16.4h, v0.h[0]
.if \size >= 8
        smull2          v2.4s,  v5.8h,  v0.h[0]
        smull2          v25.4s, v16.8h, v0.h[0]
.endif
.if \size >= 16
        smull           v3.4s,  v6.4h,  v0.h[0]
        smull           v26.4s, v17.4h, v0.h[0]
        smull2          v4.4s,  v6.8h,  v0.h[0]
        smull2          v27.4s, v17.8h, v0.h[0]
.endif
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 1, \size
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 2, \size
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 3, \size
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 4, \size
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 5, \size
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 6, \size
        extmlal         v1,  v2,  v3,  v4,  v24, v25, v26, v27, v5,  v6,  v7,  v16, v17, v18, 7, \size

        // Round, shift and saturate
        // The sqrshrun takes care of clamping negative values to zero, but
        // we still need to manually clamp to the max pixel value with umin.
        sqrshrun        v1.4h,  v1.4s,  #7
        sqrshrun        v24.4h, v24.4s, #7
.if \size >= 8
        sqrshrun2       v1.8h,  v2.4s,  #7
        sqrshrun2       v24.8h, v25.4s, #7
        umin            v1.8h,  v1.8h,  v31.8h
        umin            v24.8h, v24.8h, v31.8h
.if \size >= 16
        sqrshrun        v2.4h,  v3.4s,  #7
        sqrshrun        v25.4h, v26.4s, #7
        sqrshrun2       v2.8h,  v4.4s,  #7
        sqrshrun2       v25.8h, v27.4s, #7
        umin            v2.8h,  v2.8h,  v31.8h
        umin            v25.8h, v25.8h, v31.8h
.endif
.else
        umin            v1.4h,  v1.4h,  v31.4h
        umin            v24.4h, v24.4h, v31.4h
.endif
        // Average
.ifc \type,avg
.if \size >= 16
        ld1             {v3.8h,  v4.8h},  [x0]
        ld1             {v29.8h, v30.8h}, [x6]
        urhadd          v1.8h,  v1.8h,  v3.8h
        urhadd          v2.8h,  v2.8h,  v4.8h
        urhadd          v24.8h, v24.8h, v29.8h
        urhadd          v25.8h, v25.8h, v30.8h
.elseif \size >= 8
        ld1             {v3.8h},  [x0]
        ld1             {v4.8h},  [x6]
        urhadd          v1.8h,  v1.8h,  v3.8h
        urhadd          v24.8h, v24.8h, v4.8h
.else
        ld1             {v3.4h},  [x0]
        ld1             {v4.4h},  [x6]
        urhadd          v1.4h,  v1.4h,  v3.4h
        urhadd          v24.4h, v24.4h, v4.4h
.endif
.endif
        // Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs            x9,  x9,  #32
        st1             {v1.8h,  v2.8h},  [x0], #32
        st1             {v24.8h, v25.8h}, [x6], #32
        b.eq            3f
        mov             v5.16b,  v7.16b
        mov             v16.16b, v18.16b
        ld1             {v6.8h,  v7.8h},  [x2], #32
        ld1             {v17.8h, v18.8h}, [x7], #32
        b               2b
.elseif \size == 8
        st1             {v1.8h},  [x0]
        st1             {v24.8h}, [x6]
.else // \size == 4
        st1             {v1.4h},  [x0]
        st1             {v24.4h}, [x6]
.endif
3:
        // Loop vertically
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x2,  x2,  x3
        add             x7,  x7,  x3
        subs            w4,  w4,  #2
        b.ne            1b
        ret
endfunc
.endm

.macro do_8tap_h_size size
do_8tap_h put, \size
do_8tap_h avg, \size
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

.macro do_8tap_h_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
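        // Build the per-lane max pixel value (1 << bpp) - 1 in v31:
        // 0x03ff for 10 bpp, 0x0fff for 12 bpp.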
        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
        movrel          x6,  X(ff_vp9_subpel_filters), 256*\offset
        cmp             w5,  #8
        add             x9,  x6,  w5, uxtw #4
        mov             x5,  #2*\size
.if \size >= 16
        b               \type\()_8tap_16h
.else
        b               \type\()_8tap_\size\()h
.endif
endfunc
.endm

.macro do_8tap_h_filters size, bpp
do_8tap_h_func put, regular, 1, \size, \bpp
do_8tap_h_func avg, regular, 1, \size, \bpp
do_8tap_h_func put, sharp,   2, \size, \bpp
do_8tap_h_func avg, sharp,   2, \size, \bpp
do_8tap_h_func put, smooth,  0, \size, \bpp
do_8tap_h_func avg, smooth,  0, \size, \bpp
.endm

.macro do_8tap_h_filters_bpp bpp
do_8tap_h_filters 64, \bpp
do_8tap_h_filters 32, \bpp
do_8tap_h_filters 16, \bpp
do_8tap_h_filters 8,  \bpp
do_8tap_h_filters 4,  \bpp
.endm

do_8tap_h_filters_bpp 10
do_8tap_h_filters_bpp 12
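
// The instantiations above expand to the exported symbols
// ff_vp9_{put,avg}_{regular,sharp,smooth}{4,8,16,32,64}_h_{10,12}_neon,
// e.g. ff_vp9_put_regular16_h_10_neon.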


// Vertical filters

// Round, shift and saturate and store reg1-reg4
.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
        sqrshrun        \reg1\().4h,  \reg1\().4s, #7
        sqrshrun        \reg2\().4h,  \reg2\().4s, #7
        sqrshrun        \reg3\().4h,  \reg3\().4s, #7
        sqrshrun        \reg4\().4h,  \reg4\().4s, #7
.ifc \type,avg
        ld1             {\tmp1\().4h},  [x7], x1
        ld1             {\tmp2\().4h},  [x7], x1
        ld1             {\tmp3\().4h},  [x7], x1
        ld1             {\tmp4\().4h},  [x7], x1
.endif
        umin            \reg1\().4h,  \reg1\().4h,  \minreg\().4h
        umin            \reg2\().4h,  \reg2\().4h,  \minreg\().4h
        umin            \reg3\().4h,  \reg3\().4h,  \minreg\().4h
        umin            \reg4\().4h,  \reg4\().4h,  \minreg\().4h
.ifc \type,avg
        urhadd          \reg1\().4h,  \reg1\().4h,  \tmp1\().4h
        urhadd          \reg2\().4h,  \reg2\().4h,  \tmp2\().4h
        urhadd          \reg3\().4h,  \reg3\().4h,  \tmp3\().4h
        urhadd          \reg4\().4h,  \reg4\().4h,  \tmp4\().4h
.endif
        st1             {\reg1\().4h},  [x0], x1
        st1             {\reg2\().4h},  [x0], x1
        st1             {\reg3\().4h},  [x0], x1
        st1             {\reg4\().4h},  [x0], x1
.endm

// Round, shift and saturate and store reg1-8, where
// reg1-2, reg3-4 etc pairwise correspond to 4 rows.
.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
        sqrshrun        \reg1\().4h,  \reg1\().4s, #7
        sqrshrun2       \reg1\().8h,  \reg2\().4s, #7
        sqrshrun        \reg2\().4h,  \reg3\().4s, #7
        sqrshrun2       \reg2\().8h,  \reg4\().4s, #7
        sqrshrun        \reg3\().4h,  \reg5\().4s, #7
        sqrshrun2       \reg3\().8h,  \reg6\().4s, #7
        sqrshrun        \reg4\().4h,  \reg7\().4s, #7
        sqrshrun2       \reg4\().8h,  \reg8\().4s, #7
.ifc \type,avg
        ld1             {\reg5\().8h},  [x7], x1
        ld1             {\reg6\().8h},  [x7], x1
        ld1             {\reg7\().8h},  [x7], x1
        ld1             {\reg8\().8h},  [x7], x1
.endif
        umin            \reg1\().8h,  \reg1\().8h,  \minreg\().8h
        umin            \reg2\().8h,  \reg2\().8h,  \minreg\().8h
        umin            \reg3\().8h,  \reg3\().8h,  \minreg\().8h
        umin            \reg4\().8h,  \reg4\().8h,  \minreg\().8h
.ifc \type,avg
        urhadd          \reg1\().8h,  \reg1\().8h,  \reg5\().8h
        urhadd          \reg2\().8h,  \reg2\().8h,  \reg6\().8h
        urhadd          \reg3\().8h,  \reg3\().8h,  \reg7\().8h
        urhadd          \reg4\().8h,  \reg4\().8h,  \reg8\().8h
.endif
        st1             {\reg1\().8h},  [x0], x1
        st1             {\reg2\().8h},  [x0], x1
        st1             {\reg3\().8h},  [x0], x1
        st1             {\reg4\().8h},  [x0], x1
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2).
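//
// Scalar sketch of one convolve4 invocation (illustrative only), with the
// 8-tap filter f[0..7] held in v0:
//
//   dst1 = src1*f[0] + src2*f[1] + ... + src8*f[7];
//   dst2 = src2*f[0] + src3*f[1] + ... + src9*f[7];
//
// Each sum is split across a dst and a tmp accumulator and added at the
// end, presumably to shorten the smlal dependency chains.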
.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
        smull           \dst1\().4s, \src1\().4h, v0.h[0]
        smull           \dst2\().4s, \src2\().4h, v0.h[0]
        smull           \tmp1\().4s, \src2\().4h, v0.h[1]
        smull           \tmp2\().4s, \src3\().4h, v0.h[1]
        smlal           \dst1\().4s, \src3\().4h, v0.h[2]
        smlal           \dst2\().4s, \src4\().4h, v0.h[2]
        smlal           \tmp1\().4s, \src4\().4h, v0.h[3]
        smlal           \tmp2\().4s, \src5\().4h, v0.h[3]
        smlal           \dst1\().4s, \src5\().4h, v0.h[4]
        smlal           \dst2\().4s, \src6\().4h, v0.h[4]
        smlal           \tmp1\().4s, \src6\().4h, v0.h[5]
        smlal           \tmp2\().4s, \src7\().4h, v0.h[5]
        smlal           \dst1\().4s, \src7\().4h, v0.h[6]
        smlal           \dst2\().4s, \src8\().4h, v0.h[6]
        smlal           \tmp1\().4s, \src8\().4h, v0.h[7]
        smlal           \tmp2\().4s, \src9\().4h, v0.h[7]
        add             \dst1\().4s, \dst1\().4s, \tmp1\().4s
        add             \dst2\().4s, \dst2\().4s, \tmp2\().4s
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
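//
// convolve8 is the 8-pixel-wide variant of the same computation: each source
// register holds 8 pixels, so the low and high halves are accumulated with
// smlal/smlal2 into separate 32 bit accumulators (dst1/dst2 for src1-src8,
// dst3/dst4 for src2-src9).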
.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
        smull           \dst1\().4s, \src1\().4h, v0.h[0]
        smull2          \dst2\().4s, \src1\().8h, v0.h[0]
        smull           \dst3\().4s, \src2\().4h, v0.h[0]
        smull2          \dst4\().4s, \src2\().8h, v0.h[0]
        smlal           \dst1\().4s, \src2\().4h, v0.h[1]
        smlal2          \dst2\().4s, \src2\().8h, v0.h[1]
        smlal           \dst3\().4s, \src3\().4h, v0.h[1]
        smlal2          \dst4\().4s, \src3\().8h, v0.h[1]
        smlal           \dst1\().4s, \src3\().4h, v0.h[2]
        smlal2          \dst2\().4s, \src3\().8h, v0.h[2]
        smlal           \dst3\().4s, \src4\().4h, v0.h[2]
        smlal2          \dst4\().4s, \src4\().8h, v0.h[2]
        smlal           \dst1\().4s, \src4\().4h, v0.h[3]
        smlal2          \dst2\().4s, \src4\().8h, v0.h[3]
        smlal           \dst3\().4s, \src5\().4h, v0.h[3]
        smlal2          \dst4\().4s, \src5\().8h, v0.h[3]
        smlal           \dst1\().4s, \src5\().4h, v0.h[4]
        smlal2          \dst2\().4s, \src5\().8h, v0.h[4]
        smlal           \dst3\().4s, \src6\().4h, v0.h[4]
        smlal2          \dst4\().4s, \src6\().8h, v0.h[4]
        smlal           \dst1\().4s, \src6\().4h, v0.h[5]
        smlal2          \dst2\().4s, \src6\().8h, v0.h[5]
        smlal           \dst3\().4s, \src7\().4h, v0.h[5]
        smlal2          \dst4\().4s, \src7\().8h, v0.h[5]
        smlal           \dst1\().4s, \src7\().4h, v0.h[6]
        smlal2          \dst2\().4s, \src7\().8h, v0.h[6]
        smlal           \dst3\().4s, \src8\().4h, v0.h[6]
        smlal2          \dst4\().4s, \src8\().8h, v0.h[6]
        smlal           \dst1\().4s, \src8\().4h, v0.h[7]
        smlal2          \dst2\().4s, \src8\().8h, v0.h[7]
        smlal           \dst3\().4s, \src9\().4h, v0.h[7]
        smlal2          \dst4\().4s, \src9\().8h, v0.h[7]
.endm

// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6.
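//
// Rough scalar model of the vertical pass for one column (illustrative
// sketch only, strides counted in pixels here), with the source pointer
// already stepped back 3 rows as done at the top of the function:
//
//   for (y = 0; y < h; y++) {
//       sum = 0;
//       for (k = 0; k < 8; k++)
//           sum += src[(y + k) * src_stride + x] * f[k];
//       sum = (sum + 64) >> 7;
//       dst[y * dst_stride + x] = sum < 0 ? 0 : sum > max ? max : sum;
//   }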
.macro do_8tap_8v type
function \type\()_8tap_8v
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        ld1             {v0.8h},  [x6]
1:
.ifc \type,avg
        mov             x7,  x0
.endif
        mov             x6,  x4

        ld1             {v17.8h}, [x2], x3
        ld1             {v18.8h}, [x2], x3
        ld1             {v19.8h}, [x2], x3
        ld1             {v20.8h}, [x2], x3
        ld1             {v21.8h}, [x2], x3
        ld1             {v22.8h}, [x2], x3
        ld1             {v23.8h}, [x2], x3
2:
        ld1             {v24.8h}, [x2], x3
        ld1             {v25.8h}, [x2], x3
        ld1             {v26.8h}, [x2], x3
        ld1             {v27.8h}, [x2], x3

        convolve8       v2,  v3,  v4,  v5,  v17, v18, v19, v20, v21, v22, v23, v24, v25
        convolve8       v6,  v7,  v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v30, v31, v1,  \type

        subs            x6,  x6,  #4
        b.eq            8f

        ld1             {v16.8h}, [x2], x3
        ld1             {v17.8h}, [x2], x3
        ld1             {v18.8h}, [x2], x3
        ld1             {v19.8h}, [x2], x3
        convolve8       v2,  v3,  v4,  v5,  v21, v22, v23, v24, v25, v26, v27, v16, v17
        convolve8       v6,  v7,  v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v20, v21, v1,  \type

        subs            x6,  x6,  #4
        b.eq            8f

        ld1             {v20.8h}, [x2], x3
        ld1             {v21.8h}, [x2], x3
        ld1             {v22.8h}, [x2], x3
        ld1             {v23.8h}, [x2], x3
        convolve8       v2,  v3,  v4,  v5,  v25, v26, v27, v16, v17, v18, v19, v20, v21
        convolve8       v6,  v7,  v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
        do_store8       v2,  v3,  v4,  v5,  v6,  v7,  v24, v25, v1,  \type

        subs            x6,  x6,  #4
        b.ne            2b

8:
        subs            x5,  x5,  #8
        b.eq            9f
        // x0 -= h * dst_stride
        msub            x0,  x1,  x4, x0
        // x2 -= h * src_stride
        msub            x2,  x3,  x4, x2
        // x2 -= 8 * src_stride
        sub             x2,  x2,  x3, lsl #3
        // x2 += 1 * src_stride
        add             x2,  x2,  x3
        add             x2,  x2,  #16
        add             x0,  x0,  #16
        b               1b
9:
        ret
endfunc
.endm

do_8tap_8v put
do_8tap_8v avg


// Instantiate a vertical filter function for filtering a 4 pixel wide
// slice. This is only designed to work for 4 or 8 output lines.
.macro do_8tap_4v type
function \type\()_8tap_4v
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        ld1             {v0.8h},  [x6]
.ifc \type,avg
        mov             x7,  x0
.endif

        ld1             {v16.4h}, [x2], x3
        ld1             {v17.4h}, [x2], x3
        ld1             {v18.4h}, [x2], x3
        ld1             {v19.4h}, [x2], x3
        ld1             {v20.4h}, [x2], x3
        ld1             {v21.4h}, [x2], x3
        ld1             {v22.4h}, [x2], x3
        ld1             {v23.4h}, [x2], x3
        ld1             {v24.4h}, [x2], x3
        ld1             {v25.4h}, [x2], x3
        ld1             {v26.4h}, [x2], x3

        convolve4       v2,  v3,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
        convolve4       v4,  v5,  v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
        do_store4       v2,  v3,  v4,  v5,  v28, v29, v30, v31, v1,  \type

        subs            x4,  x4,  #4
        b.eq            9f

        ld1             {v27.4h}, [x2], x3
        ld1             {v28.4h}, [x2], x3
        ld1             {v29.4h}, [x2], x3
        ld1             {v30.4h}, [x2], x3

        convolve4       v2,  v3,  v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
        convolve4       v4,  v5,  v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
        do_store4       v2,  v3,  v4,  v5,  v16, v17, v18, v19, v1,  \type

9:
        ret
endfunc
.endm

do_8tap_4v put
do_8tap_4v avg


.macro do_8tap_v_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
        uxtw            x4,  w4
        mvni            v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
        movrel          x5,  X(ff_vp9_subpel_filters), 256*\offset
        add             x6,  x5,  w6, uxtw #4
        mov             x5,  #\size
.if \size >= 8
        b               \type\()_8tap_8v
.else
        b               \type\()_8tap_4v
.endif
endfunc
.endm

.macro do_8tap_v_filters size, bpp
do_8tap_v_func put, regular, 1, \size, \bpp
do_8tap_v_func avg, regular, 1, \size, \bpp
do_8tap_v_func put, sharp,   2, \size, \bpp
do_8tap_v_func avg, sharp,   2, \size, \bpp
do_8tap_v_func put, smooth,  0, \size, \bpp
do_8tap_v_func avg, smooth,  0, \size, \bpp
.endm

.macro do_8tap_v_filters_bpp bpp
do_8tap_v_filters 64, \bpp
do_8tap_v_filters 32, \bpp
do_8tap_v_filters 16, \bpp
do_8tap_v_filters 8,  \bpp
do_8tap_v_filters 4,  \bpp
.endm

do_8tap_v_filters_bpp 10
do_8tap_v_filters_bpp 12