/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

        /* H.264 qpel MC */

// Load the two filter multipliers used by the lowpass macros into v6.
//   r: 32-bit general register used as scratch (callers pass w3 or w12);
//      it is overwritten.
// r becomes (20 << 16) | 5, so after the lane insert v6.H[0] = 5 and
// v6.H[1] = 20. The remaining lanes of v6 are left as they were.
.macro  lowpass_const   r
        movz            \r, #20, lsl #16
        movk            \r, #5
        mov             v6.S[0], \r
.endm

// Horizontal 6-tap lowpass of two rows, 8 output pixels per row.
//   r0, r1: first row, 16 consecutive source bytes (first 8 in r0, next 8 in r1)
//   r2, r3: second row, same layout
//   d0, d1: results for the first and the second row
//   narrow: if non-zero (default) the sums are rounded, shifted right by 5
//           and saturated to unsigned 8 bit; if zero d0/d1 keep the 16-bit sums
// Output pixel i is
//   p[i] + p[i+5] + 20 * (p[i+2] + p[i+3]) - 5 * (p[i+1] + p[i+4]).
// Expects v6.H[0] = 5 and v6.H[1] = 20 (lowpass_const).
// d0 is written before r2/r3 are read, so d0 must not name r2 or r3;
// callers in this file pass input registers for d0 and d1.
//trashes v0-v5
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
        ext             v2.8B,      \r0\().8B, \r1\().8B, #2
        ext             v3.8B,      \r0\().8B, \r1\().8B, #3
        uaddl           v2.8H,      v2.8B,     v3.8B
        ext             v4.8B,      \r0\().8B, \r1\().8B, #1
        ext             v5.8B,      \r0\().8B, \r1\().8B, #4
        uaddl           v4.8H,      v4.8B,     v5.8B
        ext             v1.8B,      \r0\().8B, \r1\().8B, #5
        uaddl           \d0\().8H,  \r0\().8B, v1.8B
        // The second row is interleaved with the multiply-accumulates
        // of the first row from here on.
        ext             v0.8B,      \r2\().8B, \r3\().8B, #2
        mla             \d0\().8H,  v2.8H,     v6.H[1]
        ext             v1.8B,      \r2\().8B, \r3\().8B, #3
        uaddl           v0.8H,      v0.8B,     v1.8B
        ext             v1.8B,      \r2\().8B, \r3\().8B, #1
        mls             \d0\().8H,  v4.8H,     v6.H[0]
        ext             v3.8B,      \r2\().8B, \r3\().8B, #4
        uaddl           v1.8H,      v1.8B,     v3.8B
        ext             v2.8B,      \r2\().8B, \r3\().8B, #5
        uaddl           \d1\().8H,  \r2\().8B, v2.8B
        mla             \d1\().8H,  v0.8H,     v6.H[1]
        mls             \d1\().8H,  v1.8H,     v6.H[0]
  .if \narrow
        sqrshrun        \d0\().8B,  \d0\().8H, #5
        sqrshrun        \d1\().8B,  \d1\().8H, #5
  .endif
.endm

// Vertical 6-tap lowpass producing two output rows of 8 pixels.
//   r0 .. r6: seven consecutive source rows (8 bytes each)
//   d0: r0 + r5 + 20 * (r2 + r3) - 5 * (r1 + r4)
//   d1: r1 + r6 + 20 * (r3 + r4) - 5 * (r2 + r5)
//   narrow: if non-zero (default) the sums are rounded, shifted right by 5
//           and saturated to unsigned 8 bit; if zero d0/d1 keep the 16-bit sums
// Expects v6.H[0] = 5 and v6.H[1] = 20 (lowpass_const).
// d0 is written before r1 and r6 are read for d1, so d0 must not name
// r1 or r6; callers in this file pass d0 = r0 and d1 = r1.
//trashes v0-v2, v4 (v3 is not written by this macro)
.macro  lowpass_8_v     r0,  r1,  r2,  r3,  r4,  r5,  r6,  d0,  d1,  narrow=1
        uaddl           v2.8H,      \r2\().8B, \r3\().8B
        uaddl           v0.8H,      \r3\().8B, \r4\().8B
        uaddl           v4.8H,      \r1\().8B, \r4\().8B
        uaddl           v1.8H,      \r2\().8B, \r5\().8B
        uaddl           \d0\().8H,  \r0\().8B, \r5\().8B
        uaddl           \d1\().8H,  \r1\().8B, \r6\().8B
        mla             \d0\().8H,  v2.8H,     v6.H[1]
        mls             \d0\().8H,  v4.8H,     v6.H[0]
        mla             \d1\().8H,  v0.8H,     v6.H[1]
        mls             \d1\().8H,  v1.8H,     v6.H[0]
  .if \narrow
        sqrshrun        \d0\().8B,  \d0\().8H, #5
        sqrshrun        \d1\().8B,  \d1\().8H, #5
  .endif
.endm

// Horizontal 6-tap lowpass of two rows, in place, without rounding or
// narrowing.
//   r0, r1: on entry each holds one row as 16 source bytes; on exit each
//           holds the 8 unrounded 16-bit filter sums of that row
// Each ext rotates the register by 1..5 bytes and only the low 8 bytes of
// the rotated value are used, so the taps read bytes 0..12 of the row.
// Expects v6.H[0] = 5 and v6.H[1] = 20 (lowpass_const).
//trashes v0-v5, v7, v30-v31
.macro  lowpass_8H      r0,  r1
        ext             v0.16B,     \r0\().16B, \r0\().16B, #2
        ext             v1.16B,     \r0\().16B, \r0\().16B, #3
        uaddl           v0.8H,      v0.8B,      v1.8B
        ext             v2.16B,     \r0\().16B, \r0\().16B, #1
        ext             v3.16B,     \r0\().16B, \r0\().16B, #4
        uaddl           v2.8H,      v2.8B,      v3.8B
        ext             v30.16B,    \r0\().16B, \r0\().16B, #5
        uaddl           \r0\().8H,  \r0\().8B,  v30.8B
        ext             v4.16B,     \r1\().16B, \r1\().16B, #2
        mla             \r0\().8H,  v0.8H,      v6.H[1]
        ext             v5.16B,     \r1\().16B, \r1\().16B, #3
        uaddl           v4.8H,      v4.8B,      v5.8B
        ext             v7.16B,     \r1\().16B, \r1\().16B, #1
        mls             \r0\().8H,  v2.8H,      v6.H[0]
        ext             v0.16B,     \r1\().16B, \r1\().16B, #4
        uaddl           v7.8H,      v7.8B,      v0.8B
        ext             v31.16B,    \r1\().16B, \r1\().16B, #5
        uaddl           \r1\().8H,  \r1\().8B,  v31.8B
        mla             \r1\().8H,  v4.8H,      v6.H[1]
        mls             \r1\().8H,  v7.8H,      v6.H[0]
.endm

// Horizontal 6-tap lowpass of a single row, 8 output pixels.
//   r0, r1: 16 consecutive source bytes (first 8 in r0, next 8 in r1)
//   d0:     result
//   narrow: if non-zero (default) the sums are rounded, shifted right by 5
//           and saturated to unsigned 8 bit; if zero d0 keeps the 16-bit sums
// Output pixel i is
//   p[i] + p[i+5] + 20 * (p[i+2] + p[i+3]) - 5 * (p[i+1] + p[i+4]).
// Expects v6.H[0] = 5 and v6.H[1] = 20 (lowpass_const).
// trashes v2-v5, v30
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
        ext             v2.8B,     \r0\().8B, \r1\().8B, #2
        ext             v3.8B,     \r0\().8B, \r1\().8B, #3
        uaddl           v2.8H,     v2.8B,     v3.8B
        ext             v4.8B,     \r0\().8B, \r1\().8B, #1
        ext             v5.8B,     \r0\().8B, \r1\().8B, #4
        uaddl           v4.8H,     v4.8B,     v5.8B
        ext             v30.8B,    \r0\().8B, \r1\().8B, #5
        uaddl           \d0\().8H, \r0\().8B, v30.8B
        mla             \d0\().8H, v2.8H,     v6.H[1]
        mls             \d0\().8H, v4.8H,     v6.H[0]
  .if \narrow
        sqrshrun        \d0\().8B, \d0\().8H, #5
  .endif
.endm

// Vertical 6-tap lowpass over six rows of signed 16-bit intermediates,
// computed in 32 bit.
//   r0 .. r5: six consecutive rows, 8 x 16 bit each
//   result:   8 unsigned bytes written to the low half of r0
// Computes r0 + r5 + 20 * (r2 + r3) - 5 * (r1 + r4) per column, applies a
// rounding shift right by 10, then saturates to unsigned 8 bit.
// The multiplications are done with shifts and adds, so this macro does
// not read the constants in v6; instead it uses v6 as scratch. Run
// lowpass_const again before the next macro that needs v6.H[0]/v6.H[1].
// trashes v0-v7
.macro  lowpass_8.16    r0,  r1,  r2,  r3,  r4,  r5
        // v5/v1 = r2 + r3, v6/v2 = r1 + r4, v0/v4 = r0 + r5 (low/high half)
        saddl           v5.4S,      \r2\().4H,  \r3\().4H
        saddl2          v1.4S,      \r2\().8H,  \r3\().8H
        saddl           v6.4S,      \r1\().4H,  \r4\().4H
        saddl2          v2.4S,      \r1\().8H,  \r4\().8H
        saddl           v0.4S,      \r0\().4H,  \r5\().4H
        saddl2          v4.4S,      \r0\().8H,  \r5\().8H

        // low half: v5 = 16 * v5 + 4 * v5 = 20 * v5, v6 = v6 + 4 * v6 = 5 * v6
        shl             v3.4S,  v5.4S,  #4
        shl             v5.4S,  v5.4S,  #2
        shl             v7.4S,  v6.4S,  #2
        add             v5.4S,  v5.4S,  v3.4S
        add             v6.4S,  v6.4S,  v7.4S

        // high half: v1 = 20 * v1, v2 = 5 * v2
        shl             v3.4S,  v1.4S,  #4
        shl             v1.4S,  v1.4S,  #2
        shl             v7.4S,  v2.4S,  #2
        add             v1.4S,  v1.4S,  v3.4S
        add             v2.4S,  v2.4S,  v7.4S

        add             v5.4S,  v5.4S,  v0.4S
        sub             v5.4S,  v5.4S,  v6.4S

        add             v1.4S,  v1.4S,  v4.4S
        sub             v1.4S,  v1.4S,  v2.4S

        rshrn           v5.4H,  v5.4S,  #10
        rshrn2          v5.8H,  v1.4S,  #10

        sqxtun          \r0\().8B,  v5.8H
.endm

// 16x16 horizontal lowpass into a packed buffer.
//   x0: destination buffer, written with a row pitch of 8 bytes
//   x1: first source byte to read, x2: source stride
// The left 8 columns (16 rows) are written first; x0 is not rewound, so
// the right 8 columns (16 rows) follow immediately after them.
// Overwrites x3 (set to the pitch 8), x4 (keeps the return address) and
// x12 (row counter), in addition to whatever
// put_h264_qpel8_h_lowpass_neon changes.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x30
        mov             x12, #16
        mov             x3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        // back to the first source row, 8 columns to the right
        sub             x1,  x1,  x2, lsl #4
        add             x1,  x1,  #8
        mov             x12, #16
        mov             x30, x4
        // tail call: the callee returns straight to our caller
        b               put_h264_qpel8_h_lowpass_neon
endfunc

// Horizontal lowpass, instantiated below for type = put and type = avg.
//   x0: dst, x3: dst stride
//   x1: first source byte to read (the mc20 entry points pass src - 2)
//   x2: src stride
//   x12: number of rows (set by the caller for the 8-wide function only)
// Expects the constants loaded by lowpass_const in v6.
// For type = avg the filtered rows are averaged (urhadd) with the bytes
// already present in dst before being stored.
.macro  h264_qpel_h_lowpass type
// 16x16: run the 8-wide loop over 16 rows for the left half, rewind both
// pointers by 16 rows and step 8 bytes to the right. x13 keeps the return
// address.
function \type\()_h264_qpel16_h_lowpass_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             x0,  x0,  x3, lsl #4
        sub             x1,  x1,  x2, lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        mov             x12, #16
        mov             x30, x13
        // No ret or branch here: the right half is processed by running
        // on into the 8-wide function emitted directly below.
        // NOTE(review): relies on endfunc/function (asm.S) not emitting
        // instructions in between - confirm.
endfunc

// 8 pixels wide, x12 rows, two rows per iteration. x12 has to be even and
// non-zero, otherwise the counter never reaches zero.
function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1             {v28.8B, v29.8B}, [x1], x2
        ld1             {v16.8B, v17.8B}, [x1], x2
        // sets the flags tested by b.ne at the end of the iteration;
        // nothing in between modifies them
        subs            x12, x12, #2
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
        ld1             {v2.8B},    [x0], x3
        ld1             {v3.8B},    [x0]
        urhadd          v28.8B, v28.8B,  v2.8B
        urhadd          v16.8B, v16.8B, v3.8B
        // undo the post-increment of the first dst load
        sub             x0,  x0,  x3
  .endif
        st1             {v28.8B},    [x0], x3
        st1             {v16.8B},    [x0], x3
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

// Horizontal lowpass averaged with a second source, instantiated below
// for type = put and type = avg.
//   x0: dst
//   x1: first source byte to read for the filter
//   x3: second source (8 bytes per row are read from it)
//   x2: stride, used for dst, the filter source and the second source
//   x12: number of rows (set by the caller for the 8-wide function only)
// Expects the constants loaded by lowpass_const in v6.
// For type = avg the result is additionally averaged with the bytes
// already present in dst.
.macro  h264_qpel_h_lowpass_l2 type
// 16x16: left half via the 8-wide loop over 16 rows, then rewind all three
// pointers by 16 rows and step 8 bytes to the right. x13 keeps the return
// address.
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             x0,  x0,  x2, lsl #4
        sub             x1,  x1,  x2, lsl #4
        sub             x3,  x3,  x2, lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        add             x3,  x3,  #8
        mov             x12, #16
        mov             x30, x13
        // No ret or branch here: the right half is processed by running
        // on into the 8-wide function emitted directly below.
        // NOTE(review): relies on endfunc/function (asm.S) not emitting
        // instructions in between - confirm.
endfunc

// 8 pixels wide, x12 rows, two rows per iteration. x12 has to be even and
// non-zero, otherwise the counter never reaches zero.
function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1             {v26.8B, v27.8B}, [x1], x2
        ld1             {v16.8B, v17.8B}, [x1], x2
        ld1             {v28.8B},     [x3], x2
        ld1             {v29.8B},     [x3], x2
        // sets the flags tested by b.ne at the end of the iteration
        subs            x12, x12, #2
        // filtered rows land in v26 (first row) and v27 (second row)
        lowpass_8       v26, v27, v16, v17, v26, v27
        urhadd          v26.8B, v26.8B, v28.8B
        urhadd          v27.8B, v27.8B, v29.8B
  .ifc \type,avg
        ld1             {v2.8B},      [x0], x2
        ld1             {v3.8B},      [x0]
        urhadd          v26.8B, v26.8B, v2.8B
        urhadd          v27.8B, v27.8B, v3.8B
        // undo the post-increment of the first dst load
        sub             x0,  x0,  x2
  .endif
        st1             {v26.8B},     [x0], x2
        st1             {v27.8B},     [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

// 16x16 vertical lowpass into a packed buffer.
//   x0: destination buffer, written with a row pitch of 8 bytes
//   x1: first source row to read, x3: source stride
// Built from four 8x8 calls in the order top-left, bottom-left, top-right,
// bottom-right. x0 is never rewound, so the buffer holds the left 8
// columns (16 rows) followed by the right 8 columns (16 rows).
// Overwrites x2 (set to the pitch 8) and x4 (keeps the return address).
function put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x30
        mov             x2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        // the 8x8 call leaves x1 12 rows down; go back 4 to sit 8 rows down
        sub             x1,  x1,  x3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        // x1 is now 20 rows down: rewind to the start, 8 columns right
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        // tail call: the callee returns straight to our caller
        b               put_h264_qpel8_v_lowpass_neon
endfunc

// Vertical lowpass, instantiated below for type = put and type = avg.
//   x0: dst, x2: dst stride
//   x1: first source row to read (the mc02 entry points pass
//       src - 2 * stride)
//   x3: src stride
// Expects the constants loaded by lowpass_const in v6.
// For type = avg the filtered rows are averaged (urhadd) with the bytes
// already present in dst before being stored.
.macro  h264_qpel_v_lowpass type
// 16x16 as four 8x8 tiles in the order top-left, bottom-left, top-right,
// bottom-right. x4 keeps the return address.
function \type\()_h264_qpel16_v_lowpass_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_neon
        // the 8x8 call leaves x1 12 rows down; go back 4 to sit 8 rows down
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        // x0 is 16 rows down, x1 is 20 rows down: rewind both to the first
        // row and step 8 columns to the right
        sub             x0,  x0,  x2, lsl #4
        add             x0,  x0,  #8
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        // No ret or branch here: the last tile is processed by running on
        // into the 8x8 function emitted directly below.
        // NOTE(review): relies on endfunc/function (asm.S) not emitting
        // instructions in between - confirm.
endfunc

// 8x8: reads 13 source rows of 8 bytes and writes 8 rows. On return x1
// points at the last row read (12 rows below its entry value) and x0 is
// 8 rows below its entry value.
function \type\()_h264_qpel8_v_lowpass_neon
        ld1             {v16.8B}, [x1], x3
        ld1             {v17.8B}, [x1], x3
        ld1             {v18.8B}, [x1], x3
        ld1             {v19.8B}, [x1], x3
        ld1             {v20.8B}, [x1], x3
        ld1             {v21.8B}, [x1], x3
        ld1             {v22.8B}, [x1], x3
        ld1             {v23.8B}, [x1], x3
        ld1             {v24.8B}, [x1], x3
        ld1             {v25.8B}, [x1], x3
        ld1             {v26.8B}, [x1], x3
        ld1             {v27.8B}, [x1], x3
        ld1             {v28.8B}, [x1]

        // each invocation turns seven input rows into two output rows,
        // stored over the first two of its inputs
        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23
  .ifc \type,avg
        ld1             {v24.8B},  [x0], x2
        ld1             {v25.8B}, [x0], x2
        ld1             {v26.8B}, [x0], x2
        urhadd          v16.8B, v16.8B, v24.8B
        ld1             {v27.8B}, [x0], x2
        urhadd          v17.8B, v17.8B, v25.8B
        ld1             {v28.8B}, [x0], x2
        urhadd          v18.8B, v18.8B, v26.8B
        ld1             {v29.8B}, [x0], x2
        urhadd          v19.8B, v19.8B, v27.8B
        ld1             {v30.8B}, [x0], x2
        urhadd          v20.8B, v20.8B, v28.8B
        ld1             {v31.8B}, [x0], x2
        urhadd          v21.8B, v21.8B, v29.8B
        urhadd          v22.8B, v22.8B, v30.8B
        urhadd          v23.8B, v23.8B, v31.8B
        // rewind dst by the 8 rows just read
        sub             x0,  x0,  x2,  lsl #3
  .endif

        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        st1             {v18.8B}, [x0], x2
        st1             {v19.8B}, [x0], x2
        st1             {v20.8B}, [x0], x2
        st1             {v21.8B}, [x0], x2
        st1             {v22.8B}, [x0], x2
        st1             {v23.8B}, [x0], x2

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

// Vertical lowpass averaged with a second source, instantiated below for
// type = put and type = avg.
//   x0: dst
//   x1: first source row to read for the filter
//   x3: stride, used for dst and for the filter source
//   x12: second source, x2: stride of the second source
// Expects the constants loaded by lowpass_const in v6.
// For type = avg the result is additionally averaged with the bytes
// already present in dst.
.macro  h264_qpel_v_lowpass_l2 type
// 16x16 as four 8x8 tiles in the order top-left, bottom-left, top-right,
// bottom-right. x4 keeps the return address.
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        // the 8x8 call leaves x1 12 rows down; go back 4 to sit 8 rows down
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        // dst and second source are 16 rows down, x1 is 20 rows down:
        // rewind all three and step 8 columns to the right
        sub             x0,  x0,  x3, lsl #4
        sub             x12, x12, x2, lsl #4
        add             x0,  x0,  #8
        add             x12, x12, #8
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        // No ret or branch here: the last tile is processed by running on
        // into the 8x8 function emitted directly below.
        // NOTE(review): relies on endfunc/function (asm.S) not emitting
        // instructions in between - confirm.
endfunc

// 8x8: reads 13 filter source rows and 8 rows of the second source,
// writes 8 rows. On return x1 points at the last filter row read (12 rows
// below its entry value); x0 and x12 are 8 rows below their entry values.
function \type\()_h264_qpel8_v_lowpass_l2_neon
        ld1             {v16.8B}, [x1], x3
        ld1             {v17.8B}, [x1], x3
        ld1             {v18.8B}, [x1], x3
        ld1             {v19.8B}, [x1], x3
        ld1             {v20.8B}, [x1], x3
        ld1             {v21.8B}, [x1], x3
        ld1             {v22.8B}, [x1], x3
        ld1             {v23.8B}, [x1], x3
        ld1             {v24.8B}, [x1], x3
        ld1             {v25.8B}, [x1], x3
        ld1             {v26.8B}, [x1], x3
        ld1             {v27.8B}, [x1], x3
        ld1             {v28.8B}, [x1]

        // each invocation turns seven input rows into two output rows,
        // stored over the first two of its inputs
        lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
        lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
        lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
        lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23

        // average with the second source
        ld1             {v24.8B},  [x12], x2
        ld1             {v25.8B},  [x12], x2
        ld1             {v26.8B},  [x12], x2
        ld1             {v27.8B},  [x12], x2
        ld1             {v28.8B},  [x12], x2
        urhadd          v16.8B, v24.8B, v16.8B
        urhadd          v17.8B, v25.8B, v17.8B
        ld1             {v29.8B},  [x12], x2
        urhadd          v18.8B, v26.8B, v18.8B
        urhadd          v19.8B, v27.8B, v19.8B
        ld1             {v30.8B}, [x12], x2
        urhadd          v20.8B, v28.8B, v20.8B
        urhadd          v21.8B, v29.8B, v21.8B
        ld1             {v31.8B}, [x12], x2
        urhadd          v22.8B, v30.8B, v22.8B
        urhadd          v23.8B, v31.8B, v23.8B

  .ifc \type,avg
        ld1             {v24.8B}, [x0], x3
        ld1             {v25.8B}, [x0], x3
        ld1             {v26.8B}, [x0], x3
        urhadd          v16.8B, v16.8B, v24.8B
        ld1             {v27.8B}, [x0], x3
        urhadd          v17.8B, v17.8B, v25.8B
        ld1             {v28.8B}, [x0], x3
        urhadd          v18.8B, v18.8B, v26.8B
        ld1             {v29.8B}, [x0], x3
        urhadd          v19.8B, v19.8B, v27.8B
        ld1             {v30.8B}, [x0], x3
        urhadd          v20.8B, v20.8B, v28.8B
        ld1             {v31.8B}, [x0], x3
        urhadd          v21.8B, v21.8B, v29.8B
        urhadd          v22.8B, v22.8B, v30.8B
        urhadd          v23.8B, v23.8B, v31.8B
        // rewind dst by the 8 rows just read
        sub             x0,  x0,  x3,  lsl #3
  .endif

        st1             {v16.8B}, [x0], x3
        st1             {v17.8B}, [x0], x3
        st1             {v18.8B}, [x0], x3
        st1             {v19.8B}, [x0], x3
        st1             {v20.8B}, [x0], x3
        st1             {v21.8B}, [x0], x3
        st1             {v22.8B}, [x0], x3
        st1             {v23.8B}, [x0], x3

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

// Shared core of the 8x8 two-dimensional (horizontal then vertical)
// lowpass. Nothing is stored to memory here.
//   x1: first source byte to read, x3: source stride
//   result: 8 rows of 8 unsigned bytes in the low halves of v16 .. v23
// Loads 13 rows of 16 bytes, filters each row horizontally to unrounded
// 16-bit sums (lowpass_8H), then filters vertically in 32 bit with
// rounding and saturation (lowpass_8.16).
// On return x1 points at the last row read (12 rows below its entry
// value). x12 is overwritten by lowpass_const, and v6 no longer holds the
// filter constants because lowpass_8.16 uses it as scratch.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
        ld1             {v16.8H}, [x1], x3
        ld1             {v17.8H}, [x1], x3
        ld1             {v18.8H}, [x1], x3
        ld1             {v19.8H}, [x1], x3
        ld1             {v20.8H}, [x1], x3
        ld1             {v21.8H}, [x1], x3
        ld1             {v22.8H}, [x1], x3
        ld1             {v23.8H}, [x1], x3
        ld1             {v24.8H}, [x1], x3
        ld1             {v25.8H}, [x1], x3
        ld1             {v26.8H}, [x1], x3
        ld1             {v27.8H}, [x1], x3
        ld1             {v28.8H}, [x1]
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
        lowpass_8H      v22, v23
        lowpass_8H      v24, v25
        lowpass_8H      v26, v27
        // v29 is never loaded in this function; it is only named because
        // the macro works on register pairs, and its result is not used
        lowpass_8H      v28, v29

        // each invocation overwrites its first row with one output row;
        // that row is not read again by the invocations that follow
        lowpass_8.16    v16, v17, v18, v19, v20, v21
        lowpass_8.16    v17, v18, v19, v20, v21, v22

        lowpass_8.16    v18, v19, v20, v21, v22, v23
        lowpass_8.16    v19, v20, v21, v22, v23, v24

        lowpass_8.16    v20, v21, v22, v23, v24, v25
        lowpass_8.16    v21, v22, v23, v24, v25, v26

        lowpass_8.16    v22, v23, v24, v25, v26, v27
        lowpass_8.16    v23, v24, v25, v26, v27, v28

        ret
endfunc

// 8x8 two-dimensional lowpass with store, instantiated below for
// type = put and type = avg.
//   x0: dst, x2: dst stride
//   x1: first source byte to read, x3: source stride
// x10 keeps the return address across the call to the shared core.
// On return x0 is 8 rows below its entry value.
// For type = avg the filtered rows are averaged (urhadd) with the bytes
// already present in dst before being stored.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30
        // leaves the 8 filtered rows in v16 .. v23
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        ld1             {v0.8B},      [x0], x2
        ld1             {v1.8B},      [x0], x2
        ld1             {v2.8B},      [x0], x2
        urhadd          v16.8B, v16.8B, v0.8B
        ld1             {v3.8B},      [x0], x2
        urhadd          v17.8B, v17.8B, v1.8B
        ld1             {v4.8B},      [x0], x2
        urhadd          v18.8B, v18.8B, v2.8B
        ld1             {v5.8B},      [x0], x2
        urhadd          v19.8B, v19.8B, v3.8B
        ld1             {v6.8B},      [x0], x2
        urhadd          v20.8B, v20.8B, v4.8B
        ld1             {v7.8B},      [x0], x2
        urhadd          v21.8B, v21.8B, v5.8B
        urhadd          v22.8B, v22.8B, v6.8B
        urhadd          v23.8B, v23.8B, v7.8B
        // rewind dst by the 8 rows just read
        sub             x0,  x0,  x2,  lsl #3
  .endif

        st1             {v16.8B},     [x0], x2
        st1             {v17.8B},     [x0], x2
        st1             {v18.8B},     [x0], x2
        st1             {v19.8B},     [x0], x2
        st1             {v20.8B},     [x0], x2
        st1             {v21.8B},     [x0], x2
        st1             {v22.8B},     [x0], x2
        st1             {v23.8B},     [x0], x2

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

// 8x8 two-dimensional lowpass averaged with a packed second source,
// instantiated below for type = put and type = avg.
//   x0: dst, x3: stride used for dst and for the filter source
//   x1: first source byte to read for the filter
//   x2: second source, 8 bytes per row stored back to back (each 16-byte
//       load fetches two rows); advanced by 64 bytes
// x10 keeps the return address across the call to the shared core.
// On return x0 is 8 rows below its entry value.
// For type = avg the result is additionally averaged with the bytes
// already present in dst.
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30
        // leaves the 8 filtered rows in v16 .. v23
        bl              put_h264_qpel8_hv_lowpass_neon_top

        ld1             {v0.8B, v1.8B},  [x2], #16
        ld1             {v2.8B, v3.8B},  [x2], #16
        urhadd          v0.8B,  v0.8B,  v16.8B
        urhadd          v1.8B,  v1.8B,  v17.8B
        ld1             {v4.8B, v5.8B},  [x2], #16
        urhadd          v2.8B,  v2.8B,  v18.8B
        urhadd          v3.8B,  v3.8B,  v19.8B
        ld1             {v6.8B, v7.8B},  [x2], #16
        urhadd          v4.8B,  v4.8B,  v20.8B
        urhadd          v5.8B,  v5.8B,  v21.8B
        urhadd          v6.8B,  v6.8B,  v22.8B
        urhadd          v7.8B,  v7.8B,  v23.8B
  .ifc \type,avg
        ld1             {v16.8B},     [x0], x3
        ld1             {v17.8B},     [x0], x3
        ld1             {v18.8B},     [x0], x3
        urhadd          v0.8B,  v0.8B,  v16.8B
        ld1             {v19.8B},     [x0], x3
        urhadd          v1.8B,  v1.8B,  v17.8B
        ld1             {v20.8B},     [x0], x3
        urhadd          v2.8B,  v2.8B,  v18.8B
        ld1             {v21.8B},     [x0], x3
        urhadd          v3.8B,  v3.8B,  v19.8B
        ld1             {v22.8B},     [x0], x3
        urhadd          v4.8B,  v4.8B,  v20.8B
        ld1             {v23.8B},     [x0], x3
        urhadd          v5.8B,  v5.8B,  v21.8B
        urhadd          v6.8B,  v6.8B,  v22.8B
        urhadd          v7.8B,  v7.8B,  v23.8B
        // rewind dst by the 8 rows just read
        sub             x0,  x0,  x3,  lsl #3
  .endif
        st1             {v0.8B},      [x0], x3
        st1             {v1.8B},      [x0], x3
        st1             {v2.8B},      [x0], x3
        st1             {v3.8B},      [x0], x3
        st1             {v4.8B},      [x0], x3
        st1             {v5.8B},      [x0], x3
        st1             {v6.8B},      [x0], x3
        st1             {v7.8B},      [x0], x3

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

// 16x16 two-dimensional lowpass, with and without a second source,
// instantiated below for type = put and type = avg.
// Both functions run the matching 8x8 function four times in the order
// top-left, bottom-left, top-right, bottom-right; x13 keeps the return
// address and the last tile is reached with a tail call.
.macro  h264_qpel16_hv  type
//   x0: dst, x2: dst stride
//   x1: first source byte to read, x3: source stride
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             x13, x30
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        // the 8x8 call leaves x1 12 rows down; go back 4 to sit 8 rows down
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        // x1 is 20 rows down, x0 is 16 rows down: rewind both to the first
        // row and step 8 columns to the right
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        sub             x0,  x0,  x2, lsl #4
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

//   x0: dst, x3: stride used for dst and for the filter source
//   x1: first source byte to read for the filter
//   x4: address 256 bytes past the start of the packed second source
// The four 8x8 calls consume the packed buffer 64 bytes at a time, which
// matches the tile order above when the buffer holds the left 8 columns
// (16 rows) followed by the right 8 columns (16 rows).
function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             x13, x30
        sub             x2,  x4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        sub             x0,  x0,  x3, lsl #4
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

// Exported 8x8 entry points, instantiated below for type = put and
// type = avg.
// Register use on entry, as read by the code: x0 = dst, x1 = src,
// x2 = stride (the same value is used for dst and src).
// What each ff_<type>_h264_qpel8_mcXY_neon computes:
//   mc20, mc02, mc22        horizontal, vertical, two-dimensional lowpass
//   mc10, mc30              horizontal lowpass averaged with src, src + 1
//   mc01, mc03              vertical lowpass averaged with src, src + stride
//   mc11, mc31, mc13, mc33  horizontal lowpass averaged with vertical lowpass
//   mc21, mc23              horizontal lowpass averaged with 2-D lowpass
//   mc12, mc32              vertical lowpass averaged with 2-D lowpass
// Conventions shared by the entry points:
//   x14 keeps the return address, because the helpers keep their own
//       return addresses in x4, x10 or x13
//   x8, x9 keep dst and the source position for the second pass
//   x11 keeps the caller sp while a temporary buffer lives on the stack
//   lowpass_const w3 overwrites x3 before x3 is given its real value
.macro  h264_qpel8      type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   w3
        // second source = unfiltered src
        mov             x3,  x1
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        // dst stride = src stride
        mov             x3,  x2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   w3
        // second source = src one pixel to the right
        add             x3,  x1,  #1
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov             x14, x30
        // second source = unfiltered src
        mov             x12, x1
// shared tail, also entered from mc03 with x12 = src + stride
\type\()_h264_qpel8_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        ret             x14
endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
// shared tail, also entered from mc31, mc13 and mc33.
// Expects x1 = position for the horizontal pass, x9 = position for the
// vertical pass, x8 = dst, x14 = return address.
\type\()_h264_qpel8_mc11:
        lowpass_const   w3
        mov             x11, sp
        // 8x8 temporary with a row pitch of 8 bytes
        sub             sp,  sp,  #64
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        // the temporary becomes the second source of the vertical pass
        mov             x12, sp
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
// shared tail, also entered from mc23.
// Expects x1 = position for the horizontal pass, x9 = position for the
// two-dimensional pass, x8 = dst, x14 = return address.
\type\()_h264_qpel8_mc21:
        lowpass_const   w3
        mov             x11, sp
        // NOTE(review): 256 bytes are reserved, but the code visible here
        // only touches the first 64 - confirm whether the rest is needed.
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x0,  sp
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        // x0 now points 64 bytes past the start of the temporary
        mov             x4,  x0
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        // packed second source = start of the temporary
        sub             x2,  x4,  #64
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        // vertical pass one pixel to the right (x9 = src + 1), horizontal
        // pass at src
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        ret             x14
endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
// shared tail, also entered from mc32.
// Expects x1 = position for the vertical pass, x9 = position for the
// two-dimensional pass, x8 = dst, x14 = return address.
\type\()_h264_qpel8_mc12:
        lowpass_const   w3
        mov             x11, sp
        // NOTE(review): 256 bytes are reserved, but the code visible here
        // only touches the first 64 - confirm whether the rest is needed.
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  x2, lsl #1
        // from here on x3 = picture stride, x2 = pitch of the temporary
        mov             x3,  x2
        mov             x2,  #8
        mov             x0,  sp
        bl              put_h264_qpel8_v_lowpass_neon
        // x0 now points 64 bytes past the start of the temporary
        mov             x4,  x0
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        // packed second source = start of the temporary
        sub             x2,  x4,  #64
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov             x14, x30
        // sp is saved and restored although this path does not change it
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        mov             sp,  x11
        ret             x14
endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        // vertical pass one pixel to the right, two-dimensional pass at src
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov             x14, x30
        // second source = src one row down
        add             x12, x1,  x2
        b               \type\()_h264_qpel8_mc01
endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        // horizontal pass one row down, vertical pass at src
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        // horizontal pass one row down, two-dimensional pass at src
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc21
endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        // vertical pass one pixel to the right (x9 = src + 1), horizontal
        // pass one row down (x1 = src + stride)
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

765.macro  h264_qpel16     type
// 16x16 mc10: horizontal lowpass averaged with the unfiltered source.
// On entry x0 = dst, x1 = src, x2 = stride. The 16-wide helper sets the
// row counter x12 itself.
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   w3
        // second source = unfiltered src
        mov             x3,  x1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// 16x16 mc20: horizontal lowpass only.
// On entry x0 = dst, x1 = src, x2 = stride. The 16-wide helper sets the
// row counter x12 itself.
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        // dst stride = src stride
        mov             x3,  x2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

// 16x16 mc30: horizontal lowpass averaged with the source one pixel to
// the right.
// On entry x0 = dst, x1 = src, x2 = stride. The 16-wide helper sets the
// row counter x12 itself.
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   w3
        // second source = src + 1
        add             x3,  x1,  #1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// 16x16 mc01: vertical lowpass averaged with the unfiltered source.
// On entry x0 = dst, x1 = src, x2 = stride. x14 keeps the return address
// because the helper keeps its own in x4.
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov             x14, x30
        // second source = unfiltered src
        mov             x12, x1
// shared tail; expects x12 = second source and x14 = return address
\type\()_h264_qpel16_mc01:
        lowpass_const   w3
        mov             x3,  x2
        // the filter starts two rows above the block
        sub             x1,  x1,  x2, lsl #1
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        ret             x14
endfunc

// 16x16 mc11: horizontal lowpass (into a stack temporary) averaged with
// the vertical lowpass.
// On entry x0 = dst, x1 = src, x2 = stride. x14 keeps the return address,
// x8/x9 keep dst/src, x11 keeps the caller sp.
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
// shared tail, also entered from mc31.
// Expects x1 = position for the horizontal pass, x9 = position for the
// vertical pass, x8 = dst, x14 = return address.
\type\()_h264_qpel16_mc11:
        lowpass_const   w3
        mov             x11, sp
        // 16x16 temporary with a row pitch of 16 bytes
        sub             sp,  sp,  #256
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #16
        bl              put_h264_qpel16_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        // the temporary becomes the second source of the vertical pass
        mov             x12, sp
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

820function ff_\type\()_h264_qpel16_mc21_neon, export=1
821        mov             x14, x30
822        mov             x8,  x0
823        mov             x9,  x1
824\type\()_h264_qpel16_mc21:
825        lowpass_const   w3
826        mov             x11, sp
827        sub             sp,  sp,  #(16*16+16*12)
828        sub             x1,  x1,  #2
829        mov             x0,  sp
830        bl              put_h264_qpel16_h_lowpass_neon_packed
831        mov             x4,  x0
832        mov             x0,  x8
833        sub             x1,  x9,  x2, lsl #1
834        sub             x1,  x1,  #2
835        mov             x3,  x2
836        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
837        mov             sp,  x11
838        ret             x14
839endfunc
840
841function ff_\type\()_h264_qpel16_mc31_neon, export=1
842        add             x1,  x1,  #1
843        mov             x14, x30
844        mov             x8,  x0
845        mov             x9,  x1
846        sub             x1,  x1,  #1
847        b               \type\()_h264_qpel16_mc11
848endfunc
849
850function ff_\type\()_h264_qpel16_mc02_neon, export=1
851        mov             x14, x30
852        lowpass_const   w3
853        sub             x1,  x1,  x2, lsl #1
854        mov             x3,  x2
855        bl              \type\()_h264_qpel16_v_lowpass_neon
856        ret             x14
857endfunc
858
859function ff_\type\()_h264_qpel16_mc12_neon, export=1
860        mov             x14, x30
861        mov             x8,  x0
862        mov             x9,  x1
863\type\()_h264_qpel16_mc12:
864        lowpass_const   w3
865        mov             x11, sp
866        sub             sp,  sp,  #(16*16+16*12)
867        sub             x1,  x1,  x2, lsl #1
868        mov             x0,  sp
869        mov             x3,  x2
870        bl              put_h264_qpel16_v_lowpass_neon_packed
871        mov             x4,  x0
872        mov             x0,  x8
873        sub             x1,  x9,  x3, lsl #1
874        sub             x1,  x1,  #2
875        mov             x2,  x3
876        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
877        mov             sp,  x11
878        ret             x14
879endfunc
880
881function ff_\type\()_h264_qpel16_mc22_neon, export=1
882        mov             x14, x30
883        lowpass_const   w3
884        mov             x11, sp
885        sub             x1,  x1,  x2, lsl #1
886        sub             x1,  x1,  #2
887        mov             x3,  x2
888        bl              \type\()_h264_qpel16_hv_lowpass_neon
889        mov             sp,  x11 // restore stack
890        ret             x14
891endfunc
892
893function ff_\type\()_h264_qpel16_mc32_neon, export=1
894        mov             x14, x30
895        mov             x8,  x0
896        mov             x9,  x1
897        add             x1,  x1,  #1
898        b               \type\()_h264_qpel16_mc12
899endfunc
900
901function ff_\type\()_h264_qpel16_mc03_neon, export=1
902        mov             x14, x30
903        add             x12, x1,  x2
904        b               \type\()_h264_qpel16_mc01
905endfunc
906
907function ff_\type\()_h264_qpel16_mc13_neon, export=1
908        mov             x14, x30
909        mov             x8,  x0
910        mov             x9,  x1
911        add             x1,  x1,  x2
912        b               \type\()_h264_qpel16_mc11
913endfunc
914
915function ff_\type\()_h264_qpel16_mc23_neon, export=1
916        mov             x14, x30
917        mov             x8,  x0
918        mov             x9,  x1
919        add             x1,  x1,  x2
920        b               \type\()_h264_qpel16_mc21
921endfunc
922
923function ff_\type\()_h264_qpel16_mc33_neon, export=1
924        add             x1,  x1,  #1
925        mov             x14, x30
926        mov             x8,  x0
927        mov             x9,  x1
928        add             x1,  x1,  x2
929        sub             x1,  x1,  #1
930        b               \type\()_h264_qpel16_mc11
931endfunc
932.endm
933
// Expand the h264_qpel16 macro twice, once with type = put and once with
// type = avg, so that both ff_put_h264_qpel16_mcXY_neon and
// ff_avg_h264_qpel16_mcXY_neon entry points are emitted.
// NOTE(review): presumably put stores the result and avg averages it with the
// existing destination; that difference lives in the helpers, confirm there.
934        h264_qpel16 put
935        h264_qpel16 avg
936