/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// Common entry check shared by the clipped (non-intra) loop filters.
// In:    w2 = alpha, w3 = beta, x4 = pointer to four signed clip bytes
// Out:   w6 = the four clip bytes as one word, also placed in v24.s[0]
// Clobb: w8, flags
// Executes ret (leaving the function that expanded the macro) when
// alpha == 0, beta == 0, or all four clip bytes are negative. Otherwise
// execution continues after the macro, at local label 2.
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        // alpha != 0: flags = (beta cmp 0).  alpha == 0: nzcv = 0b0100, i.e.
        // Z is forced so the b.eq below also takes the early exit.
        ccmp            w3,  #0, #4, ne
        mov             v24.S[0], w6
        and             w8,  w6,  w6,  lsl #16   // non-flag-setting: ccmp result stays live
        b.eq            1f
        ands            w8,  w8,  w8,  lsl #8    // bit 31 = AND of the four sign bits
        b.ge            2f                       // N clear: at least one clip byte is >= 0
1:
        ret
2:
.endm

// Luma edge filter with per-lane clipping, 16 pixels along the edge.
// In:    v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//        w2 = alpha, w3 = beta, v24.s[0] = four signed clip bytes
// Out:   v17 = p1 result, v16 = p0 result, v0 = q0 result, v19 = q1 result
// Clobb: x7, v4, v20-v24, v28, v30
// Branches to local label 9, which the expanding function must provide,
// when no lane passes the alpha/beta/clip tests.
.macro  h264_loop_filter_luma
        dup             v22.16B, w2                     // alpha
        uxtl            v24.8H,  v24.8B
        uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
        uxtl            v24.4S,  v24.4H
        uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
        sli             v24.8H,  v24.8H,  #8
        uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
        sli             v24.4S,  v24.4S,  #16           // each clip byte now fills 4 adjacent lanes
        cmhi            v21.16B, v22.16B, v21.16B       // < alpha
        dup             v22.16B, w3                     // beta
        cmlt            v23.16B, v24.16B, #0            // lanes with a negative clip value
        cmhi            v28.16B, v22.16B, v28.16B       // < beta
        cmhi            v30.16B, v22.16B, v30.16B       // < beta
        bic             v21.16B, v21.16B, v23.16B       // drop negative-clip lanes from the mask
        uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
        and             v21.16B, v21.16B, v28.16B
        uabd            v19.16B,  v4.16B,  v0.16B       // abs(q2 - q0)
        and             v21.16B, v21.16B, v30.16B       // v21 = combined filter mask
        shrn            v30.8b,  v21.8h,  #4            // squeeze the 128-bit mask into 64 bits
        mov             x7, v30.d[0]
        cmhi            v17.16B, v22.16B, v17.16B       // < beta
        cmhi            v19.16B, v22.16B, v19.16B       // < beta
        cbz             x7,  9f                         // no lane to filter
        and             v17.16B, v17.16B, v21.16B
        and             v19.16B, v19.16B, v21.16B
        and             v24.16B, v24.16B, v21.16B
        urhadd          v28.16B, v16.16B,  v0.16B
        sub             v21.16B, v24.16B, v17.16B       // masks are 0 or -1: subtracting adds 1 per set mask
        uqadd           v23.16B, v18.16B, v24.16B
        uhadd           v20.16B, v20.16B, v28.16B
        sub             v21.16B, v21.16B, v19.16B       // v21 = clip bound for the p0/q0 delta
        uhadd           v28.16B,  v4.16B, v28.16B
        umin            v23.16B, v23.16B, v20.16B
        uqsub           v22.16B, v18.16B, v24.16B
        uqadd           v4.16B,   v2.16B, v24.16B
        umax            v23.16B, v23.16B, v22.16B       // p1 candidate, kept within p1 +/- clip
        uqsub           v22.16B,  v2.16B, v24.16B
        umin            v28.16B,  v4.16B, v28.16B
        uxtl            v4.8H,    v0.8B
        umax            v28.16B, v28.16B, v22.16B       // q1 candidate, kept within q1 +/- clip
        uxtl2           v20.8H,   v0.16B
        usubw           v4.8H,    v4.8H,  v16.8B
        usubw2          v20.8H,  v20.8H,  v16.16B
        shl             v4.8H,    v4.8H,  #2
        shl             v20.8H,  v20.8H,  #2
        uaddw           v4.8H,    v4.8H,  v18.8B
        uaddw2          v20.8H,  v20.8H,  v18.16B
        usubw           v4.8H,    v4.8H,   v2.8B
        usubw2          v20.8H,  v20.8H,   v2.16B
        rshrn           v4.8B,    v4.8H,  #3            // (4*(q0 - p0) + p1 - q1 + 4) >> 3
        rshrn2          v4.16B,  v20.8H,  #3
        bsl             v17.16B, v23.16B, v18.16B       // p1 result: candidate where masked, else p1
        bsl             v19.16B, v28.16B,  v2.16B       // q1 result: candidate where masked, else q1
        neg             v23.16B, v21.16B
        uxtl            v28.8H,  v16.8B
        smin            v4.16B,   v4.16B, v21.16B
        uxtl2           v21.8H,  v16.16B
        smax            v4.16B,   v4.16B, v23.16B       // delta clamped to [-bound, bound]
        uxtl            v22.8H,   v0.8B
        uxtl2           v24.8H,   v0.16B
        saddw           v28.8H,  v28.8H,  v4.8B         // p0 + delta
        saddw2          v21.8H,  v21.8H,  v4.16B
        ssubw           v22.8H,  v22.8H,  v4.8B         // q0 - delta
        ssubw2          v24.8H,  v24.8H,  v4.16B
        sqxtun          v16.8B,  v28.8H
        sqxtun2         v16.16B, v21.8H
        sqxtun          v0.8B,   v22.8H
        sqxtun2         v0.16B,  v24.8H
.endm

// Clipped luma filter across a horizontal edge, 16 pixels wide.
// In:    x0 = address of the q0 row (first row after the edge)
//        w1 = row stride in bytes (sign-extended here)
//        w2 = alpha, w3 = beta, x4 = pointer to four signed clip bytes
// Reads rows x0 - 3*stride .. x0 + 2*stride and rewrites rows
// x0 - 2*stride .. x0 + stride (p1, p0, q0, q1).
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        ld1             {v0.16B},  [x0], x1             // q0
        ld1             {v2.16B},  [x0], x1             // q1
        ld1             {v4.16B},  [x0], x1             // q2
        sub             x0,  x0,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #1            // back 6 rows in total: x0 = p2 row
        ld1             {v20.16B},  [x0], x1            // p2
        ld1             {v18.16B},  [x0], x1            // p1
        ld1             {v16.16B},  [x0], x1            // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1            // x0 = p1 row
        st1             {v17.16B},  [x0], x1
        st1             {v16.16B}, [x0], x1
        st1             {v0.16B},  [x0], x1
        st1             {v19.16B}, [x0]
9:
        ret
endfunc

// Clipped luma filter across a vertical edge, 16 rows tall.
// In:    x0 = address of the first pixel right of the edge in row 0
//        w1 = row stride in bytes (sign-extended here)
//        w2 = alpha, w3 = beta, x4 = pointer to four signed clip bytes
// Loads 8 bytes (columns -4 .. +3) from each of 16 rows, transposes so
// that each register holds one column, filters, then writes back the
// four centre columns (-2 .. +1) of every row.
// NOTE(review): the register order handed to transpose_8x16B and
// transpose_4x16B relies on how those macros are defined in neon.S,
// which is not visible here.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        sub             x0,  x0,  #4
        ld1             {v6.8B},  [x0], x1
        ld1             {v20.8B}, [x0], x1
        ld1             {v18.8B}, [x0], x1
        ld1             {v16.8B}, [x0], x1
        ld1             {v0.8B},  [x0], x1
        ld1             {v2.8B},  [x0], x1
        ld1             {v4.8B},  [x0], x1
        ld1             {v26.8B}, [x0], x1
        ld1             {v6.D}[1],  [x0], x1
        ld1             {v20.D}[1], [x0], x1
        ld1             {v18.D}[1], [x0], x1
        ld1             {v16.D}[1], [x0], x1
        ld1             {v0.D}[1],  [x0], x1
        ld1             {v2.D}[1],  [x0], x1
        ld1             {v4.D}[1],  [x0], x1
        ld1             {v26.D}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4            // rewind the 16 rows consumed by the loads
        add             x0,  x0,  #2                    // column -2 relative to the edge
        st1             {v17.S}[0],  [x0], x1
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v19.S}[0], [x0], x1
        st1             {v17.S}[1],  [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v19.S}[1], [x0], x1
        st1             {v17.S}[2],  [x0], x1
        st1             {v16.S}[2], [x0], x1
        st1             {v0.S}[2],  [x0], x1
        st1             {v19.S}[2], [x0], x1
        st1             {v17.S}[3],  [x0], x1
        st1             {v16.S}[3], [x0], x1
        st1             {v0.S}[3],  [x0], x1
        st1             {v19.S}[3], [x0], x1
9:
        ret
endfunc


// Common entry sequence for the intra loop filters.
// In:    w1 = row stride, w2 = alpha, w3 = beta
// Out:   x1 = sign-extended stride, v30 = alpha in every byte lane,
//        v31 = beta in every byte lane
// Clobb: w4
// Executes ret (leaving the expanding function) only when alpha and beta
// are both zero.
.macro h264_loop_filter_start_intra
    orr             w4,  w2,  w3
    cbnz            w4,  1f
    ret
1:
    sxtw            x1,  w1
    dup             v30.16b, w2                // alpha
    dup             v31.16b, w3                // beta
.endm

// Intra luma edge filter, 16 pixels along the edge.
// In:    v4 = p3, v5 = p2, v6 = p1, v7 = p0, v0 = q0, v1 = q1, v2 = q2, v3 = q3
//        v30 = alpha (all lanes), v31 = beta (all lanes)
// Out:   v5, v6, v7 (p2, p1, p0) and v0, v1, v2 (q0, q1, q2) updated in place
// Clobb: x4, v16-v31
// Branches to local label 9, which the expanding function must provide,
// when no lane passes the alpha/beta tests.
// Mask naming used in the comments below:
//   if_1 = abs(p0-q0) < alpha && abs(p1-p0) < beta && abs(q1-q0) < beta
//   if_2 = abs(p0-q0) < (alpha >> 2) + 2
//   if_3 = abs(p2-p0) < beta,   if_4 = abs(q2-q0) < beta
.macro h264_loop_filter_luma_intra
    uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
    uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
    uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
    cmhi            v19.16b, v30.16b, v16.16b       // < alpha
    cmhi            v17.16b, v31.16b, v17.16b       // < beta
    cmhi            v18.16b, v31.16b, v18.16b       // < beta

    movi            v29.16b, #2
    ushr            v30.16b, v30.16b, #2            // alpha >> 2
    add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
    cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2

    and             v19.16b, v19.16b, v17.16b
    and             v19.16b, v19.16b, v18.16b       // v19 = if_1
    shrn            v20.8b,  v19.8h,  #4            // squeeze the 128-bit mask into 64 bits
    mov             x4, v20.d[0]
    cbz             x4, 9f                          // no lane to filter

    // Weak filter sums: v20/v21 = 2*p1 + p0 + q1, v22/v23 = 2*q1 + q0 + p1
    ushll           v20.8h,  v6.8b,   #1
    ushll           v22.8h,  v1.8b,   #1
    ushll2          v21.8h,  v6.16b,  #1
    ushll2          v23.8h,  v1.16b,  #1
    uaddw           v20.8h,  v20.8h,  v7.8b
    uaddw           v22.8h,  v22.8h,  v0.8b
    uaddw2          v21.8h,  v21.8h,  v7.16b
    uaddw2          v23.8h,  v23.8h,  v0.16b
    uaddw           v20.8h,  v20.8h,  v1.8b
    uaddw           v22.8h,  v22.8h,  v6.8b
    uaddw2          v21.8h,  v21.8h,  v1.16b
    uaddw2          v23.8h,  v23.8h,  v6.16b

    rshrn           v24.8b,  v20.8h,  #2 // p0'_1
    rshrn           v25.8b,  v22.8h,  #2 // q0'_1
    rshrn2          v24.16b, v21.8h,  #2 // p0'_1
    rshrn2          v25.16b, v23.8h,  #2 // q0'_1

    uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
    uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
    cmhi            v17.16b, v31.16b, v17.16b       // < beta
    cmhi            v18.16b, v31.16b, v18.16b       // < beta

    and             v17.16b, v16.16b, v17.16b  // if_2 && if_3
    and             v18.16b, v16.16b, v18.16b  // if_2 && if_4

    not             v30.16b, v17.16b
    not             v31.16b, v18.16b

    and             v30.16b, v30.16b, v19.16b  // if_1 && !(if_2 && if_3)
    and             v31.16b, v31.16b, v19.16b  // if_1 && !(if_2 && if_4)

    and             v17.16b, v19.16b, v17.16b  // if_1 && if_2 && if_3
    and             v18.16b, v19.16b, v18.16b  // if_1 && if_2 && if_4

    //calc            p, v7, v6, v5, v4, v17, v7, v6, v5, v4
    uaddl           v26.8h,  v5.8b,   v7.8b
    uaddl2          v27.8h,  v5.16b,  v7.16b
    uaddw           v26.8h,  v26.8h,  v0.8b         // p2 + p0 + q0
    uaddw2          v27.8h,  v27.8h,  v0.16b
    add             v20.8h,  v20.8h,  v26.8h
    add             v21.8h,  v21.8h,  v27.8h
    uaddw           v20.8h,  v20.8h,  v0.8b         // p2 + 2*p1 + 2*p0 + 2*q0 + q1
    uaddw2          v21.8h,  v21.8h,  v0.16b
    rshrn           v20.8b,  v20.8h,  #3 // p0'_2
    rshrn2          v20.16b, v21.8h,  #3 // p0'_2
    uaddw           v26.8h,  v26.8h,  v6.8b         // p2 + p1 + p0 + q0
    uaddw2          v27.8h,  v27.8h,  v6.16b
    rshrn           v21.8b,  v26.8h,  #2 // p1'_2
    rshrn2          v21.16b, v27.8h,  #2 // p1'_2
    uaddl           v28.8h,  v4.8b,   v5.8b
    uaddl2          v29.8h,  v4.16b,  v5.16b
    shl             v28.8h,  v28.8h,  #1
    shl             v29.8h,  v29.8h,  #1
    add             v28.8h,  v28.8h,  v26.8h        // 2*p3 + 3*p2 + p1 + p0 + q0
    add             v29.8h,  v29.8h,  v27.8h
    rshrn           v19.8b,  v28.8h,  #3 // p2'_2
    rshrn2          v19.16b, v29.8h,  #3 // p2'_2

    //calc            q, v0, v1, v2, v3, v18, v0, v1, v2, v3
    uaddl           v26.8h,  v2.8b,   v0.8b
    uaddl2          v27.8h,  v2.16b,  v0.16b
    uaddw           v26.8h,  v26.8h,  v7.8b         // q2 + q0 + p0
    uaddw2          v27.8h,  v27.8h,  v7.16b
    add             v22.8h,  v22.8h,  v26.8h
    add             v23.8h,  v23.8h,  v27.8h
    uaddw           v22.8h,  v22.8h,  v7.8b         // q2 + 2*q1 + 2*q0 + 2*p0 + p1
    uaddw2          v23.8h,  v23.8h,  v7.16b
    rshrn           v22.8b,  v22.8h,  #3 // q0'_2
    rshrn2          v22.16b, v23.8h,  #3 // q0'_2
    uaddw           v26.8h,  v26.8h,  v1.8b         // q2 + q1 + q0 + p0
    uaddw2          v27.8h,  v27.8h,  v1.16b
    rshrn           v23.8b,  v26.8h,  #2 // q1'_2
    rshrn2          v23.16b, v27.8h,  #2 // q1'_2
    uaddl           v28.8h,  v2.8b,   v3.8b
    uaddl2          v29.8h,  v2.16b,  v3.16b
    shl             v28.8h,  v28.8h,  #1
    shl             v29.8h,  v29.8h,  #1
    add             v28.8h,  v28.8h,  v26.8h        // 2*q3 + 3*q2 + q1 + q0 + p0
    add             v29.8h,  v29.8h,  v27.8h
    rshrn           v26.8b,  v28.8h,  #3 // q2'_2
    rshrn2          v26.16b, v29.8h,  #3 // q2'_2

    // Insert results only in the lanes selected by each mask.
    bit             v7.16b,  v24.16b, v30.16b  // p0'_1
    bit             v0.16b,  v25.16b, v31.16b  // q0'_1
    bit             v7.16b, v20.16b,  v17.16b  // p0'_2
    bit             v6.16b, v21.16b,  v17.16b  // p1'_2
    bit             v5.16b, v19.16b,  v17.16b  // p2'_2
    bit             v0.16b, v22.16b,  v18.16b  // q0'_2
    bit             v1.16b, v23.16b,  v18.16b  // q1'_2
    bit             v2.16b, v26.16b,  v18.16b  // q2'_2
.endm

// Intra luma filter across a horizontal edge, 16 pixels wide.
// In:    x0 = address of the q0 row (first row after the edge)
//        w1 = row stride in bytes, w2 = alpha, w3 = beta
// Reads rows x0 - 4*stride .. x0 + 3*stride and rewrites rows
// x0 - 3*stride .. x0 + 2*stride (p2 .. q2).
function ff_h264_v_loop_filter_luma_intra_neon, export=1
    h264_loop_filter_start_intra

    ld1             {v0.16b},  [x0], x1 // q0
    ld1             {v1.16b},  [x0], x1 // q1
    ld1             {v2.16b},  [x0], x1 // q2
    ld1             {v3.16b},  [x0], x1 // q3
    sub             x0,  x0,  x1, lsl #3 // back 8 rows: x0 = p3 row
    ld1             {v4.16b},  [x0], x1 // p3
    ld1             {v5.16b},  [x0], x1 // p2
    ld1             {v6.16b},  [x0], x1 // p1
    ld1             {v7.16b},  [x0]     // p0

    h264_loop_filter_luma_intra

    sub             x0,  x0,  x1, lsl #1 // from the p0 row back to the p2 row
    st1             {v5.16b}, [x0], x1  // p2
    st1             {v6.16b}, [x0], x1  // p1
    st1             {v7.16b}, [x0], x1  // p0
    st1             {v0.16b}, [x0], x1  // q0
    st1             {v1.16b}, [x0], x1  // q1
    st1             {v2.16b}, [x0]      // q2
9:
    ret
endfunc

// Intra luma filter across a vertical edge, 16 rows tall.
// In:    x0 = address of the first pixel right of the edge in row 0
//        w1 = row stride in bytes, w2 = alpha, w3 = beta
// Loads 8 bytes (columns -4 .. +3) from each of 16 rows, transposes so
// that each register holds one column, filters, transposes back and
// stores all 8 bytes of every row again.
// NOTE(review): the register order handed to transpose_8x16B relies on
// how that macro is defined in neon.S, which is not visible here.
function ff_h264_h_loop_filter_luma_intra_neon, export=1
    h264_loop_filter_start_intra

    sub             x0,  x0,  #4
    ld1             {v4.8b},  [x0], x1
    ld1             {v5.8b},  [x0], x1
    ld1             {v6.8b},  [x0], x1
    ld1             {v7.8b},  [x0], x1
    ld1             {v0.8b},  [x0], x1
    ld1             {v1.8b},  [x0], x1
    ld1             {v2.8b},  [x0], x1
    ld1             {v3.8b},  [x0], x1
    ld1             {v4.d}[1],  [x0], x1
    ld1             {v5.d}[1],  [x0], x1
    ld1             {v6.d}[1],  [x0], x1
    ld1             {v7.d}[1],  [x0], x1
    ld1             {v0.d}[1],  [x0], x1
    ld1             {v1.d}[1],  [x0], x1
    ld1             {v2.d}[1],  [x0], x1
    ld1             {v3.d}[1],  [x0], x1

    transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

    h264_loop_filter_luma_intra

    transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

    sub             x0,  x0,  x1, lsl #4    // rewind the 16 rows consumed by the loads
    st1             {v4.8b},  [x0], x1
    st1             {v5.8b},  [x0], x1
    st1             {v6.8b},  [x0], x1
    st1             {v7.8b},  [x0], x1
    st1             {v0.8b},  [x0], x1
    st1             {v1.8b},  [x0], x1
    st1             {v2.8b},  [x0], x1
    st1             {v3.8b},  [x0], x1
    st1             {v4.d}[1],  [x0], x1
    st1             {v5.d}[1],  [x0], x1
    st1             {v6.d}[1],  [x0], x1
    st1             {v7.d}[1],  [x0], x1
    st1             {v0.d}[1],  [x0], x1
    st1             {v1.d}[1],  [x0], x1
    st1             {v2.d}[1],  [x0], x1
    st1             {v3.d}[1],  [x0], x1
9:
    ret
endfunc

// Chroma edge filter with per-lane clipping, 8 pixels along the edge.
// In:    v18 = p1, v16 = p0, v0 = q0, v2 = q1 (8 bytes each)
//        w2 = alpha, w3 = beta, v24.s[0] = four signed clip bytes
// Out:   v16 = p0 result, v0 = q0 result
// Clobb: x8, v4, v22-v26, v28, v30
// Branches to local label 9, which the expanding function must provide,
// when no lane passes the alpha/beta tests.
.macro  h264_loop_filter_chroma
        dup             v22.8B, w2              // alpha
        dup             v23.8B, w3              // beta
        uxtl            v24.8H, v24.8B
        uabd            v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
        uabd            v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
        uabd            v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
        cmhi            v26.8B, v22.8B, v26.8B  // < alpha
        cmhi            v28.8B, v23.8B, v28.8B  // < beta
        cmhi            v30.8B, v23.8B, v30.8B  // < beta
        uxtl            v4.8H,  v0.8B
        and             v26.8B, v26.8B, v28.8B
        usubw           v4.8H,  v4.8H,  v16.8B
        and             v26.8B, v26.8B, v30.8B  // v26 = combined filter mask
        shl             v4.8H,  v4.8H,  #2
        mov             x8,  v26.d[0]
        sli             v24.8H, v24.8H, #8      // each clip byte now fills 2 adjacent lanes
        uaddw           v4.8H,  v4.8H,  v18.8B
        cbz             x8,  9f                 // no lane to filter
        usubw           v4.8H,  v4.8H,  v2.8B
        rshrn           v4.8B,  v4.8H,  #3      // (4*(q0 - p0) + p1 - q1 + 4) >> 3
        smin            v4.8B,  v4.8B,  v24.8B
        neg             v25.8B, v24.8B
        smax            v4.8B,  v4.8B,  v25.8B  // delta clamped to [-clip, clip]
        uxtl            v22.8H, v0.8B
        and             v4.8B,  v4.8B,  v26.8B  // zero the delta in unfiltered lanes
        uxtl            v28.8H, v16.8B
        saddw           v28.8H, v28.8H, v4.8B   // p0 + delta
        ssubw           v22.8H, v22.8H, v4.8B   // q0 - delta
        sqxtun          v16.8B, v28.8H
        sqxtun          v0.8B,  v22.8H
.endm

// Clipped chroma filter across a horizontal edge, 8 pixels wide.
// In:    x0 = address of the q0 row (first row after the edge)
//        w1 = row stride in bytes (sign-extended here)
//        w2 = alpha, w3 = beta, x4 = pointer to four signed clip bytes
// Reads rows x0 - 2*stride .. x0 + stride and rewrites the p0 and q0 rows.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        sub             x0,  x0,  x1, lsl #1
        ld1             {v18.8B}, [x0], x1      // p1
        ld1             {v16.8B}, [x0], x1      // p0
        ld1             {v0.8B},  [x0], x1      // q0
        ld1             {v2.8B},  [x0]          // q1

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1    // from the q1 row back to the p0 row
        st1             {v16.8B}, [x0], x1
        st1             {v0.8B},  [x0], x1
9:
        ret
endfunc

// Clipped chroma filter across a vertical edge, 8 rows tall.
// In:    x0 = address of the first pixel right of the edge in row 0
//        w1 = row stride in bytes (sign-extended here)
//        w2 = alpha, w3 = beta, x4 = pointer to four signed clip bytes
// Loads 4 bytes (columns -2 .. +1) from each of 8 rows, transposes,
// filters, transposes back and stores the same 4 bytes of every row.
//
// h_loop_filter_chroma420 is a secondary entry point used by
// ff_h264_h_loop_filter_chroma422_neon. It expects x0 already moved
// 2 bytes left of the edge, x1 = 64-bit stride, v24.s[0] = clip bytes,
// and w2/w3 = alpha/beta. It returns through x30 and changes no general
// register other than x0 and x8.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        sub             x0,  x0,  #2
h_loop_filter_chroma420:
        ld1             {v18.S}[0], [x0], x1
        ld1             {v16.S}[0], [x0], x1
        ld1             {v0.S}[0],  [x0], x1
        ld1             {v2.S}[0],  [x0], x1
        ld1             {v18.S}[1], [x0], x1
        ld1             {v16.S}[1], [x0], x1
        ld1             {v0.S}[1],  [x0], x1
        ld1             {v2.S}[1],  [x0], x1

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3    // rewind the 8 rows consumed by the loads
        st1             {v18.S}[0], [x0], x1
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v2.S}[0],  [x0], x1
        st1             {v18.S}[1], [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v2.S}[1],  [x0], x1
9:
        ret
endfunc

// Clipped chroma filter across a vertical edge, 16 rows tall.
// In:    x0 = address of the first pixel right of the edge in row 0
//        w1 = row stride in bytes, w2 = alpha, w3 = beta
//        x4 = pointer to four signed clip bytes
// Runs the 8-row h_loop_filter_chroma420 body twice with the stride
// doubled: first over the even rows, then over the odd rows.
// x5, w6 and x7 must survive the first pass. The 8-row body only
// changes x0 and x8 among the general registers.
function ff_h264_h_loop_filter_chroma422_neon, export=1
        sxtw            x1,  w1
        h264_loop_filter_start
        add             x5,  x0,  x1            // x5 = start of row 1
        sub             x0,  x0,  #2
        add             x1,  x1,  x1            // step two rows at a time
        mov             x7,  x30                // bl below overwrites the link register
        bl              h_loop_filter_chroma420
        mov             x30, x7
        sub             x0,  x5,  #2
        mov             v24.s[0], w6            // first pass overwrote v24; reload the clip bytes
        b               h_loop_filter_chroma420 // tail call: its ret returns to our caller
endfunc

// Intra chroma edge filter, 8 pixels along the edge.
// In:    v18 = p1, v16 = p0, v17 = q0, v19 = q1 (8 bytes each)
//        v30 = alpha (all lanes), v31 = beta (all lanes)
// Out:   v16 = p0 result, v17 = q0 result
// Clobb: x2, v4, v6, v20, v22, v24-v28
// Branches to local label 9, which the expanding function must provide,
// when no lane passes the alpha/beta tests.
.macro h264_loop_filter_chroma_intra
    uabd            v26.8b, v16.8b, v17.8b  // abs(p0 - q0)
    uabd            v27.8b, v18.8b, v16.8b  // abs(p1 - p0)
    uabd            v28.8b, v19.8b, v17.8b  // abs(q1 - q0)
    cmhi            v26.8b, v30.8b, v26.8b  // < alpha
    cmhi            v27.8b, v31.8b, v27.8b  // < beta
    cmhi            v28.8b, v31.8b, v28.8b  // < beta
    and             v26.8b, v26.8b, v27.8b
    and             v26.8b, v26.8b, v28.8b  // v26 = combined filter mask
    mov             x2, v26.d[0]

    ushll           v4.8h,   v18.8b,  #1    // 2*p1
    ushll           v6.8h,   v19.8b,  #1    // 2*q1
    cbz             x2, 9f                  // no lane to filter
    uaddl           v20.8h,  v16.8b,  v19.8b
    uaddl           v22.8h,  v17.8b,  v18.8b
    add             v20.8h,  v20.8h,  v4.8h // 2*p1 + p0 + q1
    add             v22.8h,  v22.8h,  v6.8h // 2*q1 + q0 + p1
    uqrshrn         v24.8b,  v20.8h,  #2
    uqrshrn         v25.8b,  v22.8h,  #2
    bit             v16.8b, v24.8b, v26.8b  // replace p0 in masked lanes only
    bit             v17.8b, v25.8b, v26.8b  // replace q0 in masked lanes only
.endm

// Intra chroma filter across a horizontal edge, 8 pixels wide.
// In:    x0 = address of the q0 row (first row after the edge)
//        w1 = row stride in bytes, w2 = alpha, w3 = beta
// Reads rows x0 - 2*stride .. x0 + stride and rewrites the p0 and q0 rows.
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
    h264_loop_filter_start_intra

    sub             x0,  x0,  x1, lsl #1
    ld1             {v18.8b}, [x0], x1      // p1
    ld1             {v16.8b}, [x0], x1      // p0
    ld1             {v17.8b}, [x0], x1      // q0
    ld1             {v19.8b}, [x0]          // q1

    h264_loop_filter_chroma_intra

    sub             x0,  x0,  x1, lsl #1    // from the q1 row back to the p0 row
    st1             {v16.8b}, [x0], x1
    st1             {v17.8b}, [x0], x1

9:
    ret
endfunc

// Intra chroma filter across a vertical edge, 4 rows tall.
// In:    x0 = address of the first pixel right of the edge in row 0
//        w1 = row stride in bytes, w2 = alpha, w3 = beta
// x4 walks the rows for loading (starting 2 bytes left of the edge).
// x0 walks the rows for storing (starting 1 byte left of the edge).
// Each load fetches 8 bytes per row. After the filter only the p0/q0
// byte pair of each row is written back.
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
    h264_loop_filter_start_intra

    sub             x4,  x0,  #2
    sub             x0,  x0,  #1
    ld1             {v18.8b}, [x4], x1
    ld1             {v16.8b}, [x4], x1
    ld1             {v17.8b}, [x4], x1
    ld1             {v19.8b}, [x4], x1

    transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    st2             {v16.b,v17.b}[0], [x0], x1
    st2             {v16.b,v17.b}[1], [x0], x1
    st2             {v16.b,v17.b}[2], [x0], x1
    st2             {v16.b,v17.b}[3], [x0], x1

9:
    ret
endfunc

// Intra chroma filter across a vertical edge, 8 rows tall.
// In:    x0 = address of the first pixel right of the edge in row 0
//        w1 = row stride in bytes, w2 = alpha, w3 = beta
// x4 walks the rows for loading (starting 2 bytes left of the edge).
// x0 walks the rows for storing (starting 1 byte left of the edge).
// Rows 0-3 are fetched with 8-byte loads. Rows 4-7 then overwrite the
// upper 4 bytes of the same registers, so each register ends up holding
// 4 bytes from two different rows. Only the p0/q0 byte pair of each row
// is written back.
//
// h_loop_filter_chroma420_intra is a secondary entry point used by
// ff_h264_h_loop_filter_chroma422_intra_neon. It expects x4, x0, x1,
// v30 and v31 to be set up as above and returns through x30.
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
    h264_loop_filter_start_intra

    sub             x4,  x0,  #2
    sub             x0,  x0,  #1
h_loop_filter_chroma420_intra:
    ld1             {v18.8b}, [x4], x1
    ld1             {v16.8b}, [x4], x1
    ld1             {v17.8b}, [x4], x1
    ld1             {v19.8b}, [x4], x1
    ld1             {v18.s}[1], [x4], x1
    ld1             {v16.s}[1], [x4], x1
    ld1             {v17.s}[1], [x4], x1
    ld1             {v19.s}[1], [x4], x1

    transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    st2             {v16.b,v17.b}[0], [x0], x1
    st2             {v16.b,v17.b}[1], [x0], x1
    st2             {v16.b,v17.b}[2], [x0], x1
    st2             {v16.b,v17.b}[3], [x0], x1
    st2             {v16.b,v17.b}[4], [x0], x1
    st2             {v16.b,v17.b}[5], [x0], x1
    st2             {v16.b,v17.b}[6], [x0], x1
    st2             {v16.b,v17.b}[7], [x0], x1

9:
    ret
endfunc

// Intra chroma filter across a vertical edge, 16 rows tall.
// In:    x0 = address of the first pixel right of the edge in row 0
//        w1 = row stride in bytes, w2 = alpha, w3 = beta
// Runs the 8-row h_loop_filter_chroma420_intra body twice: rows 0-7,
// then rows 8-15. The load pointer x4 is not reset between the passes.
// The first pass post-increments it by 8 rows, which is exactly where
// the second pass has to start reading.
function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
    h264_loop_filter_start_intra
    sub             x4,  x0,  #2
    add             x5,  x0,  x1, lsl #3    // x5 = start of row 8
    sub             x0,  x0,  #1
    mov             x7,  x30                // bl below overwrites the link register
    bl              h_loop_filter_chroma420_intra
    sub             x0,  x5,  #1
    mov             x30, x7
    b               h_loop_filter_chroma420_intra   // tail call: its ret returns to our caller
endfunc

// Bi-directional weighted prediction loop body, 16 pixels per row.
// Macro args: macs = widening multiply-accumulate used for the x1 rows
//             macd = widening multiply-accumulate used for the x0 rows
// In:    x0 = read pointer of the first block, x1 = read pointer of the
//        second block, x7 = write pointer, x2 = row stride
//        w3 = row count, w5/w6 = weights (low byte used)
//        v16 = accumulator start value (8 halfwords)
//        v18 = per-lane shift count handed to sshl
// Two rows are produced per iteration. The loop ends only when w3 hits
// exactly zero, so w3 has to be a positive multiple of 2.
// Ends with ret, so nothing placed after the expansion is reached by
// fall-through.
.macro  biweight_16     macs, macd
        dup             v0.16B,  w5
        dup             v1.16B,  w6
        mov             v4.16B,  v16.16B
        mov             v6.16B,  v16.16B
1:      subs            w3,  w3,  #2            // flags stay valid until the b.ne below
        ld1             {v20.16B}, [x0], x2
        \macd           v4.8H,   v0.8B,  v20.8B
        \macd\()2       v6.8H,   v0.16B, v20.16B
        ld1             {v22.16B}, [x1], x2
        \macs           v4.8H,   v1.8B,  v22.8B
        \macs\()2       v6.8H,   v1.16B, v22.16B
        mov             v24.16B, v16.16B
        ld1             {v28.16B}, [x0], x2
        mov             v26.16B, v16.16B
        \macd           v24.8H,  v0.8B,  v28.8B
        \macd\()2       v26.8H,  v0.16B, v28.16B
        ld1             {v30.16B}, [x1], x2
        \macs           v24.8H,  v1.8B,  v30.8B
        \macs\()2       v26.8H,  v1.16B, v30.16B
        sshl            v4.8H,   v4.8H,  v18.8H
        sshl            v6.8H,   v6.8H,  v18.8H
        sqxtun          v4.8B,   v4.8H
        sqxtun2         v4.16B,  v6.8H
        sshl            v24.8H,  v24.8H, v18.8H
        sshl            v26.8H,  v26.8H, v18.8H
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v26.8H
        mov             v6.16B,  v16.16B        // re-seed accumulators for the next iteration
        st1             {v4.16B},  [x7], x2
        mov             v4.16B,  v16.16B
        st1             {v24.16B}, [x7], x2
        b.ne            1b
        ret
.endm

// Bi-directional weighted prediction loop body, 8 pixels per row.
// Macro args: macs = widening multiply-accumulate used for the x1 rows
//             macd = widening multiply-accumulate used for the x0 rows
// In:    x0 = read pointer of the first block, x1 = read pointer of the
//        second block, x7 = write pointer, x2 = row stride
//        w3 = row count, w5/w6 = weights (low byte used)
//        v16 = accumulator start value (8 halfwords)
//        v18 = per-lane shift count handed to sshl
// Two rows are produced per iteration. The loop ends only when w3 hits
// exactly zero, so w3 has to be a positive multiple of 2.
// Ends with ret.
.macro  biweight_8      macs, macd
        dup             v0.8B,  w5
        dup             v1.8B,  w6
        mov             v2.16B,  v16.16B
        mov             v20.16B, v16.16B
1:      subs            w3,  w3,  #2            // flags stay valid until the b.ne below
        ld1             {v4.8B}, [x0], x2
        \macd           v2.8H,  v0.8B,  v4.8B
        ld1             {v5.8B}, [x1], x2
        \macs           v2.8H,  v1.8B,  v5.8B
        ld1             {v6.8B}, [x0], x2
        \macd           v20.8H, v0.8B,  v6.8B
        ld1             {v7.8B}, [x1], x2
        \macs           v20.8H, v1.8B,  v7.8B
        sshl            v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        sshl            v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        mov             v20.16B, v16.16B        // re-seed accumulators for the next iteration
        st1             {v2.8B}, [x7], x2
        mov             v2.16B,  v16.16B
        st1             {v4.8B}, [x7], x2
        b.ne            1b
        ret
.endm

// Bi-directional weighted prediction loop body, 4 pixels per row.
// Macro args: macs = widening multiply-accumulate used for the x1 rows
//             macd = widening multiply-accumulate used for the x0 rows
// In:    x0 = read pointer of the first block, x1 = read pointer of the
//        second block, x7 = write pointer, x2 = row stride
//        w3 = row count, w5/w6 = weights (low byte used)
//        v16 = accumulator start value (8 halfwords)
//        v18 = per-lane shift count handed to sshl
// Two 4-byte rows share one 8-byte vector, and four rows are produced
// per iteration. If the counter goes negative after the first row pair
// (only 2 rows were left), the tail at label 2 finishes that pair and
// returns.
// Ends with ret on both paths.
.macro  biweight_4      macs, macd
        dup             v0.8B,  w5
        dup             v1.8B,  w6
        mov             v2.16B, v16.16B
        mov             v20.16B,v16.16B
1:      subs            w3,  w3,  #4            // flags stay valid for b.lt and b.ne below
        ld1             {v4.S}[0], [x0], x2
        ld1             {v4.S}[1], [x0], x2
        \macd           v2.8H,  v0.8B,  v4.8B
        ld1             {v5.S}[0], [x1], x2
        ld1             {v5.S}[1], [x1], x2
        \macs           v2.8H,  v1.8B,  v5.8B
        b.lt            2f                      // fewer than 4 rows were left
        ld1             {v6.S}[0], [x0], x2
        ld1             {v6.S}[1], [x0], x2
        \macd           v20.8H, v0.8B,  v6.8B
        ld1             {v7.S}[0], [x1], x2
        ld1             {v7.S}[1], [x1], x2
        \macs           v20.8H, v1.8B,  v7.8B
        sshl            v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        sshl            v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        mov             v20.16B, v16.16B        // re-seed accumulators for the next iteration
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        mov             v2.16B,  v16.16B
        st1             {v4.S}[0], [x7], x2
        st1             {v4.S}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        ret
.endm

// Emits ff_biweight_h264_pixels_<w>_neon for block width w (16, 8 or 4).
// In:    x0 = first block (read and written), x1 = second block (read only)
//        w2 = row stride (sign-extended here), w3 = row count
//        w4 = shift amount, w5 = weight applied to the x0 rows
//        w6 = weight applied to the x1 rows, w7 = offset
// Setup: v16 = ((w7 + 1) | 1) << w4 in every halfword (accumulator seed)
//        v18 = ~w4 = -(w4 + 1) in every halfword, so sshl shifts right
//              by w4 + 1
//        x7 = copy of x0 used as the write pointer
// The multiply-accumulate instructions are unsigned, so the signs of the
// two weights select one of four loop variants in which a negative
// weight is negated and accumulated with umlsl instead of umlal:
//   w8 = (w5 >> 31) ^ (w6 >> 30)
//   0: w5 >= 0, w6 >= 0    1: w5 < 0, w6 >= 0
//   2: w5 <  0, w6 <  0    3: w5 >= 0, w6 < 0
// NOTE(review): this mapping assumes bits 31 and 30 of w6 are equal,
// i.e. the weight is a small signed value. Confirm against the caller.
// Each biweight_<w> expansion ends in ret, so the four variants never
// fall through into one another.
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw            x2,  w2
        lsr             w8,  w5,  #31
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30
        orr             w7,  w7,  #1
        dup             v18.8H,   w4
        lsl             w7,  w7,  w4
        not             v18.16B,  v18.16B
        dup             v16.8H,   w7
        mov             x7,  x0
        cbz             w8,  10f
        subs            w8,  w8,  #1
        b.eq            20f
        subs            w8,  w8,  #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal
20:     neg             w5, w5
        biweight_\w     umlal, umlsl
30:     neg             w5, w5
        neg             w6, w6
        biweight_\w     umlsl, umlsl
40:     neg             w6, w6
        biweight_\w     umlsl, umlal
endfunc
.endm

        biweight_func   16
        biweight_func   8
        biweight_func   4

// Single-direction weighted prediction loop body, 16 pixels per row.
// Macro arg: add = vector instruction that combines v16 with the product
//            (invoked as: add dst, v16, product)
// In:    x0 = read pointer, x5 = write pointer, x1 = row stride
//        w2 = row count, w4 = weight (low byte used, treated as unsigned)
//        v16 = offset term (8 halfwords)
//        v18 = per-lane shift count handed to srshl
// Two rows are produced per iteration. The loop ends only when w2 hits
// exactly zero, so w2 has to be a positive multiple of 2.
// Ends with ret.
.macro  weight_16       add
        dup             v0.16B,  w4
1:      subs            w2,  w2,  #2            // flags stay valid until the b.ne below
        ld1             {v20.16B}, [x0], x1
        umull           v4.8H,   v0.8B,  v20.8B
        umull2          v6.8H,   v0.16B, v20.16B
        ld1             {v28.16B}, [x0], x1
        umull           v24.8H,  v0.8B,  v28.8B
        umull2          v26.8H,  v0.16B, v28.16B
        \add            v4.8H,   v16.8H, v4.8H
        srshl           v4.8H,   v4.8H,  v18.8H
        \add            v6.8H,   v16.8H, v6.8H
        srshl           v6.8H,   v6.8H,  v18.8H
        sqxtun          v4.8B,   v4.8H
        sqxtun2         v4.16B,  v6.8H
        \add            v24.8H,  v16.8H, v24.8H
        srshl           v24.8H,  v24.8H, v18.8H
        \add            v26.8H,  v16.8H, v26.8H
        srshl           v26.8H,  v26.8H, v18.8H
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v26.8H
        st1             {v4.16B},  [x5], x1
        st1             {v24.16B}, [x5], x1
        b.ne            1b
        ret
.endm

// Single-direction weighted prediction loop body, 8 pixels per row.
// Macro arg: add = vector instruction that combines v16 with the product
//            (invoked as: add dst, v16, product)
// In:    x0 = read pointer, x5 = write pointer, x1 = row stride
//        w2 = row count, w4 = weight (low byte used, treated as unsigned)
//        v16 = offset term (8 halfwords)
//        v18 = per-lane shift count handed to srshl
// Two rows are produced per iteration. The loop ends only when w2 hits
// exactly zero, so w2 has to be a positive multiple of 2.
// Ends with ret.
.macro  weight_8        add
        dup             v0.8B,  w4
1:      subs            w2,  w2,  #2            // flags stay valid until the b.ne below
        ld1             {v4.8B}, [x0], x1
        umull           v2.8H,  v0.8B,  v4.8B
        ld1             {v6.8B}, [x0], x1
        umull           v20.8H, v0.8B,  v6.8B
        \add            v2.8H,  v16.8H,  v2.8H
        srshl           v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        \add            v20.8H, v16.8H,  v20.8H
        srshl           v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        st1             {v2.8B}, [x5], x1
        st1             {v4.8B}, [x5], x1
        b.ne            1b
        ret
.endm

// Single-direction weighted prediction loop body, 4 pixels per row.
// Macro arg: add = vector instruction that combines v16 with the product
//            (invoked as: add dst, v16, product)
// In:    x0 = read pointer, x5 = write pointer, x1 = row stride
//        w2 = row count, w4 = weight (low byte used, treated as unsigned)
//        v16 = offset term (8 halfwords)
//        v18 = per-lane shift count handed to srshl
// Two 4-byte rows share one 8-byte vector, and four rows are produced
// per iteration. If the counter goes negative after the first row pair
// (only 2 rows were left), the tail at label 2 finishes that pair and
// returns.
// Ends with ret on both paths.
.macro  weight_4        add
        dup             v0.8B,  w4
1:      subs            w2,  w2,  #4            // flags stay valid for b.lt and b.ne below
        ld1             {v4.S}[0], [x0], x1
        ld1             {v4.S}[1], [x0], x1
        umull           v2.8H,  v0.8B,  v4.8B
        b.lt            2f                      // fewer than 4 rows were left
        ld1             {v6.S}[0], [x0], x1
        ld1             {v6.S}[1], [x0], x1
        umull           v20.8H, v0.8B,  v6.8B
        \add            v2.8H,  v16.8H,  v2.8H
        srshl           v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        \add            v20.8H, v16.8H,  v20.8H
        srshl           v20.8H, v20.8H, v18.8H
        sqxtun          v4.8B,  v20.8H
        st1             {v2.S}[0], [x5], x1
        st1             {v2.S}[1], [x5], x1
        st1             {v4.S}[0], [x5], x1
        st1             {v4.S}[1], [x5], x1
        b.ne            1b
        ret
2:      \add            v2.8H,  v16.8H,  v2.8H
        srshl           v2.8H,  v2.8H,  v18.8H
        sqxtun          v2.8B,  v2.8H
        st1             {v2.S}[0], [x5], x1
        st1             {v2.S}[1], [x5], x1
        ret
.endm

// Emits ff_weight_h264_pixels_<w>_neon for block width w (16, 8 or 4).
// In:    x0 = block (read and written in place), w1 = row stride
//        (sign-extended here), w2 = row count, w3 = shift amount
//        w4 = signed weight, w5 = offset
// Setup: v16 = w5 << w3 in every halfword, x5 = copy of x0 used as the
//        write pointer
// Variant selection:
//   w3 > 1:  v18 = 1 - w3. The halving shadd/shsub already removes one
//            bit, and srshl performs the remaining rounding right shift
//            by w3 - 1.
//   w3 <= 1: v18 = -w3, with plain add/sub.
//   A negative weight is negated and the sub/shsub form is used, which
//   computes v16 - product, because umull in the loop is unsigned.
// Each weight_<w> expansion ends in ret, so the four variants never fall
// through into one another. Label 10 is defined twice on purpose: every
// "b.lt 10f" binds to the nearest following definition.
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw            x1,  w1
        cmp             w3,  #1                 // result is consumed by b.le below
        mov             w6,  #1
        lsl             w5,  w5,  w3
        dup             v16.8H,  w5
        mov             x5,  x0
        b.le            20f
        sub             w6,  w6,  w3
        dup             v18.8H,  w6
        cmp             w4, #0
        b.lt            10f
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub
20:     neg             w6,  w3
        dup             v18.8H,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm

        weight_func     16
        weight_func     8
        weight_func     4
