/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
/*
 * 8-pixel-wide chroma MC with a 2D bilinear filter.
 * AAPCS64 arguments: x0 = dst, x1 = src, x2 = stride (bytes),
 * w3 = h (rows, processed two at a time), w4 = x, w5 = y (subpel offsets).
 *
 * \type is put or avg (avg rounds the result with the existing dst pixels);
 * \codec selects the rounding mode: h264 uses round-to-nearest (rshrn),
 * rv40/vc1 add a bias from v22 and truncate (shrn).
 */
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
  .ifc \type,avg
        mov             x8,  x0         // keep a dst cursor for the avg reads
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        // v22 = per-(x,y) rounding bias, replicated from the rv40bias table;
        // index = (y >> 1) * 8 + (x >> 1) * 2 bytes (.short entries).
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28   // fixed VC-1 rounding bias
  .endif
        // Bilinear weights (sum to 64):
        //   w4  = (8-x)*(8-y)   w12 = x*(8-y)
        //   w6  = y*(8-x)       w7  = x*y
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f              // x*y == 0: take a cheaper 1D/copy path

        dup             v0.8B,  w4
        dup             v1.8B,  w12
        ld1             {v4.8B, v5.8B}, [x1], x2
        dup             v2.8B,  w6
        dup             v3.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        // Full 2D filter, two output rows per iteration; the bottom row of
        // one pair (v6/v7) is reused as the top row of the next.
1:      ld1             {v6.8B, v7.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        ld1             {v4.8B, v5.8B}, [x1], x2
        umlal           v16.8H, v6.8B,  v2.8B
        prfm            pldl1strm, [x1]
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        umlal           v16.8H, v7.8B,  v3.8B
        umull           v17.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        umlal           v17.8H, v7.8B, v1.8B
        umlal           v17.8H, v4.8B, v2.8B
        umlal           v17.8H, v5.8B, v3.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            1b
        ret

        // x*y == 0: w12+w6 == 8x+8y, so both offsets zero means plain copy.
2:      adds            w12, w12, w6
        dup             v0.8B, w4
        b.eq            5f              // x == 0 && y == 0: copy path
        tst             w6,  w6
        dup             v1.8B, w12
        b.eq            4f              // y == 0: horizontal-only filter

        // x == 0: vertical-only filter, two rows per iteration.
        ld1             {v4.8B}, [x1], x2
3:      ld1             {v6.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v6.8B,  v1.8B
        ld1             {v4.8B}, [x1], x2
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v4.8B,  v1.8B
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        subs            w3,  w3,  #2
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            3b
        ret

        // y == 0: horizontal-only filter (ext produces the x+1 neighbours).
4:      ld1             {v4.8B, v5.8B}, [x1], x2
        ld1             {v6.8B, v7.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B, v0.8B
        umlal           v16.8H, v5.8B, v1.8B
        umull           v17.8H, v6.8B, v0.8B
        umlal           v17.8H, v7.8B, v1.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            4b
        ret

        // x == 0 && y == 0: copy, single weight of 64 (still rounded/biased).
5:      ld1             {v4.8B}, [x1], x2
        ld1             {v5.8B}, [x1], x2
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B, v0.8B
        umull           v17.8H, v5.8B, v0.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            5b
        ret
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
/*
 * 4-pixel-wide chroma MC with a 2D bilinear filter.
 * Same arguments and rounding selection as chroma_mc8, but two 4-pixel
 * rows are packed into one D register (via trn1/trn2) so a full 8-lane
 * multiply still computes two output rows per iteration.
 */
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
  .ifc \type,avg
        mov             x8,  x0         // keep a dst cursor for the avg reads
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        // v22 = per-(x,y) rounding bias, replicated from the rv40bias table.
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28   // fixed VC-1 rounding bias
  .endif
        // Bilinear weights (sum to 64):
        //   w4  = (8-x)*(8-y)   w12 = x*(8-y)
        //   w6  = y*(8-x)       w7  = x*y
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f              // x*y == 0: take a cheaper 1D/copy path

        // Pack weight pairs: v0 = {(8-x)(8-y), x(8-y)}, v2 = {y(8-x), xy};
        // source rows are packed as {row, row+1 pixel} pairs the same way.
        dup             v24.8B,  w4
        dup             v25.8B,  w12
        ld1             {v4.8B}, [x1], x2
        dup             v26.8B,  w6
        dup             v27.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B, #1
        trn1            v0.2S,  v24.2S, v25.2S
        trn1            v2.2S,  v26.2S, v27.2S
        trn1            v4.2S,  v4.2S,  v5.2S
1:      ld1             {v6.8B}, [x1], x2
        ext             v7.8B,  v6.8B,  v7.8B, #1
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umlal           v18.8H, v6.8B,  v2.8B
        ld1             {v4.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B, #1
        trn1            v4.2S,  v4.2S,  v5.2S
        prfm            pldl1strm, [x1]
        umull           v19.8H, v6.8B,  v0.8B
        umlal           v19.8H, v4.8B,  v2.8B
        // Fold the two partial sums: high half + low half per row pair.
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            1b
        ret

        // x*y == 0: w12+w6 == 8x+8y, so both offsets zero means plain copy.
2:      adds            w12, w12, w6
        dup             v30.8B, w4
        b.eq            5f              // x == 0 && y == 0: copy path
        tst             w6,  w6
        dup             v31.8B, w12
        trn1            v0.2S,  v30.2S, v31.2S
        trn2            v1.2S,  v30.2S, v31.2S
        b.eq            4f              // y == 0: horizontal-only filter

        // x == 0: vertical-only filter; v4 holds two consecutive rows.
        ext             v1.8B,  v0.8B,  v1.8B, #4
        ld1             {v4.S}[0], [x1], x2
3:      ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v0.8B
        ld1             {v4.S}[0], [x1], x2
        umull           v19.8H, v4.8B,  v1.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            3b
        ret

        // y == 0: horizontal-only filter (ext produces the x+1 neighbours).
4:      ld1             {v4.8B}, [x1], x2
        ld1             {v6.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B, #1
        ext             v7.8B,  v6.8B,  v7.8B, #1
        trn1            v4.2S,  v4.2S,  v5.2S
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umull           v19.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            4b
        ret

        // x == 0 && y == 0: copy, single weight of 64 (still rounded/biased).
5:      ld1             {v4.S}[0], [x1], x2
        ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v30.8B
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            5b
        ret
endfunc
.endm

/*
 * 2-pixel-wide chroma MC, H.264 only (no codec parameter).
 * Same arguments as chroma_mc8: x0 = dst, x1 = src, x2 = stride,
 * w3 = h, w4 = x, w5 = y.  Two output rows per iteration; weight and
 * pixel pairs are interleaved per 16-bit lane via trn1 so one 8-lane
 * multiply covers both rows and both filter taps.
 */
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
        orr             w7,  w4,  w5
        cbz             w7,  2f         // x == 0 && y == 0: plain copy

        // Bilinear weights (sum to 64):
        //   w4  = (8-x)*(8-y)   w12 = x*(8-y)
        //   w6  = y*(8-x)       w7  = x*y
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        dup             v0.8B,  w4
        dup             v2.8B,  w12
        dup             v1.8B,  w6
        dup             v3.8B,  w7
        trn1            v0.4H,  v0.4H,  v2.4H
        trn1            v1.4H,  v1.4H,  v3.4H
1:
        // v4 = rows 0/1, v5 = rows 1/2 (rev64 + reload of the third row),
        // so one multiply-accumulate filters both output rows vertically.
        ld1             {v4.S}[0],  [x1], x2
        ld1             {v4.S}[1],  [x1], x2
        rev64           v5.2S,  v4.2S
        ld1             {v5.S}[1],  [x1]
        ext             v6.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v5.8B,  v4.8B,  #1
        trn1            v4.4H,  v4.4H,  v6.4H
        trn1            v5.4H,  v5.4H,  v7.4H
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[2], [x0]
        sub             x0,  x0,  x2    // rewind dst after the avg reads
  .endif
        // Sum the interleaved partial products pairwise, then round.
        rev64           v17.4S, v16.4S
        add             v16.8H, v16.8H, v17.8H
        rshrn           v16.8B, v16.8H, #6
  .ifc \type,avg
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[2], [x0], x2
        subs            w3,  w3,  #2
        b.gt            1b
        ret

2:
        // No subpel offset: straight 2-byte copy (with averaging for avg).
        ld1             {v16.H}[0], [x1], x2
        ld1             {v16.H}[1], [x1], x2
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[1], [x0]
        sub             x0,  x0,  x2
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[1], [x0], x2
        subs            w3,  w3,  #2
        b.gt            2b
        ret
endfunc
.endm

424        h264_chroma_mc8 put
425        h264_chroma_mc8 avg
426        h264_chroma_mc4 put
427        h264_chroma_mc4 avg
428        h264_chroma_mc2 put
429        h264_chroma_mc2 avg
430
#if CONFIG_RV40_DECODER
// RV40 rounding bias, indexed by (y >> 1, x >> 1); loaded via ld1r
// into v22 by the rv40 variants above.
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1DSP
        // VC-1 flavour: fixed bias of 28, truncating shift.
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif