/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config_components.h"

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
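/* Bilinear interpolation from the fractional chroma position (x, y),
 * with weights A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy.  A rough scalar
 * sketch of what the h264 variant computes (the NEON code below does
 * two rows per iteration, and rv40/vc1 substitute a codec-specific
 * rounding bias for the +32):
 *
 *     for (i = 0; i < h; i++) {
 *         for (j = 0; j < 8; j++)
 *             dst[j] = (A * src[j]          + B * src[j + 1] +
 *                       C * src[j + stride] + D * src[j + stride + 1] +
 *                       32) >> 6;
 *         dst += stride;
 *         src += stride;
 *     }
 */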
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

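        @ rv40 rounds with a bias that depends on the chroma position;
        @ fetch it from rv40bias (defined below) and broadcast it into q11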
  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

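        @ compute the bilinear weights from x (r4) and y (r5):
        @ r7 = D = x*y, r6 = C = (8-x)*y, r12 = B = x*(8-y), r4 = A = (8-x)*(8-y)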
A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

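        @ x != 0 && y != 0: full four-tap filter, two output rows per iteration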
        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1

1:      vld1.8          {d6, d7}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {d4, d5}, [r1], r2
        vmlal.u8        q8,  d6,  d2
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        pld             [r1, r2]
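        @ narrow to 8 bits: h264 uses a rounding shift ((sum + 32) >> 6);
        @ rv40/vc1 add the bias from q11 and shift without rounding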
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
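        @ avg: average the result with the pixels already at dst (lr)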
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

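        @ x*y == 0: at most one direction needs filtering; branch to the
        @ vertical-only (3:), horizontal-only (4:) or copy (5:) loops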
2:      adds            r12, r12, r6
        vdup.8          d0,  r4
        beq             5f
        tst             r6,  r6
        vdup.8          d1,  r12

        beq             4f

        vld1.8          {d4}, [r1], r2

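        @ 3: vertical-only filtering (x == 0)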
3:      vld1.8          {d6}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
        pld             [r1, r2]
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        subs            r3,  r3,  #2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

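        @ 4: horizontal-only filtering (y == 0)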
4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             4b

        pop             {r4-r7, pc}

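        @ 5: no filtering (x == 0 && y == 0); the single weight of 64
        @ makes this a copy after the shift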
5:      vld1.8          {d4}, [r1], r2
        vld1.8          {d5}, [r1], r2
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d5,  d0
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
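/* The 4-pixel-wide variant packs the two horizontal taps into the low and
 * high halves of each d-register (via vtrn.32), so one vmull.u8 covers both
 * taps and a vadd.i16 folds the halves together afterwards. */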
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4},     [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7

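        @ pack src[i]/src[i+1] into the two halves of d4, and the A/B
        @ (resp. C/D) weights into the two halves of d0 (resp. d2)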
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      vld1.8          {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        pld             [r1]
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

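        @ x*y == 0: dispatch to the 1-D or copy loops, as in mc8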
2:      adds            r12, r12, r6
        vdup.8          d0,  r4
        beq             5f
        tst             r6,  r6
        vdup.8          d1,  r12
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        vld1.32         {d4[0]},  [r1], r2

3:      vld1.32         {d4[1]},  [r1], r2
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r2
        vmull.u8        q9,  d4,  d1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4},     [r1], r2
        vld1.8          {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             4b

        pop             {r4-r7, pc}

5:      vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vmull.u8        q8,  d4,  d0
        subs            r3,  r3,  #2
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

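/* chroma_mc2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */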
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
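        @ after vtrn.16, d0 holds the A/B and d1 the C/D weights,
        @ interleaved to match the row pairs assembled in q2 below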
1:
        vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]},  [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
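        @ 2: x == 0 && y == 0: plain copy for put, average with dst for avg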
2:
  .ifc \type,put
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
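/* position-dependent rounding bias, 16-bit entries indexed as
 * rv40bias[(y >> 1) * 4 + (x >> 1)] by the lookup code above */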
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1DSP
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif