// (non-source artifact: code-viewer navigation chrome — Home | Line# | Scopes# | Navigate | Raw | Download)
/*
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20
#include "libavutil/aarch64/asm.S"

#define FRAC_BITS   23   // fractional bits for sb_samples and dct
#define WFRAC_BITS  16   // fractional bits for window
// Shift that brings the (FRAC_BITS + WFRAC_BITS)-bit fixed-point product
// down to the 16-bit output sample range.
#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)
26
// Byte-shuffle mask for the 'tbl' instruction: reverses the order of the
// four 32-bit lanes of a 128-bit vector (lane 3 -> 0, 2 -> 1, 1 -> 2, 0 -> 3)
// while keeping the bytes inside each lane in order.
const   tbl_rev128_s, align=4
        .byte           12, 13, 14, 15
        .byte            8,  9, 10, 11
        .byte            4,  5,  6,  7
        .byte            0,  1,  2,  3
endconst
33
// apply_window type, st
//
// Emits ff_mpadsp_apply_window_\type\()_neon, the MPEG audio synthesis
// windowing routine.  \type selects the fixed-point (int32 -> int16) or
// float variant; \st is the element specifier used for the output stores
// (h for halfwords, s for single-precision words).
//
// Register roles (inferred from usage — confirm against the C prototype):
//   x0 = synth_buf, x1 = window, x2 = dither_state (fixed variant only),
//   x3 = samples (forward, stride +incr), x4 = incr (element stride,
//   scaled to bytes below), x5 = samples2 (backward, stride -incr).
// Relies on the helper macros round_sample / MLA / MLA2 / MLS / MLS2 being
// defined at instantiation time; they are purged again at the end so the
// next instantiation can redefine them.
.macro   apply_window   type, st
function ff_mpadsp_apply_window_\type\()_neon, export=1
        // Mirror the first 32 entries of synth_buf to synth_buf + 512 so the
        // accumulation loops below can read past the end without wrapping.
        mov             x7,  x0
        add             x8,  x0,  #512<<2
        ld1             {v0.4s,v1.4s,v2.4s,v3.4s},  [x7],  #64
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x7],  #64
        st1             {v0.4s,v1.4s,v2.4s,v3.4s},  [x8],  #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x8],  #64
        movrel          x15, tbl_rev128_s
        ld1             {v27.4s}, [x15]         // lane-reversal shuffle mask
.ifc \type, fixed
        lsl             x4,  x4,  #1            // incr in bytes (int16 output)
.else
        lsl             x4,  x4,  #2            // incr in bytes (float output)
.endif
        add             x10, x0,  #45<<2        // synth_buf + 45
        add             x0,  x0,  #16<<2        // synth_buf + 16
        add             x1,  x1,  #16<<2        // w = window + 16
        add             x5,  x3,  x4,  lsl #5
        sub             x5,  x5,  x4            // samples2 = samples + 31 * incr
        neg             x13, x4                 // -incr (samples2 walks backwards)
        mov             x9,  #64<<2             // row stride: 64 * sizeof(int32_t)
.ifc \type, fixed
        // Seed one accumulator lane with the saved dither_state; v30/v31 hold
        // the per-lane masks of the low bits that OUT_SHIFT will drop, v29 is
        // a zero constant used by round_sample.
        ld1r            {v16.2s}, [x2]          // dither_state
        sxtl            v16.2d, v16.2s
        movi            v29.2d, #0
        movi            v30.2d, #(1<<OUT_SHIFT)-1
        trn1            v31.2d, v29.2d, v30.2d
        trn2            v30.2d, v30.2d, v29.2d
        trn1            v16.2d, v16.2d, v29.2d
.else
        movi            v16.4s, #0
        movi            v28.4s, #0
.endif
        mov             x14, #4                 // 4 outer iterations (x14 counts down)
1:      // outer loop: compute one batch of forward and backward samples
        mov             x8,  x0
        sub             x7,  x1,  #3<<2
        sub             x6,  x1,  x14, lsl #4
        add             x7,  x7,  x14, lsl #4
        add             x11, x6, #(32)<<2      // w  + 32
        add             x12, x7, #(32)<<2      // w2 + 32
        mov             x15, #8                // 8 MAC steps per batch
        movi            v17.2d, #0
        movi            v18.2d, #0
        movi            v19.2d, #0
2:      // inner loop: window * synth_buf multiply-accumulate
        subs            x15, x15, #1
        ld1             {v0.4s},  [x8],  x9
        ld1             {v1.4s},  [x10], x9
        ld1             {v2.4s},  [x6],  x9
        ld1             {v3.4s},  [x7],  x9
        tbl             v6.16b, {v0.16b}, v27.16b   // lane-reversed v0
        tbl             v7.16b, {v1.16b}, v27.16b   // lane-reversed v1
        ld1             {v4.4s},  [x11], x9
        ld1             {v5.4s},  [x12], x9
        MLA             v16, v2, v0
        MLA2            v17, v2, v0
        MLS             v18, v3, v6
        MLS2            v19, v3, v6
        MLS             v16, v4, v7
        MLS2            v17, v4, v7
        MLS             v18, v5, v1
        MLS2            v19, v5, v1
        b.gt            2b

        cmp             x14, #4                 // flags: first outer iteration?
        sub             x10, x10, #64<<5        // 64 * 8 * sizeof(int32_t)

.ifc \type, fixed
        // Reduce the 64-bit accumulators to 16-bit samples, propagating the
        // dropped low bits (dither) from sample to sample via v28.
        and             v28.16b, v16.16b, v30.16b
        ext             v28.16b, v29.16b, v28.16b, #8

        b.eq            4f                      // skip extra rounding on first pass
        round_sample    v19, 1, 1
4:
        round_sample    v16, 1, 0
        shrn            v16.2s, v16.2d,  #OUT_SHIFT
        round_sample    v19, 0, 0
        shrn            v19.2s, v19.2d,  #OUT_SHIFT
        round_sample    v17, 0, 1
        round_sample    v18, 1, 1
        round_sample    v17, 1, 0
        shrn2           v16.4s, v17.2d,  #OUT_SHIFT
        round_sample    v18, 0, 0
        shrn2           v19.4s, v18.2d,  #OUT_SHIFT
        sqxtn           v16.4h, v16.4s          // saturate to int16
        sqxtn           v18.4h, v19.4s
.else
        ext             v18.16b, v18.16b, v18.16b, #8
.endif

        // Interleaved stores: forward through samples (x3, +incr) and
        // backward through samples2 (x5, -incr).
        st1             {v16.\st\()}[0], [x3], x4
        b.eq            4f                      // one fewer samples2 store on first pass
        st1             {v18.\st\()}[1], [x5], x13
4:
        st1             {v16.\st\()}[1], [x3], x4
        st1             {v18.\st\()}[0], [x5], x13
        st1             {v16.\st\()}[2], [x3], x4
        st1             {v18.\st\()}[3], [x5], x13
        st1             {v16.\st\()}[3], [x3], x4
        st1             {v18.\st\()}[2], [x5], x13

        mov             v16.16b, v28.16b        // carry dither into next iteration

        subs            x14, x14, #1
        add             x0,  x0,  #4<<2
        sub             x10, x10, #4<<2
        b.gt            1b

// computing samples[16]
        add             x6,  x1,  #32<<2
        ld1             {v0.2s},  [x6],  x9
        ld1             {v1.2s},  [x0],  x9
.rept   3
        ld1             {v2.2s},  [x6],  x9
        ld1             {v3.2s},  [x0],  x9
        MLS             v16, v0,  v1
        ld1             {v0.2s},  [x6],  x9
        ld1             {v1.2s},  [x0],  x9
        MLS             v16, v2,  v3
.endr
        ld1             {v2.2s},  [x6],  x9
        ld1             {v3.2s},  [x0],  x9
        MLS             v16, v0,  v1
        MLS             v16, v2,  v3

.ifc \type, fixed
        and             v28.16b, v16.16b, v30.16b
        shrn            v20.2s,  v16.2d,  #OUT_SHIFT
        xtn             v28.2s,  v28.2d
        sqxtn           v20.4h,  v20.4s
        st1             {v28.s}[0], [x2]        // save dither_state
        st1             {v20.h}[0], [x3]
.else
        st1             {v16.s}[0], [x3]
.endif

        ret
endfunc
// Drop the per-variant helpers so the next instantiation can redefine them.
.purgem round_sample
.purgem MLA
.purgem MLA2
.purgem MLS
.purgem MLS2
.endm
180
181
// round_sample r, idx, next  (fixed-point variant)
//
// Adds the pending dither bits held in v28 into the 64-bit accumulator
// pair \r, then captures the newly dropped low bits (masked by v30 or v31,
// selected by \idx) as the next dither value.  When \next differs from
// \idx the captured bits are moved to the other 64-bit half of v28 with
// ext; v29 is always zero and supplies the vacated half.
.macro  round_sample    r, idx, next
        add             \r\().2d, \r\().2d, v28.2d
.if \idx == 0
        and             v28.16b,  \r\().16b,  v30.16b
.else // \idx == 1
        and             v28.16b,  \r\().16b,  v31.16b
.endif
.if \idx != \next
  .if \next == 0
        ext             v28.16b, v28.16b, v29.16b, #8
  .else
        ext             v28.16b, v29.16b, v28.16b, #8
  .endif
.endif
.endm
// Fixed-point MAC helpers: widening signed 32x32 -> 64-bit
// multiply-accumulate/subtract on the low (MLA/MLS) or high (MLA2/MLS2)
// half of the 128-bit source vectors.
.macro  MLA             d, s1, s2
        smlal           \d\().2d, \s1\().2s, \s2\().2s
.endm
.macro  MLA2            d, s1, s2
        smlal2          \d\().2d, \s1\().4s, \s2\().4s
.endm
.macro  MLS             d, s1, s2
        smlsl           \d\().2d, \s1\().2s, \s2\().2s
.endm
.macro  MLS2            d, s1, s2
        smlsl2          \d\().2d, \s1\().4s, \s2\().4s
.endm
// Instantiate the fixed-point variant; output elements are halfwords.
apply_window fixed, h
210
211
// Float variant: round_sample and the "2" (high-half) MAC helpers are
// intentionally empty — fmla/fmls already operate on all four lanes and
// no fixed-point dither rounding is needed.
.macro  round_sample    r, idx, next
.endm
.macro  MLA2            d, s1, s2
.endm
.macro  MLS2            d, s1, s2
.endm
.macro  MLA             d, s1, s2
        fmla            \d\().4s, \s1\().4s, \s2\().4s
.endm
.macro  MLS             d, s1, s2
        fmls            \d\().4s, \s1\().4s, \s2\().4s
.endm
// Instantiate the float variant; output elements are single-precision words.
apply_window float, s
226