/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// Alternating sign pattern {+1, -1, +1, -1}.
// ff_sbr_hf_gen_neon and ff_sbr_autocorrelate_neon load it as one 4-lane
// float vector and multiply by it to negate every odd lane, which supplies
// the subtraction inside a complex multiply.
const factors, align=4
        .float 1.0, -1.0, 1.0, -1.0
endconst

// Per-lane weight {+1, 0, +1, 0} applied to the duplicated s value in
// ff_sbr_hf_apply_noise_0_neon: +s is added to the first float of each
// pair, nothing to the second.
const phi_noise_0, align=4
        .float 1.0, 0.0, 1.0, 0.0
endconst

// Two 16-byte rows of per-lane weights for ff_sbr_hf_apply_noise_1_neon,
// which selects the row with (x4 & 1):
//   row 0: { 0, +1,  0, -1 }
//   row 1: { 0, -1,  0, +1 }
// Only the second float of each pair is affected, with the sign alternating
// from one pair to the next.
const phi_noise_1, align=4
        .float 0.0,  1.0,  0.0, -1.0
        .float 0.0, -1.0,  0.0,  1.0
endconst

// Per-lane weight {-1, 0, -1, 0} applied to the duplicated s value in
// ff_sbr_hf_apply_noise_2_neon: s is subtracted from the first float of
// each pair, the second float is left alone.
const phi_noise_2, align=4
        .float -1.0, 0.0, -1.0, 0.0
endconst

// Two 16-byte rows of per-lane weights for ff_sbr_hf_apply_noise_3_neon,
// which selects the row with (x4 & 1):
//   row 0: { 0, -1,  0, +1 }
//   row 1: { 0, +1,  0, -1 }
// These are the rows of phi_noise_1 in the opposite order.
const phi_noise_3, align=4
        .float 0.0, -1.0,  0.0,  1.0
        .float 0.0,  1.0,  0.0, -1.0
endconst

// In-place five-way fold of a float array z (x0):
//     z[k] = z[k] + z[k+64] + z[k+128] + z[k+192] + z[k+256],   k = 0..63
// The four additions are performed left to right, four lanes per iteration.
//
// In:       x0 = z; floats 0..319 are read, floats 0..63 are overwritten
// Clobbers: x0-x5, v0-v4, condition flags
function ff_sbr_sum64x5_neon, export=1
        add             x1, x0, #64*4           // x1 = &z[64]
        add             x2, x0, #128*4          // x2 = &z[128]
        add             x3, x0, #192*4          // x3 = &z[192]
        add             x4, x0, #256*4          // x4 = &z[256]
        mov             x5, #64                 // x5 = output floats still to produce
1:      ld1             {v0.4S}, [x0]           // accumulator; x0 advances on the store below
        ld1             {v1.4S}, [x1], #16
        fadd            v0.4S, v0.4S, v1.4S
        ld1             {v2.4S}, [x2], #16
        fadd            v0.4S, v0.4S, v2.4S
        ld1             {v3.4S}, [x3], #16
        fadd            v0.4S, v0.4S, v3.4S
        ld1             {v4.4S}, [x4], #16
        fadd            v0.4S, v0.4S, v4.4S
        st1             {v0.4S}, [x0], #16      // write back over the first operand
        subs            x5, x5, #4              // four floats per pass -> 16 passes
        b.gt            1b
        ret
endfunc

// Sum of squares of a float array.
//
// In:       x0 = pointer to the floats
//           w1 = count; it is decremented by 2 for every 4 floats consumed,
//                so it counts float pairs (presumably complex samples --
//                confirm against the C prototype)
// Out:      s0 = sum of the squares (all four lanes of v0 hold the total)
// Clobbers: x0, w1, v0, v1, condition flags
//
// The count is tested after the loop body, so at least 4 floats are always
// read, and an odd count is effectively rounded up to the next even value.
function ff_sbr_sum_square_neon, export=1
        movi            v0.4S, #0               // four partial sums
1:      ld1             {v1.4S}, [x0], #16
        fmla            v0.4S, v1.4S, v1.4S     // partial[j] += x[j]^2
        subs            w1, w1, #2
        b.gt            1b
        faddp           v0.4S, v0.4S, v0.4S     // {p0+p1, p2+p3, p0+p1, p2+p3}
        faddp           v0.4S, v0.4S, v0.4S     // every lane = p0+p1+p2+p3
        ret
endfunc

// Negate every odd-indexed float of a 64-float array in place:
//     x[2k+1] = -x[2k+1],   k = 0..31      (x[2k] is rewritten unchanged)
//
// In:       x0 = x (64 floats, read and written)
// Clobbers: x0, x1, v0-v3, v5
//
// ld2 splits 8 floats into even-index lanes and odd-index lanes, the sign
// bit of the odd lanes is flipped with eor, and st2 re-interleaves them.
// The code is unrolled so that loads run two groups ahead of the stores:
// 2 + 3*2 loads and 3*2 + 2 stores, each covering 8 floats.
function ff_sbr_neg_odd_64_neon, export=1
        mov             x1, x0                  // x1 = store pointer, x0 = load pointer
        movi            v5.4S, #1<<7, lsl #24   // 0x80000000 per lane: float sign bit
        ld2             {v0.4S, v1.4S}, [x0], #32   // v0 = even floats, v1 = odd floats
        eor             v1.16B, v1.16B, v5.16B
        ld2             {v2.4S, v3.4S}, [x0], #32
.rept 3
        st2             {v0.4S, v1.4S}, [x1], #32
        eor             v3.16B, v3.16B, v5.16B
        ld2             {v0.4S, v1.4S}, [x0], #32
        st2             {v2.4S, v3.4S}, [x1], #32
        eor             v1.16B, v1.16B, v5.16B
        ld2             {v2.4S, v3.4S}, [x0], #32
.endr
        eor             v3.16B, v3.16B, v5.16B  // drain the last two groups
        st2             {v0.4S, v1.4S}, [x1], #32
        st2             {v2.4S, v3.4S}, [x1], #32
        ret
endfunc

// QMF pre-shuffle, in place on a float array z (x0).
// Reads z[0..63] and writes z[64..127]:
//     z[64]          =  z[0]
//     z[65]          =  z[1]
//     z[64 + 2k]     = -z[64 - k]
//     z[64 + 2k + 1] =  z[k + 1]          for k = 1..31
//
// In:       x0 = z (128 floats)
// Clobbers: x0-x4, v0-v2, v6
//
// The unrolled body handles k = 1..28 (four k per pass, seven passes); the
// tail handles k = 29..31 with a 2-lane pair plus one single-lane pair.
function ff_sbr_qmf_pre_shuffle_neon, export=1
        add             x1, x0, #60*4           // x1 = &z[60], walks downwards
        add             x2, x0, #64*4           // x2 = &z[64], output pointer
        mov             x3, #-16                // step for x1: back 4 floats
        mov             x4, #-4                 // step for x1 in the tail: back 1 float
        movi            v6.4S, #1<<7, lsl #24   // sign-bit mask
        ld1             {v0.2S}, [x0], #8
        st1             {v0.2S}, [x2], #8       // z[64..65] = z[0..1]
.rept 7
        ld1             {v1.4S}, [x1], x3       // four floats from the descending side
        ld1             {v2.4S}, [x0], #16      // four floats from the ascending side
        eor             v1.16B, v1.16B, v6.16B  // negate
        rev64           v1.4S, v1.4S            // swap within each 64-bit half ...
        ext             v1.16B, v1.16B, v1.16B, #8  // ... then swap halves: full reversal
        st2             {v1.4S, v2.4S}, [x2], #32   // interleave {-desc, asc}
.endr
        add             x1, x1, #8              // x1 = &z[34]
        ld1             {v1.2S}, [x1], x4       // v1 = {z[34], z[35], 0, 0}; x1 = &z[33]
        ld1             {v2.2S}, [x0], #8       // v2 = {z[30], z[31], 0, 0}; x0 = &z[32]
        ld1             {v1.S}[3], [x1]         // v1 lane 3 = z[33]
        ld1             {v2.S}[2], [x0]         // v2 lane 2 = z[32]
        eor             v1.16B, v1.16B, v6.16B
        rev64           v1.4S, v1.4S            // v1 = {-z[35], -z[34], -z[33], -0}
        st2             {v1.2S, v2.2S}, [x2], #16   // z[122..125]
        st2             {v1.S, v2.S}[2], [x2]       // z[126] = -z[33], z[127] = z[32]
        ret
endfunc

// QMF post-shuffle: build 32 float pairs W (x0) from 64 floats z (x1):
//     W[2k]     = -z[63 - k]
//     W[2k + 1] =  z[k]                   for k = 0..31
//
// In:       x0 = W (64 floats written)
//           x1 = z (64 floats read)
// Clobbers: x0-x4, v0, v1, v6, condition flags
function ff_sbr_qmf_post_shuffle_neon, export=1
        add             x2, x1, #60*4           // x2 = &z[60], walks downwards
        mov             x3, #-16                // step for x2: back 4 floats
        mov             x4, #32                 // k values still to produce
        movi            v6.4S, #1<<7, lsl #24   // sign-bit mask
1:      ld1             {v0.4S}, [x2], x3       // four floats from the top of z
        ld1             {v1.4S}, [x1], #16      // four floats from the bottom of z
        eor             v0.16B, v0.16B, v6.16B  // negate
        rev64           v0.4S, v0.4S            // rev64 + ext #8 = reverse all
        ext             v0.16B, v0.16B, v0.16B, #8  // four lanes
        st2             {v0.4S, v1.4S}, [x0], #32   // interleave {-top, bottom}
        subs            x4, x4, #4
        b.gt            1b
        ret
endfunc

// De-interleave 64 floats from src (x1) into v (x0), negating one half:
//     v[i]      =  src[63 - 2i]
//     v[63 - i] = -src[62 - 2i]           for i = 0..31
//
// In:       x0 = v   (64 floats written)
//           x1 = src (64 floats read)
// Clobbers: x0-x4, v0-v2, condition flags
function ff_sbr_qmf_deint_neg_neon, export=1
        add             x1, x1, #56*4           // x1 = &src[56], walks downwards
        add             x2, x0, #60*4           // x2 = &v[60], walks downwards
        mov             x3, #-32                // step for x1: back 8 floats
        mov             x4, #32                 // i values still to produce
        movi            v2.4S, #1<<7, lsl #24   // sign-bit mask
1:      ld2             {v0.4S, v1.4S}, [x1], x3    // v0 = even-index, v1 = odd-index floats
        eor             v0.16B, v0.16B, v2.16B  // negate the even-index floats
        rev64           v1.4S, v1.4S            // reverse the odd-index floats so the
        ext             v1.16B, v1.16B, v1.16B, #8  // highest source index comes first
        st1             {v0.4S}, [x2]           // upper half of v, filled from the top down
        st1             {v1.4S}, [x0], #16      // lower half of v, filled from the bottom up
        sub             x2, x2, #16
        subs            x4, x4, #4
        b.gt            1b
        ret
endfunc

// Butterfly of two 64-float arrays into a 128-float array:
//     v[i]       = src0[i] - src1[63 - i]
//     v[127 - i] = src0[i] + src1[63 - i]      for i = 0..63
//
// In:       x0 = v    (128 floats written)
//           x1 = src0 (64 floats read, ascending)
//           x2 = src1 (64 floats read, descending)
// Clobbers: x0-x5, v0-v3, condition flags
function ff_sbr_qmf_deint_bfly_neon, export=1
        add             x2, x2, #60*4           // x2 = &src1[60], walks downwards
        add             x3, x0, #124*4          // x3 = &v[124], walks downwards
        mov             x4, #64                 // i values still to produce
        mov             x5, #-16                // step for x2/x3: back 4 floats
1:      ld1             {v0.4S}, [x1], #16      // v0 = src0[i .. i+3]
        ld1             {v1.4S}, [x2], x5       // v1 = src1[60-i .. 63-i]
        rev64           v2.4S, v0.4S
        ext             v2.16B, v2.16B, v2.16B, #8  // v2 = v0 with lanes reversed
        rev64           v3.4S, v1.4S
        ext             v3.16B, v3.16B, v3.16B, #8  // v3 = v1 with lanes reversed
        fadd            v1.4S, v1.4S, v2.4S     // sums, in descending-i lane order
        fsub            v0.4S, v0.4S, v3.4S     // differences, in ascending-i lane order
        st1             {v0.4S}, [x0], #16
        st1             {v1.4S}, [x3], x5
        subs            x4, x4, #4
        b.gt            1b
        ret
endfunc

// Two-tap complex prediction filter over an array of float pairs (re, im):
//     dst[i] = src[i] + c1 * src[i-1] + c2 * src[i-2]     for i = start .. end-1
// with complex coefficients
//     c1 = bw   * a0        (a0 = the float pair at x2)
//     c2 = bw^2 * a1        (a1 = the float pair at x3)
//
// In:       x0 = dst pairs, x1 = src pairs
//           x2 -> a0 (2 floats), x3 -> a1 (2 floats)
//           s0 = bw
//           w4 = start, w5 = end (sign-extended here)
// Clobbers: x0, x1, x4-x7, v0-v5, v7, condition flags
//
// The count is tested after the loop body, so one element is produced even
// when end <= start. src[start-2] and src[start-1] are read.
function ff_sbr_hf_gen_neon, export=1
        sxtw            x4, w4
        sxtw            x5, w5
        movrel          x6, factors
        ld1             {v7.4S}, [x6]           // v7 = {1, -1, 1, -1}
        dup             v1.4S, v0.S[0]          // v1 = {bw, bw, bw, bw}
        mov             v2.8B, v1.8B            // v2 = {bw, bw, 0, 0}
        mov             v2.S[2], v7.S[0]
        mov             v2.S[3], v7.S[0]        // v2 = {bw, bw, 1, 1}
        fmul            v1.4S, v1.4S, v2.4S     // v1 = {bw^2, bw^2, bw, bw}
        ld1             {v0.D}[0], [x3]         // v0 low  = a1 (bw was copied out above)
        ld1             {v0.D}[1], [x2]         // v0 high = a0
        fmul            v0.4S, v0.4S, v1.4S     // v0 = {c2.re,  c2.im, c1.re,  c1.im}
        fmul            v1.4S, v0.4S, v7.4S     // v1 = {c2.re, -c2.im, c1.re, -c1.im}
        rev64           v0.4S, v0.4S            // v0 = {c2.im,  c2.re, c1.im,  c1.re}
        sub             x7, x5, x4              // x7 = end - start = pairs to produce
        add             x0, x0, x4, lsl #3      // x0 = &dst[start]
        add             x1, x1, x4, lsl #3
        sub             x1, x1, #16             // x1 = &src[start - 2]
1:      ld1             {v2.4S}, [x1], #16      // v2 = {src[i-2], src[i-1]}
        ld1             {v3.2S}, [x1]           // v3 = src[i]
        fmul            v4.4S, v2.4S, v1.4S     // terms of the real part
        fmul            v5.4S, v2.4S, v0.4S     // terms of the imaginary part
        faddp           v4.4S, v4.4S, v4.4S
        faddp           v5.4S, v5.4S, v5.4S
        faddp           v4.4S, v4.4S, v4.4S     // every lane of v4 = real sum
        faddp           v5.4S, v5.4S, v5.4S     // every lane of v5 = imaginary sum
        mov             v4.S[1], v5.S[0]        // v4 low = {re, im}
        fadd            v4.2S, v4.2S, v3.2S     // + src[i]
        st1             {v4.2S}, [x0], #8
        sub             x1, x1, #8              // net advance of x1: one pair per pass
        subs            x7, x7, #1
        b.gt            1b
        ret
endfunc

// Scale strided float pairs by a per-element real gain:
//     dst[m] = src[x4 + 40*m] * g[m]           for m = 0 .. w3-1
// where dst and src are indexed in float pairs (8 bytes) and both floats of
// the pair are multiplied by the same gain.
//
// In:       x0 = dst pairs
//           x1 = src pairs
//           x2 = g (one float per output pair)
//           w3 = number of pairs to produce
//           w4 = index of the first source pair
// Clobbers: x0-x5, v0-v2, condition flags
//
// The count is tested after the loop body, so one pair is processed even
// when w3 <= 0.
// NOTE(review): sxtw keeps only the low 32 bits of x4. Confirm the C
// prototype passes this argument as an int; for a 64-bit type the upper
// half would be discarded here.
function ff_sbr_hf_g_filt_neon, export=1
        sxtw            x3, w3
        sxtw            x4, w4
        mov             x5, #40*2*4             // source stride: 40 pairs = 320 bytes
        add             x1, x1, x4, lsl #3      // x1 = &src[x4]
1:      ld1             {v0.2S}, [x1], x5       // one pair; upper half of v0 is zeroed
        ld1             {v1.S}[0], [x2], #4     // its gain
        fmul            v2.4S, v0.4S, v1.S[0]
        st1             {v2.2S}, [x0], #8       // only the low pair is stored
        subs            x3, x3, #1
        b.gt            1b
        ret
endfunc

// Autocorrelation sums over 40 complex float samples x[0..39] (re, im pairs).
//
// In:       x0 = x   (80 floats read)
//           x1 = out (floats written at the byte offsets listed below)
// Clobbers: x0-x3, v0-v5, v16-v23, condition flags
//
// Values stored, with conj() the complex conjugate:
//   out +  0 : sum over i = 1..38 of x[i+1] * conj(x[i])     (re, im)
//   out +  8 : sum over i = 0..37 of x[i+2] * conj(x[i])     (re, im)
//   out + 16 : sum over i = 1..38 of |x[i]|^2                (one float)
//   out + 24 : sum over i = 0..37 of x[i+1] * conj(x[i])     (re, im)
//   out + 40 : sum over i = 0..37 of |x[i]|^2                (one float)
// Bytes 20..23 and 32..39 of out are not written.
//
// The loop accumulates i = 0..37. The i = 1..38 variants are derived from
// those sums by adding the i = 38 term and subtracting the i = 0 term.
function ff_sbr_autocorrelate_neon, export=1
        mov             x2, #38                 // loop passes (i = 0..37)
        movrel          x3, factors
        ld1             {v0.4S}, [x3]           // v0 = {1, -1, 1, -1}
        movi            v1.4S, #0               // v1 = sum of {re^2, im^2}
        movi            v2.4S, #0               // v2 = sum of {x[i+1], x[i+2]} * x[i].re
        movi            v3.4S, #0               // v3 = sum of {x[i+1], x[i+2]} * x[i].im
        ld1             {v4.2S}, [x0], #8       // v4 = x[0]
        ld1             {v5.2S}, [x0], #8       // v5 = x[1]
        fmul            v16.2S, v4.2S, v4.2S    // i = 0 terms, kept so they can be
        fmul            v17.2S, v5.2S, v4.S[0]  // subtracted again after the loop
        fmul            v18.2S, v5.2S, v4.S[1]
1:      ld1             {v5.D}[1], [x0], #8     // v5 = {x[i+1], x[i+2]}, v4 = x[i]
        fmla            v1.2S, v4.2S, v4.2S
        fmla            v2.4S, v5.4S, v4.S[0]
        fmla            v3.4S, v5.4S, v4.S[1]
        mov             v4.D[0], v5.D[0]        // slide the window by one sample
        mov             v5.D[0], v5.D[1]
        subs            x2, x2, #1
        b.gt            1b
        fmul            v19.2S, v4.2S, v4.2S    // i = 38 terms (v4 = x[38], v5 low = x[39])
        fmul            v20.2S, v5.2S, v4.S[0]
        fmul            v21.2S, v5.2S, v4.S[1]
        fadd            v22.4S, v2.4S, v20.4S   // low half: move lag-1 range to i = 1..38;
        fsub            v22.4S, v22.4S, v17.4S  // high half (lag 2) is unchanged because
        fadd            v23.4S, v3.4S, v21.4S   // v17/v18/v20/v21 have zero upper halves
        fsub            v23.4S, v23.4S, v18.4S
        rev64           v23.4S, v23.4S          // line up the .im products with the .re ones
        fmul            v23.4S, v23.4S, v0.4S   // and negate the second of each pair
        fadd            v22.4S, v22.4S, v23.4S  // complex x[i+lag] * conj(x[i]) sums
        st1             {v22.4S}, [x1], #16     // out + 0 .. out + 15
        fadd            v23.2S, v1.2S, v19.2S
        fsub            v23.2S, v23.2S, v16.2S  // {re^2, im^2} sums moved to i = 1..38
        faddp           v23.2S, v23.2S, v23.2S
        st1             {v23.S}[0], [x1]        // out + 16
        add             x1, x1, #8
        rev64           v3.2S, v3.2S            // same combination for the unshifted
        fmul            v3.2S, v3.2S, v0.2S     // lag-1 sums (i = 0..37)
        fadd            v2.2S, v2.2S, v3.2S
        st1             {v2.2S}, [x1]           // out + 24
        add             x1, x1, #16
        faddp           v1.2S, v1.2S, v1.2S     // re^2 + im^2 over i = 0..37
        st1             {v1.S}[0], [x1]         // out + 40
        ret
endfunc

// Shared body of the ff_sbr_hf_apply_noise_N_neon functions.
// For each float pair y[m] of the in/out array it performs one of two
// updates, chosen by the matching s[m]:
//     s[m] != 0 :  y[m] += w * s[m]          (w = the two weights for this pair in v1)
//     s[m] == 0 :  y[m] += noise[idx] * q[m] (noise[idx] is a float pair)
// idx starts at (w3 + 1) & 0x1ff and advances by one per pair.
//
// Expects:  x0 = y (in/out float pairs)
//           x1 = s (one float per pair)
//           x2 = q (one float per pair)
//           w3 = starting noise index, before the +1
//           w5 = number of pairs
//           v1 = four weights covering two consecutive pairs, set up by the
//                function that expands the macro
// Clobbers: x0-x3, x5, x7, x8, v2-v7, condition flags; defines local label 1
//
// Two pairs are processed per pass and the count is tested after the body,
// so an odd or non-positive w5 still processes a full extra group.
// NOTE(review): when the masked index is 511 the 16-byte table load also
// covers the 8 bytes after entry 511, while the index itself wraps at 512.
// Confirm that ff_sbr_noise_table is padded or that this is acceptable.
.macro apply_noise_common
        sxtw            x3, w3
        sxtw            x5, w5
        movrel          x7, X(ff_sbr_noise_table)
        add             x3, x3, #1
1:      and             x3, x3, #0x1ff          // wrap the noise index to 0..511
        add             x8, x7, x3, lsl #3      // x8 = &noise[idx], 8 bytes per entry
        add             x3, x3, #2              // two entries consumed per pass
        ld1             {v2.4S}, [x0]           // v2 = two y pairs
        ld1             {v3.2S}, [x1], #8       // two s values
        ld1             {v4.2S}, [x2], #8       // two q values
        ld1             {v5.4S}, [x8]           // two noise pairs
        mov             v6.16B, v2.16B          // second copy of y for the s != 0 result
        zip1            v3.4S, v3.4S, v3.4S     // {s0, s0, s1, s1}
        zip1            v4.4S, v4.4S, v4.4S     // {q0, q0, q1, q1}
        fmla            v6.4S, v1.4S, v3.4S     // v6 = y + w * s
        fmla            v2.4S, v5.4S, v4.4S     // v2 = y + noise * q
        fcmeq           v7.4S, v3.4S, #0        // all-ones lanes where s == 0
        bif             v2.16B, v6.16B, v7.16B  // where s != 0 take v6, else keep v2
        st1             {v2.4S}, [x0], #16
        subs            x5, x5, #2
        b.gt            1b
.endm

// Variant 0 of the noise/sinusoid application: loads the weights
// {+1, 0, +1, 0} from phi_noise_0 into v1 and runs apply_noise_common.
// Where s[m] is non-zero, +s[m] is added to the first float of pair m.
//
// In (as consumed by apply_noise_common):
//           x0 = in/out float pairs, x1 = s, x2 = q,
//           w3 = starting noise index, w5 = number of pairs
//           x4 is not read by this variant.
// Clobbers: x0-x3, x5, x7-x9, v1-v7, condition flags
function ff_sbr_hf_apply_noise_0_neon, export=1
        movrel          x9, phi_noise_0
        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
endfunc

// Variant 1 of the noise/sinusoid application: picks one of the two rows of
// phi_noise_1 by the low bit of x4, loads it into v1 and runs
// apply_noise_common.
//     x4 even: weights { 0, +1,  0, -1 }
//     x4 odd : weights { 0, -1,  0, +1 }
// Where s[m] is non-zero, +/- s[m] is added to the second float of pair m,
// with the sign alternating from pair to pair.
//
// In (as consumed by apply_noise_common):
//           x0 = in/out float pairs, x1 = s, x2 = q,
//           w3 = starting noise index, w5 = number of pairs
//           x4 = value whose bit 0 selects the starting sign
// Clobbers: x0-x5, x7-x9, v1-v7, condition flags
function ff_sbr_hf_apply_noise_1_neon, export=1
        movrel          x9, phi_noise_1
        and             x4, x4, #1
        add             x9, x9, x4, lsl #4      // row stride is 16 bytes
        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
endfunc

// Variant 2 of the noise/sinusoid application: loads the weights
// {-1, 0, -1, 0} from phi_noise_2 into v1 and runs apply_noise_common.
// Where s[m] is non-zero, s[m] is subtracted from the first float of pair m.
//
// In (as consumed by apply_noise_common):
//           x0 = in/out float pairs, x1 = s, x2 = q,
//           w3 = starting noise index, w5 = number of pairs
//           x4 is not read by this variant.
// Clobbers: x0-x3, x5, x7-x9, v1-v7, condition flags
function ff_sbr_hf_apply_noise_2_neon, export=1
        movrel          x9, phi_noise_2
        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
endfunc

// Variant 3 of the noise/sinusoid application: picks one of the two rows of
// phi_noise_3 by the low bit of x4, loads it into v1 and runs
// apply_noise_common.
//     x4 even: weights { 0, -1,  0, +1 }
//     x4 odd : weights { 0, +1,  0, -1 }
// Where s[m] is non-zero, -/+ s[m] is added to the second float of pair m,
// with the sign alternating from pair to pair.
//
// In (as consumed by apply_noise_common):
//           x0 = in/out float pairs, x1 = s, x2 = q,
//           w3 = starting noise index, w5 = number of pairs
//           x4 = value whose bit 0 selects the starting sign
// Clobbers: x0-x5, x7-x9, v1-v7, condition flags
function ff_sbr_hf_apply_noise_3_neon, export=1
        movrel          x9, phi_noise_3
        and             x4, x4, #1
        add             x9, x9, x4, lsl #4      // row stride is 16 bytes
        ld1             {v1.4S}, [x9]
        apply_noise_common
        ret
endfunc
