/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// Target: AArch64 (A64 Advanced SIMD), GNU assembler syntax.
// All routines below are leaf functions that touch only x0-x9 and
// v0-v7 / v16-v23; none of them uses the stack.

// {+1, -1, +1, -1}: multiplying a vector by this negates its odd lanes.
const factors, align=4
        .float 1.0, -1.0, 1.0, -1.0
endconst

// Per-lane multipliers loaded into v1 by the ff_sbr_hf_apply_noise_*
// entry points.  phi_noise_1 and phi_noise_3 hold two 16-byte rows; the
// row is selected by bit 0 of x4.
const phi_noise_0, align=4
        .float 1.0, 0.0, 1.0, 0.0
endconst

const phi_noise_1, align=4
        .float 0.0,  1.0, 0.0, -1.0
        .float 0.0, -1.0, 0.0,  1.0
endconst

const phi_noise_2, align=4
        .float -1.0, 0.0, -1.0, 0.0
endconst

const phi_noise_3, align=4
        .float 0.0, -1.0, 0.0,  1.0
        .float 0.0,  1.0, 0.0, -1.0
endconst

// ff_sbr_sum64x5_neon
// In:    x0 = pointer to a float buffer (at least 320 floats are read)
// Does:  for k = 0..63:
//            buf[k] = buf[k] + buf[k+64] + buf[k+128] + buf[k+192] + buf[k+256]
//        The additions are performed left to right in that order.
// Clobb: x0-x5, v0-v4, flags
function ff_sbr_sum64x5_neon, export=1
        add             x1, x0, #64*4           // x1..x4 -> the four following 64-float rows
        add             x2, x0, #128*4
        add             x3, x0, #192*4
        add             x4, x0, #256*4
        mov             x5, #64                 // floats left to produce
1:
        ld1             {v0.4s}, [x0]           // no post-increment: same address is stored below
        ld1             {v1.4s}, [x1], #16
        fadd            v0.4s, v0.4s, v1.4s
        ld1             {v2.4s}, [x2], #16
        fadd            v0.4s, v0.4s, v2.4s
        ld1             {v3.4s}, [x3], #16
        fadd            v0.4s, v0.4s, v3.4s
        ld1             {v4.4s}, [x4], #16
        fadd            v0.4s, v0.4s, v4.4s
        st1             {v0.4s}, [x0], #16
        subs            x5, x5, #4              // four floats per pass
        b.gt            1b
        ret
endfunc

// ff_sbr_sum_square_neon
// In:    x0 = pointer to floats
//        w1 = element count; it drops by 2 for every 4 floats consumed,
//             i.e. one count unit covers two floats
// Out:   v0 = sum of squares of all floats consumed, replicated in every lane
// Note:  the loop body always runs at least once and handles two count
//        units per pass, so an odd count reads one extra unit.
// Clobb: x0, w1, v0, v1, flags
function ff_sbr_sum_square_neon, export=1
        movi            v0.4s, #0               // four partial sums
1:
        ld1             {v1.4s}, [x0], #16
        fmla            v0.4s, v1.4s, v1.4s     // acc += x * x, lane-wise
        subs            w1, w1, #2
        b.gt            1b
        faddp           v0.4s, v0.4s, v0.4s     // horizontal reduction, step 1
        faddp           v0.4s, v0.4s, v0.4s     // step 2: the total is now in every lane
        ret
endfunc

// ff_sbr_neg_odd_64_neon
// In:    x0 = pointer to 64 floats
// Does:  flips the sign bit of every odd-indexed float, in place.
// How:   ld2 de-interleaves 8 floats into even lanes (v0/v2) and odd lanes
//        (v1/v3); the odd vector is XORed with 0x80000000 and st2 writes
//        the pair back interleaved.  Loads run two groups ahead of stores.
// Clobb: x0, x1, v0-v3, v5
function ff_sbr_neg_odd_64_neon, export=1
        mov             x1, x0                  // x1 = store cursor, x0 = load cursor
        movi            v5.4s, #1<<7, lsl #24   // 0x80000000 in each lane: float sign bit
        ld2             {v0.4s, v1.4s}, [x0], #32
        eor             v1.16b, v1.16b, v5.16b
        ld2             {v2.4s, v3.4s}, [x0], #32
.rept 3
        st2             {v0.4s, v1.4s}, [x1], #32
        eor             v3.16b, v3.16b, v5.16b
        ld2             {v0.4s, v1.4s}, [x0], #32
        st2             {v2.4s, v3.4s}, [x1], #32
        eor             v1.16b, v1.16b, v5.16b
        ld2             {v2.4s, v3.4s}, [x0], #32
.endr
        eor             v3.16b, v3.16b, v5.16b  // drain the last two groups
        st2             {v0.4s, v1.4s}, [x1], #32
        st2             {v2.4s, v3.4s}, [x1], #32
        ret
endfunc

// ff_sbr_qmf_pre_shuffle_neon
// In:    x0 = z, pointer to floats; z[0..63] are read, z[64..127] are written
// Does:  z[64] = z[0], z[65] = z[1], then for k = 1..31:
//            z[64 + 2*k]     = -z[64 - k]
//            z[64 + 2*k + 1] =  z[k + 1]
// Clobb: x0-x4, v0-v2, v6
function ff_sbr_qmf_pre_shuffle_neon, export=1
        add             x1, x0, #60*4           // x1 walks down from z[60..63]
        add             x2, x0, #64*4           // x2 = output cursor at z[64]
        mov             x3, #-16
        mov             x4, #-4
        movi            v6.4s, #1<<7, lsl #24   // sign-bit mask
        ld1             {v0.2s}, [x0], #8       // z[0], z[1] are copied unchanged
        st1             {v0.2s}, [x2], #8
.rept 7
        ld1             {v1.4s}, [x1], x3       // four floats from the descending side
        ld1             {v2.4s}, [x0], #16      // four floats from the ascending side
        eor             v1.16b, v1.16b, v6.16b  // negate
        rev64           v1.4s, v1.4s            // rev64 + ext #8 = reverse all four lanes
        ext             v1.16b, v1.16b, v1.16b, #8
        st2             {v1.4s, v2.4s}, [x2], #32 // interleave: -z[hi], z[lo], ...
.endr
        // Tail: three output pairs remain (k = 29, 30, 31).
        add             x1, x1, #8              // x1 -> z[34]
        ld1             {v1.2s}, [x1], x4       // v1 = {z[34], z[35], 0, 0}; x1 -> z[33]
        ld1             {v2.2s}, [x0], #8       // v2 = {z[30], z[31], 0, 0}; x0 -> z[32]
        ld1             {v1.s}[3], [x1]         // v1[3] = z[33]
        ld1             {v2.s}[2], [x0]         // v2[2] = z[32]
        eor             v1.16b, v1.16b, v6.16b
        rev64           v1.4s, v1.4s            // v1 = {-z[35], -z[34], -z[33], ...}
        st2             {v1.2s, v2.2s}, [x2], #16
        st2             {v1.s, v2.s}[2], [x2]
        ret
endfunc

// ff_sbr_qmf_post_shuffle_neon
// In:    x0 = output pointer (64 floats are written)
//        x1 = z, input pointer (z[0..63] are read)
// Does:  for k = 0..31:
//            out[2*k]     = -z[63 - k]
//            out[2*k + 1] =  z[k]
// Clobb: x0-x4, v0, v1, v6, flags
function ff_sbr_qmf_post_shuffle_neon, export=1
        add             x2, x1, #60*4           // x2 walks down from z[60..63]
        mov             x3, #-16
        mov             x4, #32                 // output pairs left
        movi            v6.4s, #1<<7, lsl #24   // sign-bit mask
1:
        ld1             {v0.4s}, [x2], x3
        ld1             {v1.4s}, [x1], #16
        eor             v0.16b, v0.16b, v6.16b  // negate the descending side
        rev64           v0.4s, v0.4s            // rev64 + ext #8 = reverse all four lanes
        ext             v0.16b, v0.16b, v0.16b, #8
        st2             {v0.4s, v1.4s}, [x0], #32
        subs            x4, x4, #4
        b.gt            1b
        ret
endfunc

// ff_sbr_qmf_deint_neg_neon
// In:    x0 = v, output pointer (v[0..63] are written)
//        x1 = src, input pointer (src[0..63] are read)
// Does:  for i = 0..31:
//            v[i]      =  src[63 - 2*i]
//            v[63 - i] = -src[62 - 2*i]
// Clobb: x0-x4, v0-v2, flags
function ff_sbr_qmf_deint_neg_neon, export=1
        add             x1, x1, #56*4           // start at src[56..63], walk downwards
        add             x2, x0, #60*4           // x2 = descending output cursor (v[60..63])
        mov             x3, #-32
        mov             x4, #32                 // output pairs left
        movi            v2.4s, #1<<7, lsl #24   // sign-bit mask
1:
        ld2             {v0.4s, v1.4s}, [x1], x3 // v0 = even-indexed, v1 = odd-indexed floats
        eor             v0.16b, v0.16b, v2.16b  // negate the even-indexed ones
        rev64           v1.4s, v1.4s            // reverse the odd-indexed ones
        ext             v1.16b, v1.16b, v1.16b, #8
        st1             {v0.4s}, [x2]
        st1             {v1.4s}, [x0], #16
        sub             x2, x2, #16
        subs            x4, x4, #4
        b.gt            1b
        ret
endfunc

// ff_sbr_qmf_deint_bfly_neon
// In:    x0 = v, output pointer (v[0..127] are written)
//        x1 = src0, input pointer (src0[0..63] are read)
//        x2 = src1, input pointer (src1[0..63] are read)
// Does:  for i = 0..63:
//            v[i]       = src0[i] - src1[63 - i]
//            v[127 - i] = src0[i] + src1[63 - i]
// Clobb: x0-x5, v0-v3, flags
function ff_sbr_qmf_deint_bfly_neon, export=1
        add             x2, x2, #60*4           // src1 is consumed from its end
        add             x3, x0, #124*4          // x3 = descending output cursor (v[124..127])
        mov             x4, #64                 // input floats left
        mov             x5, #-16
1:
        ld1             {v0.4s}, [x1], #16      // src0, ascending
        ld1             {v1.4s}, [x2], x5       // src1, descending
        rev64           v2.4s, v0.4s            // v2 = src0 block, lane order reversed
        ext             v2.16b, v2.16b, v2.16b, #8
        rev64           v3.4s, v1.4s            // v3 = src1 block, lane order reversed
        ext             v3.16b, v3.16b, v3.16b, #8
        fadd            v1.4s, v1.4s, v2.4s     // sums, laid out for the descending store
        fsub            v0.4s, v0.4s, v3.4s     // differences, laid out for the ascending store
        st1             {v0.4s}, [x0], #16
        st1             {v1.4s}, [x3], x5
        subs            x4, x4, #4
        b.gt            1b
        ret
endfunc

// ff_sbr_hf_gen_neon
// In:    x0 = output pointer, indexed in 8-byte (two-float) elements
//        x1 = input pointer, indexed in 8-byte (two-float) elements
//        x2 = pointer to two floats (call them c0)
//        x3 = pointer to two floats (call them c1)
//        s0 = scalar bw
//        w4 = first element index, w5 = end index (both sign-extended)
// Does:  with A0 = c0 * bw and A1 = c1 * bw * bw, for i = w4 .. w5-1, treating
//        each two-float element as (re, im):
//            out[i] = in[i-2] * A1 + in[i-1] * A0 + in[i]    (complex arithmetic)
//        The body always runs at least once.
// Clobb: x0, x1, x4-x7, v0-v5, v7, flags
function ff_sbr_hf_gen_neon, export=1
        sxtw            x4, w4
        sxtw            x5, w5
        movrel          x6, factors
        ld1             {v7.4s}, [x6]           // v7 = {1, -1, 1, -1}
        dup             v1.4s, v0.s[0]          // v1 = {bw, bw, bw, bw}
        mov             v2.8b, v1.8b            // v2 = {bw, bw, -, -}
        mov             v2.s[2], v7.s[0]        // v2 = {bw, bw, 1, 1}
        mov             v2.s[3], v7.s[0]
        fmul            v1.4s, v1.4s, v2.4s     // v1 = {bw^2, bw^2, bw, bw}
        ld1             {v0.d}[0], [x3]         // v0 = {c1.re, c1.im, c0.re, c0.im}
        ld1             {v0.d}[1], [x2]
        fmul            v0.4s, v0.4s, v1.4s     // v0 = {A1.re, A1.im, A0.re, A0.im}
        fmul            v1.4s, v0.4s, v7.4s     // v1 = {A1.re, -A1.im, A0.re, -A0.im}: real-part weights
        rev64           v0.4s, v0.4s            // v0 = {A1.im, A1.re, A0.im, A0.re}: imag-part weights
        sub             x7, x5, x4              // iteration count
        add             x0, x0, x4, lsl #3      // &out[start]
        add             x1, x1, x4, lsl #3
        sub             x1, x1, #16             // &in[start - 2]
1:
        ld1             {v2.4s}, [x1], #16      // in[i-2], in[i-1]
        ld1             {v3.2s}, [x1]           // in[i]
        fmul            v4.4s, v2.4s, v1.4s     // four products of the real part
        fmul            v5.4s, v2.4s, v0.4s     // four products of the imaginary part
        faddp           v4.4s, v4.4s, v4.4s     // two pairwise adds reduce each
        faddp           v5.4s, v5.4s, v5.4s     // vector to a single sum
        faddp           v4.4s, v4.4s, v4.4s
        faddp           v5.4s, v5.4s, v5.4s
        mov             v4.s[1], v5.s[0]        // v4 = {re, im, ...}
        fadd            v4.2s, v4.2s, v3.2s     // + in[i]
        st1             {v4.2s}, [x0], #8
        sub             x1, x1, #8              // net advance of one element per pass
        subs            x7, x7, #1
        b.gt            1b
        ret
endfunc

// ff_sbr_hf_g_filt_neon
// In:    x0 = output pointer, two floats written per step
//        x1 = input base pointer
//        x2 = pointer to per-step scalar gains
//        w3 = number of steps (sign-extended; the body runs at least once)
//        w4 = start index in 8-byte units added to x1 (sign-extended)
// Does:  for m = 0 .. w3-1:
//            out[m] = in_elem(m) * gain[m]
//        where in_elem(m) is the two-float element at
//        x1 + w4*8 + m*320 bytes (320 = 40*2*4).
// Clobb: x0-x5, v0-v2, flags
function ff_sbr_hf_g_filt_neon, export=1
        sxtw            x3, w3
        sxtw            x4, w4
        mov             x5, #40*2*4             // byte stride between consecutive inputs
        add             x1, x1, x4, lsl #3
1:
        ld1             {v0.2s}, [x1], x5       // one two-float element
        ld1             {v1.s}[0], [x2], #4     // its gain
        fmul            v2.4s, v0.4s, v1.s[0]
        st1             {v2.2s}, [x0], #8       // only the low two lanes are meaningful
        subs            x3, x3, #1
        b.gt            1b
        ret
endfunc

// ff_sbr_autocorrelate_neon
// In:    x0 = x, pointer to 40 two-float elements (re, im); all 40 are read
//        x1 = output pointer; floats 0..3, 4, 6..7 and 10 are written
// Does:  accumulates, for i = 0..37,
//            S0 = sum of x[i].re^2 and x[i].im^2 (kept per lane in v1)
//            S1 = sum x[i+1] * conj(x[i])        (from v2/v3, low half)
//            S2 = sum x[i+2] * conj(x[i])        (from v2/v3, high half)
//        then stores
//            out[0..1]  = S1 + (i = 38 term) - (i = 0 term)
//            out[2..3]  = S2
//            out[4]     = S0 + |x[38]|^2 - |x[0]|^2
//            out[6..7]  = S1
//            out[10]    = S0
//        NOTE(review): the output offsets look like a float[3][2][2] layout -
//        confirm against the C prototype.
// Clobb: x0-x3, v0-v5, v16-v23, flags
function ff_sbr_autocorrelate_neon, export=1
        mov             x2, #38                 // loop trip count
        movrel          x3, factors
        ld1             {v0.4s}, [x3]           // {1, -1, 1, -1}
        movi            v1.4s, #0               // S0 partials {re^2, im^2}
        movi            v2.4s, #0               // next two elements times x[i].re
        movi            v3.4s, #0               // next two elements times x[i].im
        ld1             {v4.2s}, [x0], #8       // v4 = x[0]
        ld1             {v5.2s}, [x0], #8       // v5 = {x[1], -}
        fmul            v16.2s, v4.2s, v4.2s    // i = 0 terms, subtracted again below
        fmul            v17.2s, v5.2s, v4.s[0]
        fmul            v18.2s, v5.2s, v4.s[1]
1:
        ld1             {v5.d}[1], [x0], #8     // v5 = {x[i+1], x[i+2]}
        fmla            v1.2s, v4.2s, v4.2s
        fmla            v2.4s, v5.4s, v4.s[0]
        fmla            v3.4s, v5.4s, v4.s[1]
        mov             v4.d[0], v5.d[0]        // slide the window by one element
        mov             v5.d[0], v5.d[1]
        subs            x2, x2, #1
        b.gt            1b
        // Here v4 = x[38] and the low half of v5 = x[39].
        fmul            v19.2s, v4.2s, v4.2s    // i = 38 terms
        fmul            v20.2s, v5.2s, v4.s[0]
        fmul            v21.2s, v5.2s, v4.s[1]
        fadd            v22.4s, v2.4s, v20.4s   // add the i = 38 term, drop the i = 0 term;
        fsub            v22.4s, v22.4s, v17.4s  // the 2s writes above zeroed the upper lanes,
        fadd            v23.4s, v3.4s, v21.4s   // so the high half (S2) is unchanged
        fsub            v23.4s, v23.4s, v18.4s
        rev64           v23.4s, v23.4s          // swap re/im, then negate the odd lanes
        fmul            v23.4s, v23.4s, v0.4s   // to form the conjugate product
        fadd            v22.4s, v22.4s, v23.4s
        st1             {v22.4s}, [x1], #16     // out[0..3]
        fadd            v23.2s, v1.2s, v19.2s
        fsub            v23.2s, v23.2s, v16.2s
        faddp           v23.2s, v23.2s, v23.2s  // re^2 + im^2
        st1             {v23.s}[0], [x1]        // out[4]
        add             x1, x1, #8
        rev64           v3.2s, v3.2s
        fmul            v3.2s, v3.2s, v0.2s
        fadd            v2.2s, v2.2s, v3.2s
        st1             {v2.2s}, [x1]           // out[6..7]
        add             x1, x1, #16
        faddp           v1.2s, v1.2s, v1.2s
        st1             {v1.s}[0], [x1]         // out[10]
        ret
endfunc

// apply_noise_common - shared body of the ff_sbr_hf_apply_noise_* functions.
// In:    x0 = Y, pointer to two-float elements, updated in place
//        x1 = pointer to per-element scalars s
//        x2 = pointer to per-element scalars q
//        w3 = starting index into ff_sbr_noise_table (sign-extended)
//        w5 = element count (sign-extended); two elements per pass, and the
//             body always runs at least once
//        v1 = per-lane multipliers for s, preloaded by the caller
// Does:  per element m, with idx = (index + 1 + m) & 0x1ff for the first
//        element of each pass:
//            if s[m] != 0:  Y[m] += v1 * s[m]
//            else:          Y[m] += ff_sbr_noise_table[idx] * q[m]
// NOTE(review): the index is masked to 0..511 once per pass but 16 bytes
// (two 8-byte entries) are loaded, so an index of 511 reads 8 bytes past
// that entry - confirm that the table layout allows this.
// Clobb: x0-x3, x5, x7, x8, v2-v7, flags; uses local label 1.
.macro apply_noise_common
        sxtw            x3, w3
        sxtw            x5, w5
        movrel          x7, X(ff_sbr_noise_table)
        add             x3, x3, #1
1:
        and             x3, x3, #0x1ff          // wrap the table index
        add             x8, x7, x3, lsl #3      // 8 bytes per table entry
        add             x3, x3, #2
        ld1             {v2.4s}, [x0]           // two Y elements
        ld1             {v3.2s}, [x1], #8       // s[m], s[m+1]
        ld1             {v4.2s}, [x2], #8       // q[m], q[m+1]
        ld1             {v5.4s}, [x8]           // two table entries
        mov             v6.16b, v2.16b
        zip1            v3.4s, v3.4s, v3.4s     // {s0, s0, s1, s1}
        zip1            v4.4s, v4.4s, v4.4s     // {q0, q0, q1, q1}
        fmla            v6.4s, v1.4s, v3.4s     // candidate A: Y + v1 * s
        fmla            v2.4s, v5.4s, v4.4s     // candidate B: Y + noise * q
        fcmeq           v7.4s, v3.4s, #0        // all-ones where s == 0
        bif             v2.16b, v6.16b, v7.16b  // keep B where s == 0, else take A
        st1             {v2.4s}, [x0], #16
        subs            x5, x5, #2
        b.gt            1b
.endm

// ff_sbr_hf_apply_noise_0_neon: apply_noise_common with v1 = {1, 0, 1, 0}.
// Register inputs are those of apply_noise_common; x9 is also clobbered.
function ff_sbr_hf_apply_noise_0_neon, export=1
        movrel          x9, phi_noise_0
        ld1             {v1.4s}, [x9]
        apply_noise_common
        ret
endfunc

// ff_sbr_hf_apply_noise_1_neon: apply_noise_common with v1 taken from
// phi_noise_1, row (x4 & 1).  Register inputs are otherwise those of
// apply_noise_common; x4 and x9 are also clobbered.
function ff_sbr_hf_apply_noise_1_neon, export=1
        movrel          x9, phi_noise_1
        and             x4, x4, #1              // only the parity of x4 matters
        add             x9, x9, x4, lsl #4      // 16 bytes per row
        ld1             {v1.4s}, [x9]
        apply_noise_common
        ret
endfunc

// ff_sbr_hf_apply_noise_2_neon: apply_noise_common with v1 = {-1, 0, -1, 0}.
// Register inputs are those of apply_noise_common; x9 is also clobbered.
function ff_sbr_hf_apply_noise_2_neon, export=1
        movrel          x9, phi_noise_2
        ld1             {v1.4s}, [x9]
        apply_noise_common
        ret
endfunc

// ff_sbr_hf_apply_noise_3_neon: apply_noise_common with v1 taken from
// phi_noise_3, row (x4 & 1).  Register inputs are otherwise those of
// apply_noise_common; x4 and x9 are also clobbered.
function ff_sbr_hf_apply_noise_3_neon, export=1
        movrel          x9, phi_noise_3
        and             x4, x4, #1              // only the parity of x4 matters
        add             x9, x9, x4, lsl #4      // 16 bytes per row
        ld1             {v1.4s}, [x9]
        apply_noise_common
        ret
endfunc