/*
 * Copyright (c) 2012 Mans Rullgard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ z[n] += z[n+64] + z[n+128] + z[n+192] + z[n+256], n = 0..63
function ff_sbr_sum64x5_neon, export=1
        push            {lr}
        add             r1,  r0,  # 64*4
        add             r2,  r0,  #128*4
        add             r3,  r0,  #192*4
        add             lr,  r0,  #256*4
        mov             r12, #64
1:
        vld1.32         {q0},     [r0,:128]
        vld1.32         {q1},     [r1,:128]!
        vadd.f32        q0,  q0,  q1
        vld1.32         {q2},     [r2,:128]!
        vadd.f32        q0,  q0,  q2
        vld1.32         {q3},     [r3,:128]!
        vadd.f32        q0,  q0,  q3
        vld1.32         {q8},     [lr,:128]!
        vadd.f32        q0,  q0,  q8
        vst1.32         {q0},     [r0,:128]!
        subs            r12, #4
        bgt             1b
        pop             {pc}
endfunc

@ return the sum of x[i][0]^2 + x[i][1]^2 over n complex values (r1 = n)
function ff_sbr_sum_square_neon, export=1
        vmov.f32        q0,  #0.0
1:
        vld1.32         {q1},     [r0,:128]!
        vmla.f32        q0,  q1,  q1
        subs            r1,  r1,  #2
        bgt             1b
        vadd.f32        d0,  d0,  d1
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

@ flip the sign bit of every odd-indexed element of a 64-float array
function ff_sbr_neg_odd_64_neon, export=1
        mov             r1,  r0
        vmov.i32        q8,  #1<<31
        vld2.32         {q0,q1},  [r0,:128]!
        veor            q1,  q1,  q8
        vld2.32         {q2,q3},  [r0,:128]!
  .rept 3
        vst2.32         {q0,q1},  [r1,:128]!
        veor            q3,  q3,  q8
        vld2.32         {q0,q1},  [r0,:128]!
        vst2.32         {q2,q3},  [r1,:128]!
        veor            q1,  q1,  q8
        vld2.32         {q2,q3},  [r0,:128]!
  .endr
        veor            q3,  q3,  q8
        vst2.32         {q0,q1},  [r1,:128]!
        vst2.32         {q2,q3},  [r1,:128]!
        bx              lr
endfunc

@ z[64] = z[0], z[65] = z[1]; then z[64+2k] = -z[64-k], z[65+2k] = z[k+1]
function ff_sbr_qmf_pre_shuffle_neon, export=1
        add             r1,  r0,  #60*4
        add             r2,  r0,  #64*4
        vld1.32         {d0},     [r0,:64]!
        vst1.32         {d0},     [r2,:64]!
        mov             r3,  #-16
        mov             r12, #24
        vmov.i32        q8,  #1<<31
        vld1.32         {q0},     [r1,:128], r3
        vld1.32         {d2},     [r0,:64]!
1:
        vld1.32         {d3,d4},  [r0,:128]!
        vrev64.32       q0,  q0
        vld1.32         {q9},     [r1,:128], r3
        veor            q0,  q0,  q8
        vld1.32         {d5,d6},  [r0,:128]!
        vswp            d0,  d1
        vrev64.32       q9,  q9
        vst2.32         {q0,q1},  [r2,:64]!
        vmov            q10, q2
        veor            q9,  q9,  q8
        vmov            d2,  d6
        vswp            d18, d19
        vld1.32         {q0},     [r1,:128], r3
        vst2.32         {q9,q10}, [r2,:64]!
        subs            r12, r12, #8
        bgt             1b
        vld1.32         {d3,d4},  [r0,:128]!
        vrev64.32       q0,  q0
        vld1.32         {q9},     [r1,:128], r3
        veor            q0,  q0,  q8
        vld1.32         {d5},     [r0,:64]!
        vswp            d0,  d1
        vrev64.32       q9,  q9
        vst2.32         {q0,q1},  [r2,:64]!
        vswp            d4,  d5
        veor            q1,  q9,  q8
        vst2.32         {d3,d5},  [r2,:64]!
        vst2.32         {d2[0],d4[0]}, [r2,:64]!
        bx              lr
endfunc

@ W[k][0] = -z[63-k], W[k][1] = z[k], k = 0..31
function ff_sbr_qmf_post_shuffle_neon, export=1
        add             r2,  r1,  #60*4
        mov             r3,  #-16
        mov             r12, #32
        vmov.i32        q8,  #1<<31
        vld1.32         {q0},     [r2,:128], r3
        vld1.32         {q1},     [r1,:128]!
1:
        pld             [r2, #-32]
        vrev64.32       q0,  q0
        vswp            d2,  d3
        veor            q0,  q0,  q8
        vld1.32         {q2},     [r2,:128], r3
        vld1.32         {q3},     [r1,:128]!
        vst2.32         {d1,d3},  [r0,:128]!
        vst2.32         {d0,d2},  [r0,:128]!
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vswp            d6,  d7
        veor            q2,  q2,  q8
        vld1.32         {q0},     [r2,:128], r3
        vld1.32         {q1},     [r1,:128]!
        vst2.32         {d5,d7},  [r0,:128]!
        vst2.32         {d4,d6},  [r0,:128]!
        subs            r12, r12, #8
        bgt             1b
        bx              lr
endfunc
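
/* For reference, the two shuffle routines above compute the following
 * scalar logic (a sketch after the generic C versions in
 * libavcodec/sbrdsp.c; names W, z and k are illustrative):
 *
 *     z[64] = z[0];                        // qmf_pre_shuffle
 *     z[65] = z[1];
 *     for (k = 1; k < 32; k++) {
 *         z[64 + 2*k    ] = -z[64 - k];
 *         z[64 + 2*k + 1] =  z[ k + 1];
 *     }
 *
 *     for (k = 0; k < 32; k++) {           // qmf_post_shuffle
 *         W[k][0] = -z[63 - k];
 *         W[k][1] =  z[k];
 *     }
 */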

@ v[i] = src[63-2i], v[63-i] = -src[62-2i], i = 0..31
function ff_sbr_qmf_deint_neg_neon, export=1
        add             r1,  r1,  #60*4
        add             r2,  r0,  #62*4
        mov             r3,  #-16
        mov             r12, #32
        vmov.i32        d2,  #1<<31
1:
        vld2.32         {d0,d1},  [r1,:128], r3
        veor            d0,  d0,  d2
        vrev64.32       d1,  d1
        vst1.32         {d0},     [r2,:64]
        vst1.32         {d1},     [r0,:64]!
        sub             r2,  r2,  #8
        subs            r12, r12, #2
        bgt             1b
        bx              lr
endfunc

@ v[i] = src0[i] - src1[63-i], v[127-i] = src0[i] + src1[63-i], i = 0..63
function ff_sbr_qmf_deint_bfly_neon, export=1
        push            {lr}
        add             r2,  r2,  #60*4
        add             r3,  r0,  #124*4
        mov             r12, #64
        mov             lr,  #-16
1:
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q1},     [r2,:128], lr
        vrev64.32       q2,  q0
        vrev64.32       q3,  q1
        vadd.f32        d3,  d4,  d3
        vadd.f32        d2,  d5,  d2
        vsub.f32        d0,  d0,  d7
        vsub.f32        d1,  d1,  d6
        vst1.32         {q1},     [r3,:128], lr
        vst1.32         {q0},     [r0,:128]!
        subs            r12, r12, #4
        bgt             1b
        pop             {pc}
endfunc

@ Y[m] = X_high[m][ixh] * g_filt[m]: complex samples scaled by a real gain
function ff_sbr_hf_g_filt_neon, export=1
        ldr             r12, [sp]
        add             r1,  r1,  r12, lsl #3
        mov             r12, #40*2*4
        sub             r3,  r3,  #1
        vld2.32         {d2[],d3[]},[r2,:64]!
        vld1.32         {d0},     [r1,:64], r12
1:
        vld1.32         {d1},     [r1,:64], r12
        vmul.f32        q3,  q0,  q1
        vld2.32         {d2[],d3[]},[r2,:64]!
        vld1.32         {d0},     [r1,:64], r12
        vst1.32         {q3},     [r0,:64]!
        subs            r3,  r3,  #2
        bgt             1b
        it              lt
        bxlt            lr
        vmul.f32        d0,  d0,  d2
        vst1.32         {d0},     [r0,:64]!
        bx              lr
endfunc

@ high-frequency generator: two-tap complex prediction of X_high from X_low
function ff_sbr_hf_gen_neon, export=1
NOVFP   vld1.32         {d1[]},   [sp,:32]
VFP     vdup.32         d1,  d0[0]
        vmul.f32        d0,  d1,  d1
        vld1.32         {d3},     [r2,:64]
        vld1.32         {d2},     [r3,:64]
        vmul.f32        q0,  q0,  q1
        ldrd            r2,  r3,  [sp, #4*!HAVE_VFP_ARGS]
        vtrn.32         d0,  d1
        vneg.f32        d18, d1
        vtrn.32         d18, d1
        add             r0,  r0,  r2,  lsl #3
        add             r1,  r1,  r2,  lsl #3
        sub             r1,  r1,  #2*8
        sub             r3,  r3,  r2
        vld1.32         {q1},     [r1,:128]!
1:
        vld1.32         {q3},     [r1,:128]!
        vrev64.32       q2,  q1
        vmov            q8,  q3
        vrev64.32       d20, d3
        vrev64.32       d21, d6
        vmla.f32        q3,  q1,  d0[0]
        vmla.f32        d6,  d4,  d18
        vmla.f32        d7,  d20, d18
        vmla.f32        d6,  d3,  d0[1]
        vmla.f32        d7,  d16, d0[1]
        vmla.f32        d6,  d5,  d1
        vmla.f32        d7,  d21, d1
        vmov            q1,  q8
        vst1.32         {q3},     [r0,:128]!
        subs            r3,  r3,  #2
        bgt             1b
        bx              lr
endfunc

@ autocorrelation of x[0..39]: real/imag sums for lags 0, 1, 2 into phi
function ff_sbr_autocorrelate_neon, export=1
        vld1.32         {q0},     [r0,:128]!
        vmov.f32        q1,  #0.0
        vmov.f32        q3,  #0.0
        vmov.f32        d20, #0.0
        vmul.f32        d21, d1,  d1
        vmov            q8,  q0
        vmov            q11, q0
        mov             r12, #36
1:
        vld1.32         {q2},     [r0,:128]!
        vrev64.32       q12, q2
        vmla.f32        q10, q2,  q2
        vmla.f32        d2,  d1,  d4
        vmla.f32        d3,  d1,  d24
        vmla.f32        d6,  d0,  d4
        vmla.f32        d7,  d0,  d24
        vmla.f32        d2,  d4,  d5
        vmla.f32        d3,  d4,  d25
        vmla.f32        d6,  d1,  d5
        vmla.f32        d7,  d1,  d25
        vmov            q0,  q2
        subs            r12, r12, #2
        bgt             1b
        vld1.32         {q2},     [r0,:128]!
        vrev64.32       q12, q2
        vmla.f32        d2,  d1,  d4
        vmla.f32        d3,  d1,  d24
        vmla.f32        d6,  d0,  d4
        vmla.f32        d7,  d0,  d24
        vadd.f32        d20, d20, d21
        vrev64.32       d18, d17
        vmla.f32        d6,  d1,  d5
        vmla.f32        d7,  d1,  d25
        vmov            q0,  q1
        vmla.f32        d0,  d16, d17
        vmla.f32        d1,  d16, d18
        vmla.f32        d2,  d4,  d5
        vmla.f32        d3,  d4,  d25
        vneg.f32        s15, s15
        vmov            d21, d20
        vpadd.f32       d0,  d0,  d2
        vpadd.f32       d7,  d6,  d7
        vtrn.32         d1,  d3
        vsub.f32        d6,  d1,  d3
        vmla.f32        d20, d22, d22
        vmla.f32        d21, d4,  d4
        vtrn.32         d0,  d6
        vpadd.f32       d20, d20, d21
        vst1.32         {q3},     [r1,:128]!
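
/* ff_sbr_hf_gen_neon above mirrors this scalar reference (a sketch after
 * sbr_hf_gen_c() in libavcodec/sbrdsp.c; alpha[] is the bw-weighted filter
 * built from alpha0 and alpha1):
 *
 *     alpha[0] = alpha1[0] * bw * bw;
 *     alpha[1] = alpha1[1] * bw * bw;
 *     alpha[2] = alpha0[0] * bw;
 *     alpha[3] = alpha0[1] * bw;
 *     for (i = start; i < end; i++) {
 *         X_high[i][0] = X_low[i-2][0] * alpha[0] -
 *                        X_low[i-2][1] * alpha[1] +
 *                        X_low[i-1][0] * alpha[2] -
 *                        X_low[i-1][1] * alpha[3] +
 *                        X_low[i][0];
 *         X_high[i][1] = X_low[i-2][1] * alpha[0] +
 *                        X_low[i-2][0] * alpha[1] +
 *                        X_low[i-1][1] * alpha[2] +
 *                        X_low[i-1][0] * alpha[3] +
 *                        X_low[i][1];
 *     }
 */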
        vst1.32         {d20[1]}, [r1,:32]
        add             r1,  r1,  #2*4
        vst1.32         {d0},     [r1,:64]
        add             r1,  r1,  #4*4
        vst1.32         {d20[0]}, [r1,:32]
        bx              lr
endfunc

@ noise phases 0-3: add s_m (phase-dependent sign) or q_filt-scaled noise to Y
function ff_sbr_hf_apply_noise_0_neon, export=1
        vmov.i32        d3,  #0
.Lhf_apply_noise_0:
        push            {r4,lr}
        movrelx         r4,  X(ff_sbr_noise_table)
        ldr             r12, [sp, #12]
        add             r3,  r3,  #1
        bfc             r3,  #9,  #23
        sub             r12, r12, #1
1:
        add             lr,  r4,  r3,  lsl #3
        vld2.32         {q0},     [r0,:64]
        vld2.32         {q3},     [lr,:64]
        vld1.32         {d2},     [r1,:64]!
        vld1.32         {d18},    [r2,:64]!
        vceq.f32        d16, d2,  #0
        veor            d2,  d2,  d3
        vmov            q2,  q0
        vmla.f32        d0,  d6,  d18
        vmla.f32        d1,  d7,  d18
        vadd.f32        d4,  d4,  d2
        add             r3,  r3,  #2
        bfc             r3,  #9,  #23
        vbif            d0,  d4,  d16
        vbif            d1,  d5,  d16
        vst2.32         {q0},     [r0,:64]!
        subs            r12, r12, #2
        bgt             1b
        blt             2f
        add             lr,  r4,  r3,  lsl #3
        vld1.32         {d0},     [r0,:64]
        vld1.32         {d6},     [lr,:64]
        vld1.32         {d2[]},   [r1,:32]!
        vld1.32         {d18[]},  [r2,:32]!
        vceq.f32        d4,  d2,  #0
        veor            d2,  d2,  d3
        vmov            d1,  d0
        vmla.f32        d0,  d6,  d18
        vadd.f32        s2,  s2,  s4
        vbif            d0,  d1,  d4
        vst1.32         {d0},     [r0,:64]!
2:
        pop             {r4,pc}
endfunc

function ff_sbr_hf_apply_noise_1_neon, export=1
        ldr             r12, [sp]
        push            {r4,lr}
        lsl             r12, r12, #31
        eor             lr,  r12, #1<<31
        vmov            d3,  r12, lr
.Lhf_apply_noise_1:
        movrelx         r4,  X(ff_sbr_noise_table)
        ldr             r12, [sp, #12]
        add             r3,  r3,  #1
        bfc             r3,  #9,  #23
        sub             r12, r12, #1
1:
        add             lr,  r4,  r3,  lsl #3
        vld2.32         {q0},     [r0,:64]
        vld2.32         {q3},     [lr,:64]
        vld1.32         {d2},     [r1,:64]!
        vld1.32         {d18},    [r2,:64]!
        vceq.f32        d16, d2,  #0
        veor            d2,  d2,  d3
        vmov            q2,  q0
        vmla.f32        d0,  d6,  d18
        vmla.f32        d1,  d7,  d18
        vadd.f32        d5,  d5,  d2
        add             r3,  r3,  #2
        bfc             r3,  #9,  #23
        vbif            d0,  d4,  d16
        vbif            d1,  d5,  d16
        vst2.32         {q0},     [r0,:64]!
        subs            r12, r12, #2
        bgt             1b
        blt             2f
        add             lr,  r4,  r3,  lsl #3
        vld1.32         {d0},     [r0,:64]
        vld1.32         {d6},     [lr,:64]
        vld1.32         {d2[]},   [r1,:32]!
        vld1.32         {d18[]},  [r2,:32]!
        vceq.f32        d4,  d2,  #0
        veor            d2,  d2,  d3
        vmov            d1,  d0
        vmla.f32        d0,  d6,  d18
        vadd.f32        s3,  s3,  s5
        vbif            d0,  d1,  d4
        vst1.32         {d0},     [r0,:64]!
2:
        pop             {r4,pc}
endfunc

function ff_sbr_hf_apply_noise_2_neon, export=1
        vmov.i32        d3,  #1<<31
        b               .Lhf_apply_noise_0
endfunc

function ff_sbr_hf_apply_noise_3_neon, export=1
        ldr             r12, [sp]
        push            {r4,lr}
        lsl             r12, r12, #31
        eor             lr,  r12, #1<<31
        vmov            d3,  lr,  r12
        b               .Lhf_apply_noise_1
endfunc
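
/* The four apply_noise entry points share one scalar behaviour (a sketch of
 * the shared helper in libavcodec/sbrdsp.c; the _0.._3 variants differ only
 * in the phase signs phi_sign0/phi_sign1 applied to s_m):
 *
 *     for (m = 0; m < m_max; m++) {
 *         noise = (noise + 1) & 0x1ff;
 *         if (s_m[m]) {
 *             Y[m][0] += s_m[m] * phi_sign0;
 *             Y[m][1] += s_m[m] * phi_sign1;
 *         } else {
 *             Y[m][0] += q_filt[m] * ff_sbr_noise_table[noise][0];
 *             Y[m][1] += q_filt[m] * ff_sbr_noise_table[noise][1];
 *         }
 *     }
 */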