/*
 * Bluetooth low-complexity, subband codec (SBC)
 *
 * Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org>
 * Copyright (C) 2008-2010 Nokia Corporation
 * Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org>
 * Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch>
 * Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * SBC ARM NEON optimizations
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

#define SBC_PROTO_FIXED_SCALE 16

function ff_sbc_analyze_4_neon, export=1
        /* TODO: merge even and odd cases (or even merge all four calls to this
         * function) in order to have only aligned reads from 'in' array
         * and reduce number of load instructions */
        vld1.16         {d4, d5}, [r0, :64]!
        vld1.16         {d8, d9}, [r2, :128]!

        vmull.s16       q0, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmull.s16       q1, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!

        vmlal.s16       q0, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q1, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q0, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmlal.s16       q1, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!

        vmlal.s16       q0, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q1, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q0, d4, d8
        vmlal.s16       q1, d5, d9

        vpadd.s32       d0, d0, d1
        vpadd.s32       d1, d2, d3

        vrshrn.s32      d0, q0, SBC_PROTO_FIXED_SCALE

        vld1.16         {d2, d3, d4, d5}, [r2, :128]!

        vdup.i32        d1, d0[1]    /* TODO: can be eliminated */
        vdup.i32        d0, d0[0]    /* TODO: can be eliminated */

        vmull.s16       q3, d2, d0
        vmull.s16       q4, d3, d0
        vmlal.s16       q3, d4, d1
        vmlal.s16       q4, d5, d1

        vpadd.s32       d0, d6, d7   /* TODO: can be eliminated */
        vpadd.s32       d1, d8, d9   /* TODO: can be eliminated */

        vst1.32         {d0, d1}, [r1, :128]

        bx              lr
endfunc
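
/*
 * ff_sbc_analyze_4_neon above and ff_sbc_analyze_8_neon below each compute
 * one block of the SBC analysis filterbank in 16x16->32 fixed point: each
 * subband accumulator is a 10-tap dot product between the input window and
 * one polyphase phase of the prototype filter, followed by a rounding shift
 * and a small cosine-modulation matrix multiply.  A rough scalar sketch
 * (illustrative only; the real 'consts' table in r2 interleaves both the
 * prototype and matrix coefficients in exactly the order the NEON loads
 * consume them, so the indexing below is simplified, and rshift_rnd()
 * stands for the rounding shift done by vrshrn/vrshr):
 *
 *     int16_t t[sb];                               // sb = 4 or 8
 *     for (i = 0; i < sb; i++) {
 *         int32_t acc = 0;
 *         for (j = 0; j < 10; j++)                 // 40 or 80 taps total
 *             acc += (int32_t) in[j * sb + i] * proto[j * sb + i];
 *         t[i] = rshift_rnd(acc, SBC_PROTO_FIXED_SCALE);
 *     }
 *     for (i = 0; i < sb; i++) {
 *         int32_t acc = 0;
 *         for (j = 0; j < sb; j++)
 *             acc += (int32_t) t[j] * cos_mat[i][j];
 *         out[i] = acc;
 *     }
 */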

function ff_sbc_analyze_8_neon, export=1
        /* TODO: merge even and odd cases (or even merge all four calls to this
         * function) in order to have only aligned reads from 'in' array
         * and reduce number of load instructions */
        vld1.16         {d4, d5}, [r0, :64]!
        vld1.16         {d8, d9}, [r2, :128]!

        vmull.s16       q6, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmull.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!
        vmull.s16       q8, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmull.s16       q9, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q6, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmlal.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!
        vmlal.s16       q8, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q9, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q6, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmlal.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!
        vmlal.s16       q8, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q9, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q6, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmlal.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!
        vmlal.s16       q8, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q9, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q6, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmlal.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!

        vmlal.s16       q8, d6, d10
        vmlal.s16       q9, d7, d11

        vpadd.s32       d0, d12, d13
        vpadd.s32       d1, d14, d15
        vpadd.s32       d2, d16, d17
        vpadd.s32       d3, d18, d19

        vrshr.s32       q0, q0, SBC_PROTO_FIXED_SCALE
        vrshr.s32       q1, q1, SBC_PROTO_FIXED_SCALE
        vmovn.s32       d0, q0
        vmovn.s32       d1, q1

        vdup.i32        d3, d1[1]    /* TODO: can be eliminated */
        vdup.i32        d2, d1[0]    /* TODO: can be eliminated */
        vdup.i32        d1, d0[1]    /* TODO: can be eliminated */
        vdup.i32        d0, d0[0]    /* TODO: can be eliminated */

        vld1.16         {d4, d5}, [r2, :128]!
        vmull.s16       q6, d4, d0
        vld1.16         {d6, d7}, [r2, :128]!
        vmull.s16       q7, d5, d0
        vmull.s16       q8, d6, d0
        vmull.s16       q9, d7, d0

        vld1.16         {d4, d5}, [r2, :128]!
        vmlal.s16       q6, d4, d1
        vld1.16         {d6, d7}, [r2, :128]!
        vmlal.s16       q7, d5, d1
        vmlal.s16       q8, d6, d1
        vmlal.s16       q9, d7, d1

        vld1.16         {d4, d5}, [r2, :128]!
        vmlal.s16       q6, d4, d2
        vld1.16         {d6, d7}, [r2, :128]!
        vmlal.s16       q7, d5, d2
        vmlal.s16       q8, d6, d2
        vmlal.s16       q9, d7, d2

        vld1.16         {d4, d5}, [r2, :128]!
        vmlal.s16       q6, d4, d3
        vld1.16         {d6, d7}, [r2, :128]!
        vmlal.s16       q7, d5, d3
        vmlal.s16       q8, d6, d3
        vmlal.s16       q9, d7, d3

        vpadd.s32       d0, d12, d13 /* TODO: can be eliminated */
        vpadd.s32       d1, d14, d15 /* TODO: can be eliminated */
        vpadd.s32       d2, d16, d17 /* TODO: can be eliminated */
        vpadd.s32       d3, d18, d19 /* TODO: can be eliminated */

        vst1.32         {d0, d1, d2, d3}, [r1, :128]

        bx              lr
endfunc

function ff_sbc_calc_scalefactors_neon, export=1
        @ parameters
        @ r0 = sb_sample_f
        @ r1 = scale_factor
        @ r2 = blocks
        @ r3 = channels
        @ r4 = subbands
        @ local variables
        @ r5 = in_loop_1
        @ r6 = in
        @ r7 = out_loop_1
        @ r8 = out
        @ r9 = ch
        @ r10 = sb
        @ r11 = inc
        @ r12 = blk

        push            {r1-r2, r4-r12}
        ldr             r4, [sp, #44]
        mov             r11, #64

        mov             r9, #0
1:
        add             r5, r0, r9, lsl#5
        add             r7, r1, r9, lsl#5

        mov             r10, #0
2:
        add             r6, r5, r10, lsl#2
        add             r8, r7, r10, lsl#2
        mov             r12, r2

        vmov.s32        q0, #0
        vmov.s32        q1, #0x8000 @ 1 << SCALE_OUT_BITS
        vmov.s32        q14, #1
        vmov.s32        q15, #16    @ 31 - SCALE_OUT_BITS
        vadd.s32        q1, q1, q14
3:
        vld1.32         {d16, d17}, [r6, :128], r11
        vabs.s32        q8, q8
        vld1.32         {d18, d19}, [r6, :128], r11
        vabs.s32        q9, q9
        vld1.32         {d20, d21}, [r6, :128], r11
        vabs.s32        q10, q10
        vld1.32         {d22, d23}, [r6, :128], r11
        vabs.s32        q11, q11
        vmax.s32        q0, q0, q8
        vmax.s32        q1, q1, q9
        vmax.s32        q0, q0, q10
        vmax.s32        q1, q1, q11
        subs            r12, r12, #4
        bgt             3b
        vmax.s32        q0, q0, q1
        vsub.s32        q0, q0, q14
        vclz.s32        q0, q0
        vsub.s32        q0, q15, q0
        vst1.32         {d0, d1}, [r8, :128]

        add             r10, r10, #4
        cmp             r10, r4
        blt             2b

        add             r9, r9, #1
        cmp             r9, r3
        blt             1b

        pop             {r1-r2, r4-r12}
        bx              lr
endfunc
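
/*
 * Scalar equivalent of ff_sbc_calc_scalefactors_neon above, per (channel,
 * subband) pair (a sketch; the NEON version processes four subbands per
 * pass, and clz32() stands in for the per-lane vclz.s32):
 *
 *     int32_t m = (1 << SCALE_OUT_BITS) + 1;   // keeps the result >= 0
 *     for (blk = 0; blk < blocks; blk++)
 *         m = FFMAX(m, FFABS(sb_sample_f[blk][ch][sb]));
 *     scale_factor[ch][sb] = (31 - SCALE_OUT_BITS) - clz32(m - 1);
 */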

/*
 * constants: q13 = (31 - SCALE_OUT_BITS)
 *            q14 = 1
 * input:     q0  - ((1 << SCALE_OUT_BITS) + 1)
 *            r5  - samples for channel 0
 *            r6  - samples for channel 1
 * output:    q0, q1 - scale factors without joint stereo
 *            q2, q3 - scale factors with joint stereo
 *            q15    - joint stereo selection mask
 */
.macro calc_scalefactors
        vmov.s32        q1, q0
        vmov.s32        q2, q0
        vmov.s32        q3, q0
        mov             r3, r2
1:
        vld1.32         {d18, d19}, [r6, :128], r11
        vbic.s32        q11, q9, q14
        vld1.32         {d16, d17}, [r5, :128], r11
        vhadd.s32       q10, q8, q11
        vhsub.s32       q11, q8, q11
        vabs.s32        q8, q8
        vabs.s32        q9, q9
        vabs.s32        q10, q10
        vabs.s32        q11, q11
        vmax.s32        q0, q0, q8
        vmax.s32        q1, q1, q9
        vmax.s32        q2, q2, q10
        vmax.s32        q3, q3, q11
        subs            r3, r3, #1
        bgt             1b
        vsub.s32        q0, q0, q14
        vsub.s32        q1, q1, q14
        vsub.s32        q2, q2, q14
        vsub.s32        q3, q3, q14
        vclz.s32        q0, q0
        vclz.s32        q1, q1
        vclz.s32        q2, q2
        vclz.s32        q3, q3
        vsub.s32        q0, q13, q0
        vsub.s32        q1, q13, q1
        vsub.s32        q2, q13, q2
        vsub.s32        q3, q13, q3
.endm

/*
 * constants: q14 = 1
 * input: q15 - joint stereo selection mask
 *        r5  - value set by calc_scalefactors macro
 *        r6  - value set by calc_scalefactors macro
 */
.macro update_joint_stereo_samples
        sub             r8, r6, r11
        sub             r7, r5, r11
        sub             r6, r6, r11, asl #1
        sub             r5, r5, r11, asl #1
        vld1.32         {d18, d19}, [r6, :128]
        vbic.s32        q11, q9, q14
        vld1.32         {d16, d17}, [r5, :128]
        vld1.32         {d2, d3}, [r8, :128]
        vbic.s32        q3, q1, q14
        vld1.32         {d0, d1}, [r7, :128]
        vhsub.s32       q10, q8, q11
        vhadd.s32       q11, q8, q11
        vhsub.s32       q2, q0, q3
        vhadd.s32       q3, q0, q3
        vbif.s32        q10, q9, q15
        vbif.s32        d22, d16, d30
        sub             r11, r10, r11, asl #1
        sub             r3, r2, #2
2:
        vbif.s32        d23, d17, d31
        vst1.32         {d20, d21}, [r6, :128], r11
        vbif.s32        d4, d2, d30
        vld1.32         {d18, d19}, [r6, :128]
        vbif.s32        d5, d3, d31
        vst1.32         {d22, d23}, [r5, :128], r11
        vbif.s32        d6, d0, d30
        vld1.32         {d16, d17}, [r5, :128]
        vbif.s32        d7, d1, d31
        vst1.32         {d4, d5}, [r8, :128], r11
        vbic.s32        q11, q9, q14
        vld1.32         {d2, d3}, [r8, :128]
        vst1.32         {d6, d7}, [r7, :128], r11
        vbic.s32        q3, q1, q14
        vld1.32         {d0, d1}, [r7, :128]
        vhsub.s32       q10, q8, q11
        vhadd.s32       q11, q8, q11
        vhsub.s32       q2, q0, q3
        vhadd.s32       q3, q0, q3
        vbif.s32        q10, q9, q15
        vbif.s32        d22, d16, d30
        subs            r3, r3, #2
        bgt             2b
        sub             r11, r10, r11, asr #1
        vbif.s32        d23, d17, d31
        vst1.32         {d20, d21}, [r6, :128]
        vbif.s32        q2, q1, q15
        vst1.32         {d22, d23}, [r5, :128]
        vbif.s32        q3, q0, q15
        vst1.32         {d4, d5}, [r8, :128]
        vst1.32         {d6, d7}, [r7, :128]
.endm
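
/*
 * What the two macros above implement, per pair of left/right samples
 * (l, r) in a subband (a scalar sketch; the per-subband bit set in the
 * returned 'joint' flags comes from the ff_sbcdsp_joint_bits_mask table):
 *
 *     r &= ~1;             // vbic with q14 == 1 clears bit 0 of r
 *     m = (l + r) >> 1;    // vhadd: mid channel
 *     s = (l - r) >> 1;    // vhsub: side channel
 *
 * calc_scalefactors tracks max|l|, max|r|, max|m| and max|s| over all
 * blocks and derives the four scale factors from them; a subband is
 * switched to joint coding when scf(l) + scf(r) > scf(m) + scf(s), and
 * update_joint_stereo_samples then rewrites the stored samples of the
 * selected subbands with (m, s) under the q15 selection mask.
 */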

function ff_sbc_calc_scalefactors_j_neon, export=1
        @ parameters
        @ r0 = in = sb_sample_f
        @ r1 = out = scale_factor
        @ r2 = blocks
        @ r3 = subbands
        @ local variables
        @ r4 = consts = ff_sbcdsp_joint_bits_mask
        @ r5 = in0
        @ r6 = in1
        @ r7 = out0
        @ r8 = out1
        @ r10 = zero
        @ r11 = inc
        @ return r0 = joint

        push            {r3-r11}
        movrelx         r4, X(ff_sbcdsp_joint_bits_mask)
        mov             r10, #0
        mov             r11, #64

        vmov.s32        q14, #1
        vmov.s32        q13, #16    @ 31 - SCALE_OUT_BITS

        cmp             r3, #4
        bne             8f

4:      @ 4 subbands
        add             r5, r0, #0
        add             r6, r0, #32
        add             r7, r1, #0
        add             r8, r1, #32
        vmov.s32        q0, #0x8000 @ 1 << SCALE_OUT_BITS
        vadd.s32        q0, q0, q14

        calc_scalefactors

        @ check whether to use joint stereo for subbands 0, 1, 2
        vadd.s32        q15, q0, q1
        vadd.s32        q9, q2, q3
        vmov.s32        d31[1], r10 @ last subband -> no joint
        vld1.32         {d16, d17}, [r4, :128]!
        vcgt.s32        q15, q15, q9

        @ calculate and save to memory 'joint' variable
        @ update and save scale factors to memory
        vand.s32        q8, q8, q15
        vbit.s32        q0, q2, q15
        vpadd.s32       d16, d16, d17
        vbit.s32        q1, q3, q15
        vpadd.s32       d16, d16, d16
        vst1.32         {d0, d1}, [r7, :128]
        vst1.32         {d2, d3}, [r8, :128]
        vmov.32         r0, d16[0]

        update_joint_stereo_samples
        b               9f

8:      @ 8 subbands
        add             r5, r0, #16
        add             r6, r0, #48
        add             r7, r1, #16
        add             r8, r1, #48
        vmov.s32        q0, #0x8000 @ 1 << SCALE_OUT_BITS
        vadd.s32        q0, q0, q14

        calc_scalefactors

        @ check whether to use joint stereo for subbands 4, 5, 6
        vadd.s32        q15, q0, q1
        vadd.s32        q9, q2, q3
        vmov.s32        d31[1], r10 @ last subband -> no joint
        vld1.32         {d16, d17}, [r4, :128]!
        vcgt.s32        q15, q15, q9

        @ calculate part of 'joint' variable and save it to d24
        @ update and save scale factors to memory
        vand.s32        q8, q8, q15
        vbit.s32        q0, q2, q15
        vpadd.s32       d16, d16, d17
        vbit.s32        q1, q3, q15
        vst1.32         {d0, d1}, [r7, :128]
        vst1.32         {d2, d3}, [r8, :128]
        vpadd.s32       d24, d16, d16

        update_joint_stereo_samples

        add             r5, r0, #0
        add             r6, r0, #32
        add             r7, r1, #0
        add             r8, r1, #32
        vmov.s32        q0, #0x8000 @ 1 << SCALE_OUT_BITS
        vadd.s32        q0, q0, q14

        calc_scalefactors

        @ check whether to use joint stereo for subbands 0, 1, 2, 3
        vadd.s32        q15, q0, q1
        vadd.s32        q9, q2, q3
        vld1.32         {d16, d17}, [r4, :128]!
        vcgt.s32        q15, q15, q9

        @ combine last part of 'joint' with d24 and save to memory
        @ update and save scale factors to memory
        vand.s32        q8, q8, q15
        vbit.s32        q0, q2, q15
        vpadd.s32       d16, d16, d17
        vbit.s32        q1, q3, q15
        vpadd.s32       d16, d16, d16
        vst1.32         {d0, d1}, [r7, :128]
        vadd.s32        d16, d16, d24
        vst1.32         {d2, d3}, [r8, :128]
        vmov.32         r0, d16[0]

        update_joint_stereo_samples
9:
        pop             {r3-r11}
        bx              lr
endfunc
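
/*
 * The two process_input functions below de-interleave incoming PCM into
 * the per-channel history buffer X, filling it backwards from 'position'
 * and permuting each group of samples with the ff_sbc_input_perm_4/8
 * vtbl tables so that the analysis functions above can use purely
 * sequential loads.  When 'position' is about to underflow, the most
 * recent history samples are first copied up to the end of X and
 * 'position' is reset.  A rough scalar equivalent of the stereo inner
 * loop (a sketch; perm[] is an illustrative 16-bit-element view of the
 * byte-granularity ff_sbc_input_perm_4 table):
 *
 *     while (nsamples > 0) {
 *         position -= 8;
 *         for (i = 0; i < 8; i++) {
 *             X[0][position + i] = pcm[2 * perm[i]];       // left
 *             X[1][position + i] = pcm[2 * perm[i] + 1];   // right
 *         }
 *         pcm      += 16;
 *         nsamples -= 8;
 *     }
 */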

function ff_sbc_enc_process_input_4s_neon, export=1
        @ parameters
        @ r0 = position
        @ r1 = pcm
        @ r2 = X
        @ r3 = nsamples
        @ r4 = nchannels
        @ local variables
        @ r5 = ff_sbc_input_perm_4
        @ r6 = src / x
        @ r7 = dst / y

        push            {r1, r3-r7}
        ldr             r4, [sp, #24]
        movrelx         r5, X(ff_sbc_input_perm_4)

        @ handle X buffer wraparound
        cmp             r0, r3
        bge             1f                  @ if (position < nsamples)
        add             r7, r2, #576        @ &X[0][SBC_X_BUFFER_SIZE - 40]
        add             r6, r2, r0, lsl#1   @ &X[0][position]
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0}, [r6, :64]!
        vst1.16         {d0}, [r7, :64]!
        cmp             r4, #1
        ble             2f                  @ if (nchannels > 1)
        add             r7, r2, #1232       @ &X[1][SBC_X_BUFFER_SIZE - 40]
        add             r6, r2, #656
        add             r6, r6, r0, lsl#1   @ &X[1][position]
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0}, [r6, :64]!
        vst1.16         {d0}, [r7, :64]!
2:
        mov             r0, #288            @ SBC_X_BUFFER_SIZE - 40
1:

        add             r6, r2, r0, lsl#1   @ &X[0][position]
        add             r7, r6, #656        @ &X[1][position]

        cmp             r4, #1
        ble             8f                  @ if (nchannels > 1)
        tst             r1, #1
        beq             7f                  @ if (pcm & 1)
        @ poor 'pcm' alignment
        vld1.8          {d0, d1}, [r5, :128]
1:
        sub             r6, r6, #16
        sub             r7, r7, #16
        sub             r0, r0, #8
        vld1.8          {d4, d5}, [r1]!
        vuzp.16         d4, d5
        vld1.8          {d20, d21}, [r1]!
        vuzp.16         d20, d21
        vswp            d5, d20
        vtbl.8          d16, {d4, d5}, d0
        vtbl.8          d17, {d4, d5}, d1
        vtbl.8          d18, {d20, d21}, d0
        vtbl.8          d19, {d20, d21}, d1
        vst1.16         {d16, d17}, [r6, :128]
        vst1.16         {d18, d19}, [r7, :128]
        subs            r3, r3, #8
        bgt             1b
        b               9f
7:
        @ proper 'pcm' alignment
        vld1.8          {d0, d1}, [r5, :128]
1:
        sub             r6, r6, #16
        sub             r7, r7, #16
        sub             r0, r0, #8
        vld2.16         {d4, d5}, [r1]!
        vld2.16         {d20, d21}, [r1]!
        vswp            d5, d20
        vtbl.8          d16, {d4, d5}, d0
        vtbl.8          d17, {d4, d5}, d1
        vtbl.8          d18, {d20, d21}, d0
        vtbl.8          d19, {d20, d21}, d1
        vst1.16         {d16, d17}, [r6, :128]
        vst1.16         {d18, d19}, [r7, :128]
        subs            r3, r3, #8
        bgt             1b
        b               9f
8:
        @ mono
        vld1.8          {d0, d1}, [r5, :128]
1:
        sub             r6, r6, #16
        sub             r0, r0, #8
        vld1.8          {d4, d5}, [r1]!
        vtbl.8          d16, {d4, d5}, d0
        vtbl.8          d17, {d4, d5}, d1
        vst1.16         {d16, d17}, [r6, :128]
        subs            r3, r3, #8
        bgt             1b
9:
        pop             {r1, r3-r7}
        bx              lr
endfunc
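
/*
 * Same approach as the 4-subband version above: the 8-subband variant
 * below keeps a 72-sample history tail instead of 40 (see the
 * SBC_X_BUFFER_SIZE - 72 offsets), consumes 16 samples per channel per
 * loop iteration instead of 8, and permutes them with the
 * ff_sbc_input_perm_8 table.
 */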

function ff_sbc_enc_process_input_8s_neon, export=1
        @ parameters
        @ r0 = position
        @ r1 = pcm
        @ r2 = X
        @ r3 = nsamples
        @ r4 = nchannels
        @ local variables
        @ r5 = ff_sbc_input_perm_8
        @ r6 = src
        @ r7 = dst

        push            {r1, r3-r7}
        ldr             r4, [sp, #24]
        movrelx         r5, X(ff_sbc_input_perm_8)

        @ handle X buffer wraparound
        cmp             r0, r3
        bge             1f                  @ if (position < nsamples)
        add             r7, r2, #512        @ &X[0][SBC_X_BUFFER_SIZE - 72]
        add             r6, r2, r0, lsl#1   @ &X[0][position]
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1}, [r6, :128]!
        vst1.16         {d0, d1}, [r7, :128]!
        cmp             r4, #1
        ble             2f                  @ if (nchannels > 1)
        add             r7, r2, #1168       @ &X[1][SBC_X_BUFFER_SIZE - 72]
        add             r6, r2, #656
        add             r6, r6, r0, lsl#1   @ &X[1][position]
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1}, [r6, :128]!
        vst1.16         {d0, d1}, [r7, :128]!
2:
        mov             r0, #256            @ SBC_X_BUFFER_SIZE - 72
1:

        add             r6, r2, r0, lsl#1   @ &X[0][position]
        add             r7, r6, #656        @ &X[1][position]

        cmp             r4, #1
        ble             8f                  @ if (nchannels > 1)
        tst             r1, #1
        beq             7f                  @ if (pcm & 1)
        @ poor 'pcm' alignment
        vld1.8          {d0, d1, d2, d3}, [r5, :128]
1:
        sub             r6, r6, #32
        sub             r7, r7, #32
        sub             r0, r0, #16
        vld1.8          {d4, d5, d6, d7}, [r1]!
        vuzp.16         q2, q3
        vld1.8          {d20, d21, d22, d23}, [r1]!
        vuzp.16         q10, q11
        vswp            q3, q10
        vtbl.8          d16, {d4, d5, d6, d7}, d0
        vtbl.8          d17, {d4, d5, d6, d7}, d1
        vtbl.8          d18, {d4, d5, d6, d7}, d2
        vtbl.8          d19, {d4, d5, d6, d7}, d3
        vst1.16         {d16, d17, d18, d19}, [r6, :128]
        vtbl.8          d16, {d20, d21, d22, d23}, d0
        vtbl.8          d17, {d20, d21, d22, d23}, d1
        vtbl.8          d18, {d20, d21, d22, d23}, d2
        vtbl.8          d19, {d20, d21, d22, d23}, d3
        vst1.16         {d16, d17, d18, d19}, [r7, :128]
        subs            r3, r3, #16
        bgt             1b
        b               9f
7:
        @ proper 'pcm' alignment
        vld1.8          {d0, d1, d2, d3}, [r5, :128]
1:
        sub             r6, r6, #32
        sub             r7, r7, #32
        sub             r0, r0, #16
        vld2.16         {d4, d5, d6, d7}, [r1]!
        vld2.16         {d20, d21, d22, d23}, [r1]!
        vswp            q3, q10
        vtbl.8          d16, {d4, d5, d6, d7}, d0
        vtbl.8          d17, {d4, d5, d6, d7}, d1
        vtbl.8          d18, {d4, d5, d6, d7}, d2
        vtbl.8          d19, {d4, d5, d6, d7}, d3
        vst1.16         {d16, d17, d18, d19}, [r6, :128]
        vtbl.8          d16, {d20, d21, d22, d23}, d0
        vtbl.8          d17, {d20, d21, d22, d23}, d1
        vtbl.8          d18, {d20, d21, d22, d23}, d2
        vtbl.8          d19, {d20, d21, d22, d23}, d3
        vst1.16         {d16, d17, d18, d19}, [r7, :128]
        subs            r3, r3, #16
        bgt             1b
        b               9f
8:
        @ mono
        vld1.8          {d0, d1, d2, d3}, [r5, :128]
1:
        sub             r6, r6, #32
        sub             r0, r0, #16
        vld1.8          {d4, d5, d6, d7}, [r1]!
        vtbl.8          d16, {d4, d5, d6, d7}, d0
        vtbl.8          d17, {d4, d5, d6, d7}, d1
        vtbl.8          d18, {d4, d5, d6, d7}, d2
        vtbl.8          d19, {d4, d5, d6, d7}, d3
        vst1.16         {d16, d17, d18, d19}, [r6, :128]
        subs            r3, r3, #16
        bgt             1b
9:
        pop             {r1, r3-r7}
        bx              lr
endfunc