///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

// Syntax: GNU as, AArch64 (A64 + Advanced SIMD).

// push_v_regs: save q8-q15, x8-x17 and x29/x30 on the stack.
// Uses 4*32 + 6*16 = 224 bytes, so sp keeps its 16-byte alignment.
.macro push_v_regs
    stp             q8, q9, [sp, #-32]!
    stp             q10, q11, [sp, #-32]!
    stp             q12, q13, [sp, #-32]!
    stp             q14, q15, [sp, #-32]!
    stp             X8, X9, [sp, #-16]!
    stp             X10, X11, [sp, #-16]!
    stp             X12, X13, [sp, #-16]!
    stp             X14, X15, [sp, #-16]!
    stp             X16, X17, [sp, #-16]!
    stp             X29, X30, [sp, #-16]!
.endm

// pop_v_regs: exact mirror of push_v_regs (restores in reverse order).
.macro pop_v_regs
    ldp             X29, X30, [sp], #16
    ldp             X16, X17, [sp], #16
    ldp             X14, X15, [sp], #16
    ldp             X12, X13, [sp], #16
    ldp             X10, X11, [sp], #16
    ldp             X8, X9, [sp], #16
    ldp             q14, q15, [sp], #32
    ldp             q12, q13, [sp], #32
    ldp             q10, q11, [sp], #32
    ldp             q8, q9, [sp], #32
.endm

// swp: exchange the two operands (used below on 64-bit vector elements
// such as v4.D[0]) through a general register. Clobbers x16.
.macro swp reg1, reg2
    MOV             x16, \reg1
    MOV             \reg1, \reg2
    MOV             \reg2, x16
.endm

.text
.p2align 2
.global ixheaacd_imdct_using_fft_armv8

//------------------------------------------------------------------------------
// ixheaacd_imdct_using_fft_armv8
//
// Fixed-point FFT kernel (used by the IMDCT, going by the symbol name).
// Structure, following the labels: one first stage that is either an
// 8-input (RADIX_8_FIRST_*) or a 4-input (RADIX_4_*) butterfly pass from the
// input buffer to the output buffer, followed by X8 further 4-input butterfly
// passes (OUTER_LOOP_R4) performed in place on the output buffer.
//
// Inputs, as used by the code (the C prototype is not visible in this file):
//   x0 = base of a table block. Byte index tables are read from
//        x0+11600 / x0+11856 / x0+11920 / x0+11936, and pairs of 16-bit
//        coefficients from x0+8528 (presumably the twiddle factors).
//   w1 = transform length in points. 0x400, 0x200, 0x100, 0x80 and 0x40 are
//        dispatched explicitly, every other value takes the COND_6 path.
//   x2 = input buffer, read as pairs of 32-bit words (8 bytes per point,
//        presumably re/im).
//   x3 = output buffer, written as pairs of 32-bit words.
// Output: no register is set up as a return value.
//
// Saved and restored: q8-q15, x8-x17, x29, x30 (x29 and x30 are used as
// scratch inside the routine). Left modified on return: x0-x7, v0-v7,
// v16-v31 and the condition flags.
//
// Length dispatch:
//   length   first stage   later stages (X8)   index table (X4)
//   0x400    4-input       4                   x0+11600
//   0x200    8-input       3                   x0+11856
//   0x100    4-input       3                   x0+11856
//   0x80     8-input       2                   x0+11920
//   0x40     4-input       2                   x0+11920
//   other    8-input       1                   x0+11936
//------------------------------------------------------------------------------
ixheaacd_imdct_using_fft_armv8:
    push_v_regs

    // Candidate index tables for the first stage (one is selected into X4).
    MOV             X29, #11600
    ADD             X4, X0, X29
    MOV             X29, #11856
    ADD             X5, X0, X29
    MOV             X29, #11920
    ADD             X6, X0, X29
    MOV             X29, #11936
    ADD             X7, X0, X29

    // The length is compared as a 32-bit value (W1). Everything after the
    // dispatch already handles it through W registers, and AAPCS64 leaves
    // bits 63:32 of a register holding a 32-bit argument unspecified, so a
    // 64-bit compare could miss every case when the length is declared as a
    // 32-bit integer (presumably it is - confirm against the C prototype).
COND_1:
    CMP             W1, #0x400
    BNE             COND_2
    MOV             X8, #4
    B               RADIX_4_FIRST_START

COND_2:
    CMP             W1, #0x200
    BNE             COND_3
    MOV             X8, #3
    MOV             X4, X5
    B               RADIX_8_FIRST_START

COND_3:
    CMP             W1, #0x100
    BNE             COND_4
    MOV             X8, #3
    MOV             X4, X5
    B               RADIX_4_FIRST_START

COND_4:
    CMP             W1, #0x80
    BNE             COND_5
    MOV             X8, #2
    MOV             X4, X6
    B               RADIX_8_FIRST_START

COND_5:
    CMP             W1, #0x40
    BNE             COND_6
    MOV             X8, #2
    MOV             X4, X6
    B               RADIX_4_FIRST_START

COND_6:
    MOV             X8, #1
    MOV             X4, X7
    // falls through into the 8-input first stage

//------------------------------------------------------------------------------
// First stage, 8-input butterflies.
// Each iteration handles four butterflies, one per vector lane, and writes
// 8 x 32 = 256 bytes to the output.
//   W9 = length / 32          iteration count
//   X1 = 2 * length           byte step used between the loads
//   X4 = index table cursor   4 bytes consumed per iteration
//------------------------------------------------------------------------------
RADIX_8_FIRST_START:
    LSR             W9 , W1, #5
    LSL             W1, W1, #1

RADIX_8_FIRST_LOOP:

    MOV             X5 , X2
    MOV             X6 , X2
    MOV             X7 , X2
    MOV             X11 , X2

    // Lane 0: start point = input + 8 * table[0]. Loads at byte offsets
    // 0, 2*X1, X1 and 3*X1 from there, then X5 is rewound to the start point.
    LDRB            W12, [X4]
    ADD             X5, X5, X12, LSL #3
    LD2             {v0.S, v1.S}[0], [X5], X1
    ADD             X5, X5, X1
    LD2             {v4.S, v5.S}[0], [X5], X1
    SUB             X5, X5, X1, LSL #1
    LD2             {v2.S, v3.S}[0], [X5], X1
    ADD             X5, X5, X1
    LD2             {v6.S, v7.S}[0], [X5], X1
    SUB             X5, X5, X1, LSL #2

    // Lane 1: same pattern from input + 8 * table[1].
    LDRB            W12, [X4, #1]
    ADD             X6, X6, X12, LSL #3
    LD2             {v0.S, v1.S}[1], [X6] , X1
    ADD             X6, X6, X1
    LD2             {v4.S, v5.S}[1], [X6] , X1
    SUB             X6, X6, X1, LSL #1
    LD2             {v2.S, v3.S}[1], [X6] , X1
    ADD             X6, X6, X1
    LD2             {v6.S, v7.S}[1], [X6], X1
    SUB             X6, X6, X1, LSL #2

    // Lanes 2 and 3: the first two loads here, the remaining two are
    // interleaved with the arithmetic below.
    LDRB            W12, [X4, #2]
    ADD             X7, X7, X12, LSL #3
    LD2             {v0.S, v1.S}[2], [X7] , X1
    ADD             X7, X7, X1
    LD2             {v4.S, v5.S}[2], [X7] , X1
    SUB             X7, X7, X1, LSL #1

    LDRB            W12, [X4, #3]
    ADD             X11, X11, X12, LSL #3
    LD2             {v0.S, v1.S}[3], [X11] , X1
    ADD             X11, X11, X1
    LD2             {v4.S, v5.S}[3], [X11] , X1
    SUB             X11, X11, X1, LSL #1

    ADD             v8.4S, v0.4S, v4.4S
    LD2             {v2.S, v3.S}[2], [X7] , X1
    ADD             X7, X7, X1

    SUB             v9.4S, v0.4S, v4.4S
    LD2             {v6.S, v7.S}[2], [X7], X1
    SUB             X7, X7, X1, LSL #2

    ADD             v0.4S, v1.4S, v5.4S
    LD2             {v2.S, v3.S}[3], [X11] , X1
    ADD             X11, X11, X1

    SUB             v4.4S, v1.4S, v5.4S
    LD2             {v6.S, v7.S}[3], [X11], X1
    SUB             X11, X11, X1, LSL #2

    ADD             X4, X4, #4              // next four table entries

    // Move every lane pointer forward by X1/2 bytes to reach the other four
    // inputs of each butterfly.
    ADD             X5, X5, X1, LSR #1
    ADD             X6, X6, X1, LSR #1
    ADD             X7, X7, X1, LSR #1
    ADD             X11, X11, X1, LSR #1

    // Add/sub network on the first four inputs, interleaved with the loads
    // of the second four (v14/v15, v10/v11, v12/v13, then v1/v2).
    ADD             v1.4S, v2.4S, v6.4S
    LD2             {v14.S, v15.S}[0], [X5] , X1

    SUB             v5.4S, v2.4S, v6.4S
    LD2             {v10.S, v11.S}[0], [X5] , X1

    ADD             v2.4S, v3.4S, v7.4S
    LD2             {v12.S, v13.S}[0], [X5] , X1

    SUB             v6.4S, v3.4S, v7.4S
    LD2             {v14.S, v15.S}[1], [X6] , X1

    ADD             v3.4S, v9.4S, v6.4S
    LD2             {v10.S, v11.S}[1], [X6] , X1

    SUB             v7.4S, v9.4S, v6.4S
    LD2             {v12.S, v13.S}[1], [X6] , X1

    SUB             v6.4S, v4.4S, v5.4S
    LD2             {v14.S, v15.S}[2], [X7] , X1

    ADD             v9.4S, v4.4S, v5.4S
    LD2             {v10.S, v11.S}[2], [X7] , X1

    ADD             v4.4S, v8.4S, v1.4S
    LD2             {v12.S, v13.S}[2], [X7] , X1

    SUB             v5.4S, v8.4S, v1.4S
    LD2             {v14.S, v15.S}[3], [X11] , X1

    ADD             v8.4S, v0.4S, v2.4S
    LD2             {v10.S, v11.S}[3], [X11] , X1

    SUB             v0.4S, v0.4S, v2.4S
    LD2             {v12.S, v13.S}[3], [X11] , X1

    // v1/v2 are free again from here on and receive the eighth input.
    LD2             {v1.S, v2.S}[0], [X5], X1

    ADD             v17.4S, v14.4S, v12.4S

    LD2             {v1.S, v2.S}[1], [X6] , X1

    SUB             v16.4S, v14.4S, v12.4S

    LD2             {v1.S, v2.S}[2], [X7] , X1

    ADD             v14.4S, v15.4S, v13.4S

    LD2             {v1.S, v2.S}[3], [X11] , X1

    SUB             v12.4S, v15.4S, v13.4S

    ADD             v15.4S, v10.4S, v1.4S
    SUB             v13.4S, v10.4S, v1.4S
    ADD             v10.4S, v11.4S, v2.4S
    SUB             v1.4S, v11.4S, v2.4S

    ADD             v11.4S, v17.4S, v15.4S
    SUB             v2.4S, v17.4S, v15.4S
    ADD             v17.4S, v14.4S, v10.4S
    SUB             v15.4S, v14.4S, v10.4S

    ADD             v14.4S, v16.4S, v12.4S
    SUB             v10.4S, v16.4S, v12.4S
    ADD             v16.4S, v13.4S, v1.4S
    SUB             v12.4S, v13.4S, v1.4S

    ADD             v1.4S , v14.4S, v12.4S
    SUB             v13.4S, v14.4S, v12.4S
    SUB             v12.4S, v16.4S, v10.4S

    // Split the four 32-bit terms that need scaling into 16-bit halves:
    // UZP1 collects the low halfword of every word, UZP2 the high halfword.
    UZP1            v22.8H, v1.8H, v1.8H
    UZP2            v23.8H, v1.8H, v1.8H
    ADD             v14.4S, v16.4S, v10.4S

    UZP1            v26.8H, v13.8H, v13.8H
    UZP2            v27.8H, v13.8H, v13.8H
    ADD             v16.4S, v4.4S, v11.4S

    UZP1            v24.8H, v12.8H, v12.8H
    UZP2            v25.8H, v12.8H, v12.8H
    SUB             v10.4S, v4.4S, v11.4S

    UZP1            v28.8H, v14.8H, v14.8H
    UZP2            v29.8H, v14.8H, v14.8H
    ADD             v4.4S, v8.4S, v17.4S

    MOV             W14, #0x5a82            // 23170, about 32768/sqrt(2)

    SUB             v11.4S, v8.4S, v17.4S

    ADD             v8.4S, v5.4S, v15.4S
    SUB             v17.4S, v5.4S, v15.4S
    SUB             v5.4S, v0.4S, v2.4S
    ADD             v15.4S, v0.4S, v2.4S

    DUP             v31.4H, W14

    // 32x16 multiply assembled from halves:
    //   (low * c) >> 15 from UMULL + SSHR, then SQDMLAL adds 2 * (high * c)
    //   with saturation, giving roughly (x * c) >> 15.
    UMULL           v19.4S, v26.4H, v31.4H
    UMULL           v18.4S, v28.4H, v31.4H
    SSHR            v19.4S, v19.4S, #15
    SSHR            v18.4S, v18.4S, #15

    SQDMLAL         v19.4S, v27.4H, v31.4H
    SQDMLAL         v18.4S, v29.4H, v31.4H

    UMULL           v13.4S, v24.4H, v31.4H
    UMULL           v14.4S, v22.4H, v31.4H

    ADD             v20.4S, v3.4S, v19.4S
    SUB             v21.4S, v3.4S, v19.4S
    ADD             v30.4S, v6.4S, v18.4S
    SUB             v6.4S, v6.4S, v18.4S

    SSHR            v13.4S, v13.4S, #15
    SSHR            v14.4S, v14.4S, #15

    SQDMLAL         v13.4S, v25.4H, v31.4H
    SQDMLAL         v14.4S, v23.4H, v31.4H

    ADD             v3.4S, v7.4S, v13.4S
    SUB             v19.4S, v7.4S, v13.4S
    ADD             v1.4S, v9.4S, v14.4S
    SUB             v18.4S, v9.4S, v14.4S

    // Exchange whole registers (both 64-bit halves) v17 <-> v8, v4 <-> v16.
    swp             v17.D[0], v8.D[0]
    swp             v17.D[1], v8.D[1]
    swp             v4.D[0], v16.D[0]
    swp             v4.D[1], v16.D[1]

    // Transpose from one-butterfly-per-lane to the output order (TRN1/TRN2
    // plus the 64-bit swaps below) and scale every result by 8 (SHL #3).
    TRN1            v12.4S, v4.4S, v20.4S
    TRN2            v22.4S, v4.4S, v20.4S

    SHL             v12.4S, v12.4S, #3
    TRN1            v9.4S, v17.4S, v3.4S
    TRN2            v2.4S, v17.4S, v3.4S
    SHL             v22.4S, v22.4S, #3

    SHL             v9.4S, v9.4S, #3
    TRN1            v24.4S, v10.4S, v21.4S
    TRN2            v7.4S, v10.4S, v21.4S
    SHL             v2.4S, v2.4S, #3

    SHL             v24.4S, v24.4S, #3
    TRN1            v13.4S, v16.4S, v6.4S
    TRN2            v23.4S, v16.4S, v6.4S
    SHL             v7.4S, v7.4S, #3

    SHL             v13.4S, v13.4S, #3
    TRN1            v10.4S, v5.4S, v18.4S
    TRN2            v3.4S, v5.4S, v18.4S
    SHL             v23.4S, v23.4S, #3

    SHL             v10.4S, v10.4S, #3
    TRN1            v26.4S, v8.4S, v19.4S
    TRN2            v4.4S, v8.4S, v19.4S
    SHL             v3.4S, v3.4S, #3

    SHL             v26.4S, v26.4S, #3
    TRN1            v25.4S, v11.4S, v30.4S
    TRN2            v8.4S, v11.4S, v30.4S
    SHL             v4.4S, v4.4S, #3

    SHL             v25.4S, v25.4S, #3
    TRN1            v27.4S, v15.4S, v1.4S
    TRN2            v5.4S, v15.4S, v1.4S
    SHL             v8.4S, v8.4S, #3

    SHL             v27.4S, v27.4S, #3
    swp             v9.D[0], v12.D[1]
    SHL             v5.4S, v5.4S, #3
    swp             v2.D[0], v22.D[1]

    swp             v24.D[1], v26.D[0]
    swp             v7.D[1], v4.D[0]
    swp             v10.D[0], v13.D[1]
    swp             v3.D[0], v23.D[1]
    swp             v27.D[0], v25.D[1]
    swp             v5.D[0], v8.D[1]

    MOV             X15, #32
    ST2             {v12.4S, v13.4S}, [X3], X15
    ST2             {v24.4S, v25.4S}, [X3], X15
    ST2             {v22.4S, v23.4S}, [X3], X15
    ST2             {v7.4S, v8.4S}, [X3], X15
    ST2             {v9.4S, v10.4S}, [X3], X15
    ST2             {v26.4S, v27.4S}, [X3], X15
    ST2             {v2.4S, v3.4S}, [X3], X15
    ST2             {v4.4S, v5.4S}, [X3], X15

    SUBS            X9, X9, #1
    BNE             RADIX_8_FIRST_LOOP

    // Restore X1 = length and rewind X3 by the 8 * length bytes just written.
    LSR             X1, X1, #1
    LSL             X15, X1, #3
    SUB             X3, X3, X15

    // Parameters handed to the later stages (see RADIX_4_FIRST_ENDS).
    MOV             X5, #8
    MOV             X4, #32
    LSR             X15, X1, #5
    MOV             X6, X15
    B               RADIX_4_FIRST_ENDS
RADIX_8_FIRST_ENDS:

//------------------------------------------------------------------------------
// First stage, 4-input butterflies.
// Each iteration handles four butterflies, one per vector lane, and writes
// 4 x 32 = 128 bytes to the output.
//   W9 = length / 16          iteration count
//   X1 = 2 * length           byte step used between the loads
//   X4 = index table cursor   4 bytes consumed per iteration
//------------------------------------------------------------------------------
RADIX_4_FIRST_START:

    LSR             W9, W1, #4
    LSL             W1, W1, #1
RADIX_4_LOOP:

    MOV             X5 , X2
    MOV             X6 , X2
    MOV             X7 , X2
    MOV             X11 , X2

    // Lane 0: start point = input + 8 * table[0], loads at byte offsets
    // 0, 2*X1, X1 and 3*X1.
    LDRB            W12, [X4, #0]
    ADD             X5, X5, X12, LSL #3

    LD2             {v0.S, v1.S}[0], [X5] , X1
    ADD             X5, X5, X1
    LD2             {v8.S, v9.S}[0], [X5] , X1
    SUB             X5, X5, X1, LSL #1
    LD2             {v4.S, v5.S}[0], [X5] , X1
    ADD             X5, X5, X1
    LD2             {v12.S, v13.S}[0], [X5] , X1

    // Lane 1.
    LDRB            W12, [X4, #1]
    ADD             X6, X6, X12, LSL #3
    LD2             {v0.S, v1.S}[1], [X6] , X1
    ADD             X6, X6, X1
    LD2             {v8.S, v9.S}[1], [X6] , X1
    SUB             X6, X6, X1, LSL #1
    LD2             {v4.S, v5.S}[1], [X6] , X1
    ADD             X6, X6, X1
    LD2             {v12.S, v13.S}[1], [X6] , X1

    // Lanes 2 and 3: two loads each here, the rest interleaved below.
    LDRB            W12, [X4, #2]
    ADD             X7, X7, X12, LSL #3

    LD2             {v0.S, v1.S}[2], [X7] , X1
    ADD             X7, X7, X1
    LD2             {v8.S, v9.S}[2], [X7] , X1

    LDRB            W12, [X4, #3]
    ADD             X11, X11, X12 , LSL #3

    LD2             {v0.S, v1.S}[3], [X11] , X1
    ADD             X11, X11, X1
    LD2             {v8.S, v9.S}[3], [X11] , X1

    SUB             X7, X7, X1, LSL #1
    ADD             v16.4S, v0.4S, v8.4S
    LD2             {v4.S, v5.S}[2], [X7] , X1
    ADD             X7, X7, X1
    ADD             v18.4S, v1.4S, v9.4S
    LD2             {v12.S, v13.S}[2], [X7] , X1

    SUB             X11, X11, X1, LSL #1
    SUB             v20.4S, v0.4S, v8.4S
    LD2             {v4.S, v5.S}[3], [X11] , X1
    ADD             X11, X11, X1
    SUB             v22.4S, v1.4S, v9.4S
    LD2             {v12.S, v13.S}[3], [X11] , X1

    ADD             X4, X4, #4              // next four table entries

    ADD             v24.4S, v4.4S, v12.4S
    ADD             v26.4S, v5.4S, v13.4S
    SUB             v28.4S, v4.4S, v12.4S
    SUB             v30.4S, v5.4S, v13.4S

    ADD             v17.4S, v16.4S, v24.4S
    ADD             v11.4S, v18.4S, v26.4S
    SUB             v19.4S, v16.4S, v24.4S
    SUB             v15.4S, v18.4S, v26.4S

    ADD             v8.4S, v20.4S, v30.4S
    SUB             v9.4S, v22.4S, v28.4S
    ADD             v13.4S, v22.4S, v28.4S
    SUB             v12.4S, v20.4S, v30.4S

    // Transpose to output order and scale every result by 4 (SHL #2).
    TRN1            v0.4S, v17.4S, v8.4S
    TRN2            v8.4S, v17.4S, v8.4S

    SHL             v0.4S, v0.4S, #2
    TRN1            v4.4S, v19.4S, v12.4S
    TRN2            v12.4S, v19.4S, v12.4S
    SHL             v8.4S, v8.4S, #2

    SHL             v4.4S, v4.4S, #2
    TRN1            v1.4S, v11.4S, v9.4S
    TRN2            v9.4S, v11.4S, v9.4S
    SHL             v12.4S, v12.4S, #2

    SHL             v1.4S, v1.4S, #2
    TRN1            v5.4S, v15.4S, v13.4S
    TRN2            v13.4S, v15.4S, v13.4S
    SHL             v9.4S, v9.4S, #2

    SHL             v5.4S, v5.4S, #2
    swp             v4.D[0], v0.D[1]
    SHL             v13.4S, v13.4S, #2

    swp             v12.D[0], v8.D[1]
    swp             v5.D[0], v1.D[1]
    swp             v13.D[0], v9.D[1]

    MOV             X15, #32
    ST2             {v0.4S, v1.4S}, [X3], X15
    ST2             {v8.4S, v9.4S}, [X3], X15
    ST2             {v4.4S, v5.4S}, [X3], X15
    ST2             {v12.4S, v13.4S}, [X3], X15

    SUBS            W9, W9, #1
    BNE             RADIX_4_LOOP

    // Restore X1 = length and rewind X3 by the 8 * length bytes just written.
    LSR             X1, X1, #1
    SUB             X3, X3, X1, LSL #3
    MOV             X5, #4
    MOV             X4, #64
    LSR             X6, X1, #4

//------------------------------------------------------------------------------
// Later stages: X8 passes of 4-input butterflies with coefficient multiplies,
// performed in place on the output buffer.
//   x30 = start of the output buffer (X3 is reused as scratch below)
//   X0  = coefficient table base (x0 + 8528)
//   X5  = middle-loop count, X12 = 32 * X5 = byte distance between the four
//         rows of a butterfly, X6 = inner-loop count,
//         X4 * 4 = byte step between coefficient entries.
// After every pass: X4 /= 4, X5 *= 4, X6 /= 4.
//------------------------------------------------------------------------------
RADIX_4_FIRST_ENDS:

    MOV             x30, X3
    LSR             X5, X5, #2

    MOV             X14, #8528
    ADD             X0, X0, X14

OUTER_LOOP_R4:

    MOV             X14, x30                // data cursor back to buffer start

    MOV             X7, X5
    MOV             X2, #0                  // running coefficient byte offset
    MOV             X9, X0
    LSL             X12, X5, #5
MIDDLE_LOOP_R4:

    // Gather three coefficient pairs per lane: for a lane whose offset is
    // off, the pairs at off, 2*off and 3*off go to v20/v21, v22/v23 and
    // v24/v25. The offset grows by 4*X4 bytes from one lane to the next.
    LD2             {v20.H, v21.H}[0], [X9], X2
    LD2             {v22.H, v23.H}[0], [X9], X2
    ADD             X11, X2, X4, LSL #2
    LD2             {v24.H, v25.H}[0], [X9]
    ADD             X10, X0, X11

    LD2             {v20.H, v21.H}[1], [X10], X11
    LD2             {v22.H, v23.H}[1], [X10], X11
    ADD             X2, X11, X4, LSL #2
    LD2             {v24.H, v25.H}[1], [X10]
    ADD             X9, X0, X2

    LD2             {v20.H, v21.H}[2], [X9], X2
    LD2             {v22.H, v23.H}[2], [X9], X2
    ADD             X11, X2, X4, LSL #2
    LD2             {v24.H, v25.H}[2], [X9]
    ADD             X10, X0, X11

    LD2             {v20.H, v21.H}[3], [X10], X11
    LD2             {v22.H, v23.H}[3], [X10], X11
    ADD             X2, X11, X4, LSL #2
    LD2             {v24.H, v25.H}[3], [X10]
    ADD             X9, X0, X2

    MOV             X10, X6
INNER_LOOP_R4:

    // Row 0 is loaded as 32-bit pairs and halved. Rows 1-3 are loaded with
    // LD4 so that each 32-bit word is split into two 16-bit halves
    // (low half first, assuming a little-endian target).
    LD2             {v30.4S, v31.4S}, [X14], X12
    SSHR            v30.4S, v30.4S, #1
    LD4             {v16.4H, v17.4H, v18.4H, v19.4H}, [X14], X12
    SSHR            v31.4S, v31.4S, #1

    // Low halves are shifted right by one before the signed multiplies.
    USHR            v16.4H, v16.4H, #1
    LD4             {v26.4H, v27.4H, v28.4H, v29.4H}, [X14], X12
    USHR            v18.4H, v18.4H, #1

    // Row 1 times (v20, v21): v11 = a*v20 - b*v21, v12 = a*v21 + b*v20.
    // The low-half products are shifted right by 15, then the high-half
    // products are accumulated on top.
    SMULL           v11.4S, v16.4H, v20.4H
    SMLSL           v11.4S, v18.4H, v21.4H

    LD4             {v0.4H, v1.4H, v2.4H, v3.4H}, [X14], X12
    SMULL           v12.4S, v16.4H, v21.4H
    SMLAL           v12.4S, v18.4H, v20.4H

    USHR            v26.4H, v26.4H, #1
    USHR            v28.4H, v28.4H, #1

    LSL             x29, X12, #2            // 4 rows, used to rewind X14 later
    SUB             X14, X14, X12, LSL #2   // back to row 0

    USHR            v0.4H, v0.4H, #1
    USHR            v2.4H, v2.4H, #1

    // Row 2 times (v22, v23) into v13/v14.
    SMULL           v13.4S, v26.4H, v22.4H
    SMLSL           v13.4S, v28.4H, v23.4H

    SSHR            v11.4S, v11.4S, #15

    SMULL           v14.4S, v26.4H, v23.4H
    SMLAL           v14.4S, v28.4H, v22.4H

    // Row 3 times (v24, v25) into v15/v5.
    SMULL           v15.4S, v0.4H, v24.4H
    SMLSL           v15.4S, v2.4H, v25.4H

    SMLAL           v11.4S, v17.4H, v20.4H
    SMLSL           v11.4S, v19.4H, v21.4H

    SSHR            v12.4S, v12.4S, #15
    SSHR            v13.4S, v13.4S, #15
    SSHR            v14.4S, v14.4S, #15
    SSHR            v15.4S, v15.4S, #15

    SMLAL           v12.4S, v17.4H, v21.4H
    SMLAL           v12.4S, v19.4H, v20.4H

    SMULL           v5.4S, v0.4H, v25.4H
    SMLAL           v5.4S, v2.4H, v24.4H

    SMLAL           v13.4S, v27.4H, v22.4H
    SMLSL           v13.4S, v29.4H, v23.4H

    SMLAL           v14.4S, v27.4H, v23.4H
    SMLAL           v14.4S, v29.4H, v22.4H

    SMLAL           v15.4S, v1.4H, v24.4H
    SMLSL           v15.4S, v3.4H, v25.4H

    SSHR            v5.4S, v5.4S, #15

    SMLAL           v5.4S, v1.4H, v25.4H
    SMLAL           v5.4S, v3.4H, v24.4H

    // Only on the first middle-loop pass (X7 == X5): replace lane 0 of the
    // three products with the raw row 1-3 inputs shifted right by one, that
    // is, skip the multiply for that lane (presumably because its
    // coefficient is the trivial one there - not verifiable from this file).
    SUBS            x17, X7, X5
    BNE             BYPASS_IF

    ADD             X14, X14, X12           // row 1, first word

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1

    MOV             v11.S[0], W3

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1
    MOV             v13.S[0], W3

    LDR             W3, [X14]
    ASR             W3, W3, #1
    MOV             v15.S[0], W3

    SUB             X14, X14, X12, LSL #1   // back to row 1 ...
    ADD             X14, X14, #4            // ... second word

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1
    MOV             v12.S[0], W3

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1
    MOV             v14.S[0], W3

    LDR             W3, [X14]
    ADD             X14, X14, X12
    ASR             W3, W3, #1
    MOV             v5.S[0], W3

    SUB             X14, X14, #4

    SUB             X14, X14, x29           // back to row 0

BYPASS_IF:

    // 4-input butterfly on row 0 (v30/v31) and the three products
    // (v11/v12, v13/v14, v15/v5). Results overwrite the same four rows.
    ADD             v6.4S, v30.4S, v13.4S
    ADD             v7.4S, v31.4S, v14.4S
    SUB             v30.4S, v30.4S, v13.4S
    SUB             v31.4S, v31.4S, v14.4S
    ADD             v8.4S, v11.4S, v15.4S
    ADD             v9.4S, v12.4S, v5.4S

    SUB             v15.4S, v11.4S, v15.4S
    SUB             v14.4S, v12.4S, v5.4S

    ADD             v10.4S, v6.4S, v8.4S
    ADD             v11.4S, v7.4S, v9.4S
    ADD             v12.4S, v30.4S, v14.4S
    SUB             v13.4S, v31.4S, v15.4S

    SUB             v6.4S, v6.4S, v8.4S
    ST2             {v10.4S, v11.4S}, [X14], X12
    SUB             v7.4S, v7.4S, v9.4S

    SUB             v8.4S, v30.4S, v14.4S
    ST2             {v12.4S, v13.4S}, [X14], X12
    ADD             v9.4S, v31.4S, v15.4S

    ST2             {v6.4S, v7.4S}, [X14], X12
    ST2             {v8.4S, v9.4S}, [X14], X12
    SUBS            X10, X10, #1
    BNE             INNER_LOOP_R4

    // Rewind by 8 * length bytes, then step 32 bytes to the next four columns.
    SUB             X14, X14, X1, LSL #3
    ADD             X14, X14, #32

    SUBS            X7, X7, #1
    BNE             MIDDLE_LOOP_R4

    // Set up the next pass.
    LSR             X4, X4, #2
    LSL             X5, X5, #2
    LSR             X6, X6, #2
    SUBS            X8, X8, #1
    BNE             OUTER_LOOP_R4
END_LOOPS:
    pop_v_regs
    RET