// Register save / restore helpers used by
// ixheaacd_post_twid_overlap_add_armv8.
//
// push_v_regs spills q8-q15 and then x8-x30 with pre-indexed STP;
// pop_v_regs reloads them in exactly the reverse order with post-indexed
// LDP.  Every sp adjustment is a multiple of 16 bytes: 4 x 32 bytes for the
// vector pairs plus 12 x 16 bytes for the integer pairs, 320 bytes in total.
// The two macros must always be used as a matched pair.
// Note: despite the "_v_" in the names, the general-purpose registers are
// saved and restored as well.

.macro push_v_regs
    stp     q8,  q9,  [sp, #-32]!
    stp     q10, q11, [sp, #-32]!
    stp     q12, q13, [sp, #-32]!
    stp     q14, q15, [sp, #-32]!
    // Disabled ST1-based variant, kept from the original source:
    //st1 { v8.2d,  v9.2d, v10.2d, v11.2d}, [sp, #-64]!
    //st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp, #-64]!
    stp     x8,  x9,  [sp, #-16]!
    stp     x10, x11, [sp, #-16]!
    stp     x12, x13, [sp, #-16]!
    stp     x14, x15, [sp, #-16]!
    stp     x16, x17, [sp, #-16]!
    stp     x18, x19, [sp, #-16]!
    stp     x20, x21, [sp, #-16]!
    stp     x22, x23, [sp, #-16]!
    stp     x24, x25, [sp, #-16]!
    stp     x26, x27, [sp, #-16]!
    stp     x28, x29, [sp, #-16]!
    stp     x30, x29, [sp, #-16]!       // x29 is written a second time as the partner of x30
.endm

.macro pop_v_regs
    ldp     x30, x29, [sp], #16         // mirrors the last store of push_v_regs
    ldp     x28, x29, [sp], #16
    ldp     x26, x27, [sp], #16
    ldp     x24, x25, [sp], #16
    ldp     x22, x23, [sp], #16
    ldp     x20, x21, [sp], #16
    ldp     x18, x19, [sp], #16
    ldp     x16, x17, [sp], #16
    ldp     x14, x15, [sp], #16
    ldp     x12, x13, [sp], #16
    ldp     x10, x11, [sp], #16
    ldp     x8,  x9,  [sp], #16
    // Disabled LD1-based variant, kept from the original source:
    //ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
    //ld1 { v8.2d,  v9.2d, v10.2d, v11.2d}, [sp], #64
    ldp     q14, q15, [sp], #32
    ldp     q12, q13, [sp], #32
    ldp     q10, q11, [sp], #32
    ldp     q8,  q9,  [sp], #32
.endm


.text
.p2align 2
.global ixheaacd_post_twid_overlap_add_armv8

ixheaacd_post_twid_overlap_add_armv8:

    // STMFD sp!, {x4-x12}
    push_v_regs                         // spill q8-q15 and x8-x30 (320 bytes) before any register is modified
    //stp x19, x20,[sp,#-16]!
53 //VPUSH {d8 - d15} 54 55 //LDR w4, [sp, #100] 56 //sxtw x4,w4 57 //LDR w5, [sp, #104] 58 //sxtw x5,w5 59 //LDR w6, [sp, #108] 60 //sxtw x6,w6 61 MOV x16, x5 62 MOV x17, x7 63 LSL x9, x3, #2 64 ASR x9, x9, #1 65 ADD x6, x6, x9 66 SUB x6, x6, #4 67 68 MOV w8, #7500 69 sxtw x8, w8 70 ADD x2, x2, x8 71 72 73 74 movi v18.4h, #50 75 sub x20, x5, #15 76 neg x9, x20 77 movi v20.4s, #0x80, LSL #8 78 dup v16.4s, w5 79 SUB x5, x5, #16 80 //STR w5, [sp, #116] 81 MOV w25, w5 82 sxtw x25, w25 83 MOV x8, #1 84 LSL x8, x8, x9 85 //STR w8, [sp, #120] 86 MOV w26, w8 87 88 //sxtw x8,w8 89 90 91ARM_PROLOGUE: 92 93 94 LDR w8, [x1], #4 95 sxtw x8, w8 96 LDR w9, [x1], #4 97 sxtw x9, w9 98 99 LDR w10, [x2], #4 100 sxtw x10, w10 101 102 AND w19, w10, 0xFFFF 103 sxth x19, w19 104 ASR w10, w10, #16 105// SMULWT x11, x8, x10 106// 107// SMULWB x12, x9, x10 108// SMULWB x5, x8, x10 109// SMLAWT x7, x9, x10, x5 110 111 SMULL x11, w8, w10 112 ASR x11, x11, #16 113 SMULL x12, w9, w19 114 ASR x12, x12, #16 115 SMULL x5, w8, w19 116 ASR x5, x5, #16 117 SMULL x7, w9, w10 118 ASR x7, x7, #16 119 ADD x7, x7, x5 120 121 SUB x8, x12, x11 122 MVN x5, x7 123 ADD x5, x5, #1 124 125 126 MOV x9, #50 127 MOV x12, #-50 128 AND w19, w9, 0xFFFF 129 sxth x19, w19 130 SMULL x10, w5, w19 131 ASR x10, x10, #16 132 AND w19, w12, 0xFFFF 133 sxth x19, w19 134 SMULL x11, w8, w19 135 ASR x11, x11, #16 136 137 ADD x8, x8, x10 138 ADD x5, x5, x11 139 140 //LDR w11, [sp, #104] 141 MOV w11, w16 142 sxth x11, w11 143 LDR w10, [x6], #-32 144 sxtw x10, w10 145 146 AND w19, w10, 0xFFFF 147 sxth x19, w19 148 ASR w20, w10, #16 149 150 //SMULWB x7, x8, x10 151 SMULL x7, w8, w19 152 ASR x7, x7, #16 153 MVN x8, x8 154 ADD x8, x8, #1 155 //SMULWT x12, x8, x10 156 SMULL x12, w8, w20 157 ASR x12, x12, #16 158 159 CMP x11, #0 160 BLT NEXT 161 162 SUB x9, x11, #16 163 negs x9, x9 164 165 166 167 168 // LDR w8, [sp, #120] 169 //sxtw x8,w8 170 MOV v1.s[0], w26 171 MOV v2.s[0], w5 172 173 //sQADD w5, w5, w8 174 //ASR w5, w5, w9 175 176 
SQADD v2.2s, v2.2s, v1.2s 177 MOV w5, v2.s[0] 178 ASR w5, w5, w9 179 180 SUB x9, x11, #31 181 negs x9, x9 182 ASR x20, x7, x9 183 //MOV x8, x20 184 ADDS x8, x20, #0 185 BGE NEXT2 186 CMN x8, #1 187NEXT2: 188 MOV x20, #0x80000000 189 csel x7, x20, x7, LT 190 MOV x20, #0x7fffffff 191 csel x7, x20, x7, GT 192 LSL x20, x7, x11 193 csel x7, x20, x7, EQ 194 195 SUB x9, x11, #31 196 negs x9, x9 197 ASR x20, x12, x9 198 //MOV x8, x20 199 ADDS x8, x20, #0 200 BGE NEXT3 201 CMN x8, #1 202NEXT3: 203 MOV x20, #0x80000000 204 csel x12, x20, x12, LT 205 MOV x20, #0x7fffffff 206 csel x12, x20, x12, GT 207 LSL x20, x12, x11 208 csel x12, x20, x12, EQ 209 210 B NEXT1 211NEXT: 212 MVN w11, w11 213 ADD w11, w11, #1 214 ASR w5, w5, w11 215 MOV w8, #0x8000 216 217 MOV v1.s[0], w8 218 MOV v2.s[0], w5 219 220 //QADD x5, x5, x8 221 222 SQADD v2.2s, v2.2s, v1.2s 223 MOV w5, v2.s[0] 224 225 ASR w5, w5, #16 226 ASR w7, w7, w11 227 ASR w12, w12, w11 228 229NEXT1: 230 LDR w9, [x4] 231 sxtw x9, w9 232 MOV w8, #0x8000 233 //sxtw x8,w8 234 235 STR w5, [x4], #4 236 sxtw x5, w5 237 238 239 ROR w20, w10, #16 240 //UXTH x5, x10, ROR #16 241 UXTH w5, w20 242 UXTH w10, w10 243 244 245 dup v0.2s, w9 246 dup v2.2s, w10 247 dup v3.2s, w5 248 //VZIP.32 D2, D3 249 ZIP1 v28.2s, v2.2s, v3.2s 250 ZIP2 v3.2s, v2.2s, v3.2s 251 MOV v2.8b, v28.8b 252 sMULL v0.2d, v2.2s, v0.2s 253 Sqxtn v8.2s, v0.2d 254 255 256 dup v0.2s, w12 257 dup v1.2s, w7 258 259 //VZIP.32 D0, D1 260 261 ZIP1 v28.2s, v0.2s, v1.2s 262 ZIP2 v1.2s, v0.2s, v1.2s 263 MOV v0.8b, v28.8b 264 265 SQSUB v8.2s, v0.2s , v8.2s 266 267 268 sQshL v8.2s, v8.2s, #2 269 dup v0.2s, w8 270 SQADD v8.2s, v8.2s , v0.2s 271 sshR v8.2s, v8.2s, #16 272 273 274 275 MOV x7, x17 276 //sxtw x7,w7 277 LSL x10, x7, #1 278 279 ASR x5, x3, #1 280 //SMULBB x5, x10, x5 281 AND w5, w5, 0xFFFF 282 sxth x5, w5 283 AND w19, w10, 0xFFFF 284 sxth x19, w19 285 SMULL x5, w19, w5 286 287 ADD x5, x5, x0 288 SUB x0, x5, x10 289 MVN x9, x10 290 ADD x9, x9, #1 291 292 ST1 {v8.h}[2], [x0], x9 
293 ST1 {v8.h}[0], [x5], x10 294 295 296 MOV x8, x1 297 LSL x12, x3, #2 298 299 ADD x1, x1, x12 300 301 SUB x1, x1, #40 302 303 MOV x12, #-32 304 305 306 307PROLOGUE_NEON: 308 309 ASR x3, x3, #2 310 SUB x3, x3, #4 311 ASR x3, x3, #2 312 SUB x3, x3, #2 313 314 LD2 { v0.4s, v1.4s}, [x1] 315 MOV v2.16b, v1.16b 316 ADD x1, x1, x12 317 318 //VUZP.16 D0, D1 319 UZP1 v28.8h, v0.8h, v0.8h 320 UZP2 v29.8h, v0.8h, v0.8h 321 MOV v0.d[0], v28.d[0] 322 MOV v0.d[1], v29.d[0] 323 324 //VUZP.16 D2, D3 325 326 UZP1 v28.8h, v2.8h, v2.8h 327 UZP2 v29.8h, v2.8h, v2.8h 328 MOV v2.d[0], v28.d[0] 329 MOV v2.d[1], v29.d[0] 330 331 332 //rev64 v0.8h, v0.8h 333 rev64 v0.8h, v0.8h 334 MOV v1.d[0], v0.d[1] 335 rev64 v2.8h, v2.8h 336 MOV v3.d[0], v2.d[1] 337 LD2 {v8.4h, v9.4h}, [x2] 338 ADD x2, x2, #16 339 340 LD2 { v4.4s, v5.4s}, [x8] 341 MOV v6.16b, v5.16b 342 ADD x8, x8, #32 343 uMULL v30.4s, v0.4h, v9.4h 344 345// VUZP.16 D4, D5 346 347 UZP1 v28.8h, v4.8h, v4.8h 348 UZP2 v29.8h, v4.8h, v4.8h 349 MOV v4.d[0], v28.d[0] 350 MOV v5.d[0], v29.d[0] 351 352 uMULL v28.4s, v2.4h, v8.4h 353 354// VUZP.16 D6, D7 355 UZP1 v26.8h, v6.8h, v6.8h 356 UZP2 v27.8h, v6.8h, v6.8h 357 MOV v6.d[0], v26.d[0] 358 MOV v7.d[0], v27.d[0] 359 360 uMULL v26.4s, v0.4h, v8.4h 361 362 363 uMULL v24.4s, v2.4h, v9.4h 364 365 LD2 { v10.4s, v11.4s}, [x6] 366 MOV v12.16b, v11.16b 367 ADD x6, x6, x12 368 ushR v30.4s, v30.4s, #16 369 370 //VUZP.16 D10, D11 371 372 UZP1 v22.8h, v10.8h, v10.8h 373 UZP2 v23.8h, v10.8h, v10.8h 374 MOV v10.d[0], v22.d[0] 375 MOV v10.d[1], v23.d[0] 376 377 ushR v28.4s, v28.4s, #16 378 379 //VUZP.16 D12, D13 380 381 UZP1 v22.8h, v12.8h, v12.8h 382 UZP2 v23.8h, v12.8h, v12.8h 383 MOV v12.d[0], v22.d[0] 384 MOV v12.d[1], v23.d[0] 385 386 sMLAL v30.4s, v1.4h, v9.4h 387 388 rev64 v10.8h, v10.8h 389 MOV v11.d[0], v10.d[1] 390 sMLAL v28.4s, v3.4h, v8.4h 391 392 rev64 v12.8h, v12.8h 393 MOV v13.d[0], v12.d[1] 394 ushR v26.4s, v26.4s, #16 395 396 397 ushR v24.4s, v24.4s, #16 398 399 sMLAL v26.4s, v1.4h, v8.4h 
400 sMLAL v24.4s, v3.4h, v9.4h 401 402 403 404 ADD v30.4s, v30.4s , v28.4s 405 NEG v30.4s, v30.4s 406 407 uMULL v22.4s, v4.4h, v8.4h 408 409 SUB v28.4s, v24.4s , v26.4s 410 411 412 mov v26.16b, v30.16b 413 mov v24.16b, v28.16b 414 415// VUZP.16 D24, D25 416 417 UZP1 v19.8h, v24.8h, v24.8h 418 UZP2 v21.8h, v24.8h, v24.8h 419 MOV v24.d[0], v19.d[0] 420 MOV v25.d[0], v21.d[0] 421 422 423// VUZP.16 D26, D27 424 425 UZP1 v19.8h, v26.8h, v26.8h 426 UZP2 v21.8h, v26.8h, v26.8h 427 MOV v26.d[0], v19.d[0] 428 MOV v27.d[0], v21.d[0] 429 430 uMULL v2.4s, v24.4h, v18.4h 431 432 uMULL v0.4s, v26.4h, v18.4h 433 434 ushR v22.4s, v22.4s, #16 435 sMLAL v22.4s, v5.4h, v8.4h 436 437 ushR v2.4s, v2.4s, #16 438 ushR v0.4s, v0.4s, #16 439 sMLAL v2.4s, v25.4h, v18.4h 440 sMLAL v0.4s, v27.4h, v18.4h 441 442 uMULL v24.4s, v4.4h, v9.4h 443 uMULL v26.4s, v6.4h, v8.4h 444 445 NEG v2.4s, v2.4s 446 ADD v28.4s, v28.4s , v0.4s 447 ADD v30.4s, v30.4s , v2.4s 448 449 uMULL v0.4s, v6.4h, v9.4h 450 sshR v24.4s, v24.4s, #16 451 sMLAL v24.4s, v5.4h, v9.4h 452 sshR v26.4s, v26.4s, #16 453 sshR v0.4s, v0.4s, #16 454 sMLAL v26.4s, v7.4h, v8.4h 455 sMLAL v0.4s, v7.4h, v9.4h 456 457 458 459 460 ADD v22.4s, v22.4s , v0.4s 461 NEG v22.4s, v22.4s 462 SUB v24.4s, v26.4s , v24.4s 463 464 465 466 //LDR w11, [sp, #120] 467 //sxtw x11,w11 468 MOV w11, w26 469 dup v14.4s, w11 470 SQADD v28.4s, v28.4s , v14.4s 471 //LDR w11, [sp, #116] 472 MOV w11, w25 473 //sxtw x11,w11 474 dup v0.4s, w11 475 sQshL v28.4s, v28.4s, v0.4s 476 477 mov v0.16b, v22.16b 478 mov v14.16b, v24.16b 479 480 481// VUZP.16 D24, D25 482 483 UZP1 v19.8h, v24.8h, v24.8h 484 UZP2 v21.8h, v24.8h, v24.8h 485 MOV v24.d[0], v19.d[0] 486 MOV v25.d[0], v21.d[0] 487 488 489// VUZP.16 D22, D23 490 491 UZP1 v19.8h, v22.8h, v22.8h 492 UZP2 v21.8h, v22.8h, v22.8h 493 MOV v22.d[0], v19.d[0] 494 MOV v23.d[0], v21.d[0] 495 496 uMULL v8.4s, v24.4h, v18.4h 497 uMULL v26.4s, v22.4h, v18.4h 498 499 NEG v2.4s, v30.4s 500// VUZP.16 D30, D31 501 502 UZP1 v19.8h, v30.8h, 
v30.8h 503 UZP2 v21.8h, v30.8h, v30.8h 504 MOV v30.d[0], v19.d[0] 505 MOV v30.d[1], v21.d[0] 506 507// VUZP.16 D2, D3 508 509 UZP1 v19.8h, v2.8h, v2.8h 510 UZP2 v21.8h, v2.8h, v2.8h 511 MOV v2.d[0], v19.d[0] 512 MOV v3.d[0], v21.d[0] 513 514 uMULL v4.4s, v30.4h, v12.4h 515 516 uMULL v6.4s, v2.4h, v13.4h 517 518 ushR v8.4s, v8.4s, #16 519 ushR v26.4s, v26.4s, #16 520 521 sMLAL v8.4s, v25.4h, v18.4h 522 sMLAL v26.4s, v23.4h, v18.4h 523 524 ushR v4.4s, v4.4s, #16 525 ushR v6.4s, v6.4s, #16 526 527 MOV v19.d[0], v30.d[1] 528 529 sMLAL v4.4s, v19.4h, v12.4h 530 sMLAL v6.4s, v3.4h, v13.4h 531 532 NEG v8.4s, v8.4s 533 ADD v14.4s, v14.4s , v26.4s 534 ADD v0.4s, v0.4s , v8.4s 535 536 //LDR w11, [sp, #120] 537 //sxtw x11,w11 538 MOV w11, w26 539 dup v8.4s, w11 540 SQADD v0.4s, v0.4s , v8.4s 541 //LDR w11, [sp, #116] 542 //sxtw x11,w11 543 MOV w11, w25 544 dup v26.4s, w11 545 sQshL v0.4s, v0.4s, v26.4s 546 547 mov v26.16b, v28.16b 548 549 LD2 { v28.4s, v29.4s}, [x4] 550 MOV v30.16b, v29.16b 551 MOV v29.d[0], v28.d[1] 552// VZIP.32 Q13, Q0 553 554 ZIP1 v19.4s, v26.4s, v0.4s 555 ZIP2 v0.4s, v26.4s, v0.4s 556 MOV v26.16b, v19.16b 557 558 ST1 { v26.4s}, [x4], #16 559 ST1 { v0.4s}, [x4], #16 560 561 movi v1.2s, #0 562 //VADDL.S16 Q0, D13, D1 563 564 SADDL v0.4s, v13.4h, v1.4h 565 MOV v1.d[0], v0.d[1] 566 sMULL v26.2d, v28.2s, v0.2s 567 Sqxtn v8.2s, v26.2d 568 sMULL v26.2d, v29.2s, v1.2s 569 Sqxtn v9.2s, v26.2d 570 MOV v8.d[1], v9.d[0] 571 movi v1.2s, #0 572// VADDL.S16 Q0, D12, D1 573 SADDL v0.4s, v12.4h, v1.4h 574 MOV v1.d[0], v0.d[1] 575 sMULL v24.2d, v28.2s, v0.2s 576 Sqxtn v26.2s, v24.2d 577 sMULL v24.2d, v29.2s, v1.2s 578 Sqxtn v27.2s, v24.2d 579 MOV v26.d[1], v27.d[0] 580 581 sQshL v4.4s, v4.4s, v16.4s 582 sQshL v6.4s, v6.4s, v16.4s 583 584 SQSUB v4.4s, v4.4s , v8.4s 585 SQSUB v6.4s, v6.4s , v26.4s 586 587 NEG v26.4s, v14.4s 588// VUZP.16 D14, D15 589 590 591 UZP1 v19.8h, v14.8h, v14.8h 592 UZP2 v21.8h, v14.8h, v14.8h 593 MOV v14.d[0], v19.d[0] 594 MOV v15.d[0], v21.d[0] 595 
596// VUZP.16 D26, D27 597 598 599 UZP1 v19.8h, v26.8h, v26.8h 600 UZP2 v21.8h, v26.8h, v26.8h 601 MOV v26.d[0], v19.d[0] 602 MOV v27.d[0], v21.d[0] 603 604 605 movi v1.2s, #0 606// VADDL.S16 Q0, D10, D1 607 SADDL v0.4s, v10.4h, v1.4h 608 MOV v1.d[0], v0.d[0] 609 sMULL v22.2d, v30.2s, v0.2s 610 Sqxtn v24.2s, v22.2d 611 sMULL2 v22.2d, v30.4s, v0.4s 612 Sqxtn v25.2s, v22.2d 613 MOV v24.d[1], v25.d[0] 614 movi v1.2s, #0 615// VADDL.S16 Q0, D11, D1 616 SADDL v0.4s, v11.4h, v1.4h 617 MOV v1.d[0], v0.d[1] 618 619 sMULL v8.2d, v30.2s, v0.2s 620 Sqxtn v22.2s, v8.2d 621 sMULL2 v8.2d, v30.4s, v0.4s 622 Sqxtn v23.2s, v8.2d 623 MOV v22.d[1], v23.d[0] 624 uMULL v8.4s, v26.4h, v11.4h 625 uMULL v30.4s, v14.4h, v10.4h 626 627 LD2 { v0.4s, v1.4s}, [x1] 628 MOV v2.16b, v1.16b 629 ADD x1, x1, x12 630 631// VUZP.16 D0, D1 632 633 UZP1 v19.8h, v0.8h, v0.8h 634 UZP2 v21.8h, v0.8h, v0.8h 635 MOV v0.d[0], v19.d[0] 636 MOV v0.d[1], v21.d[0] 637 638// VUZP.16 D2, D3 639 640 UZP1 v19.8h, v2.8h, v2.8h 641 UZP2 v21.8h, v2.8h, v2.8h 642 MOV v2.d[0], v19.d[0] 643 MOV v2.d[1], v21.d[0] 644 645 ushR v8.4s, v8.4s, #16 646 647 rev64 v0.8h, v0.8h 648 MOV v1.d[0], v0.d[1] 649 ushR v30.4s, v30.4s, #16 650 651 rev64 v2.8h, v2.8h 652 MOV v3.d[0], v2.d[1] 653 sMLAL v8.4s, v27.4h, v11.4h 654 655 sMLAL v30.4s, v15.4h, v10.4h 656 657 LD2 { v10.4s, v11.4s}, [x6] 658 ADD x6, x6, x12 659 MOV v12.16b, v11.16b 660 sQshL v4.4s, v4.4s, #2 661 662// VUZP.16 D10, D11 663 664 UZP1 v19.8h, v10.8h, v10.8h 665 UZP2 v21.8h, v10.8h, v10.8h 666 MOV v10.d[0], v19.d[0] 667 MOV v10.d[1], v21.d[0] 668 669 sQshL v6.4s, v6.4s, #2 670 671// VUZP.16 D12, D13 672 673 UZP1 v19.8h, v12.8h, v12.8h 674 UZP2 v21.8h, v12.8h, v12.8h 675 MOV v12.d[0], v19.d[0] 676 MOV v12.d[1], v21.d[0] 677 678 SQADD v14.4s, v4.4s , v20.4s 679 680 rev64 v10.8h, v10.8h 681 MOV v11.d[0], v10.d[1] 682 SQADD v6.4s, v6.4s , v20.4s 683 684 rev64 v12.8h, v12.8h 685 MOV v13.d[0], v12.d[1] 686 sshR v14.4s, v14.4s, #16 687 688// VUZP.16 D14, D15 689 690 UZP1 v19.8h, 
v14.8h, v14.8h 691 UZP2 v21.8h, v14.8h, v14.8h 692 MOV v14.d[0], v19.d[0] 693 MOV v15.d[0], v21.d[0] 694 695 sshR v6.4s, v6.4s, #16 696 697// VUZP.16 D6, D7 698 699 UZP1 v19.8h, v6.8h, v6.8h 700 UZP2 v21.8h, v6.8h, v6.8h 701 MOV v6.d[0], v19.d[0] 702 MOV v7.d[0], v21.d[0] 703 704 mov v15.8b, v6.8b 705 sQshL v8.4s, v8.4s, v16.4s 706 707 LD2 { v4.4s, v5.4s}, [x8] 708 ADD x8, x8, #32 709 MOV v6.16b, v5.16b 710 sQshL v30.4s, v30.4s, v16.4s 711 712// VUZP.16 D4, D5 713 714 UZP1 v19.8h, v4.8h, v4.8h 715 UZP2 v21.8h, v4.8h, v4.8h 716 MOV v4.d[0], v19.d[0] 717 MOV v5.d[0], v21.d[0] 718 719 SQSUB v8.4s, v8.4s , v24.4s 720 721// VUZP.16 D6, D7 722 723 UZP1 v19.8h, v6.8h, v6.8h 724 UZP2 v21.8h, v6.8h, v6.8h 725 MOV v6.d[0], v19.d[0] 726 MOV v7.d[0], v21.d[0] 727 728 SQSUB v22.4s, v30.4s , v22.4s 729 730 sQshL v30.4s, v8.4s, #2 731 732 LD2 {v8.4h, v9.4h}, [x2] 733 ADD x2, x2, #16 734 sQshL v22.4s, v22.4s, #2 735 736 SQADD v30.4s, v30.4s , v20.4s 737 SQADD v22.4s, v22.4s , v20.4s 738 739 sshR v30.4s, v30.4s, #16 740 741// VUZP.16 D30, D31 742 743 UZP1 v19.8h, v30.8h, v30.8h 744 UZP2 v21.8h, v30.8h, v30.8h 745 MOV v30.d[0], v19.d[0] 746 MOV v30.d[1], v21.d[0] 747 748 sshR v22.4s, v22.4s, #16 749 750 751// VUZP.16 D22, D23 752 753 UZP1 v19.8h, v22.8h, v22.8h 754 UZP2 v21.8h, v22.8h, v22.8h 755 MOV v22.d[0], v19.d[0] 756 MOV v23.d[0], v21.d[0] 757 758 759 mov v23.8b, v30.8b 760 761CORE_LOOP: 762 ST1 {v14.h}[0], [x0] 763 ADD x0, x0, x9 764 uMULL v30.4s, v0.4h, v9.4h 765 766 ST1 {v22.h}[0], [x0] 767 ADD x0, x0, x9 768 uMULL v28.4s, v2.4h, v8.4h 769 770 ST1 {v14.h}[1], [x0] 771 ADD x0, x0, x9 772 uMULL v26.4s, v0.4h, v8.4h 773 774 ST1 {v22.h}[1], [x0] 775 ADD x0, x0, x9 776 uMULL v24.4s, v2.4h, v9.4h 777 778 ST1 {v14.h}[2], [x0] 779 ADD x0, x0, x9 780 ushR v30.4s, v30.4s, #16 781 782 ST1 {v22.h}[2], [x0] 783 ADD x0, x0, x9 784 ushR v28.4s, v28.4s, #16 785 786 ST1 {v14.h}[3], [x0] 787 ADD x0, x0, x9 788 sMLAL v30.4s, v1.4h, v9.4h 789 790 ST1 {v22.h}[3], [x0] 791 ADD x0, x0, x9 792 
sMLAL v28.4s, v3.4h, v8.4h 793 794 ST1 {v15.h}[0], [x5] 795 ADD x5, x5, x10 796 ushR v26.4s, v26.4s, #16 797 798 ST1 {v23.h}[0], [x5] 799 ADD x5, x5, x10 800 ushR v24.4s, v24.4s, #16 801 802 ST1 {v15.h}[1], [x5] 803 ADD x5, x5, x10 804 sMLAL v26.4s, v1.4h, v8.4h 805 806 ST1 {v23.h}[1], [x5] 807 ADD x5, x5, x10 808 sMLAL v24.4s, v3.4h, v9.4h 809 810 ST1 {v15.h}[2], [x5] 811 ADD x5, x5, x10 812 ADD v30.4s, v30.4s , v28.4s 813 814 ST1 {v23.h}[2], [x5] 815 ADD x5, x5, x10 816 NEG v30.4s, v30.4s 817 818 ST1 {v15.h}[3], [x5] 819 ADD x5, x5, x10 820 821 ST1 {v23.h}[3], [x5] 822 ADD x5, x5, x10 823 SUB v28.4s, v24.4s , v26.4s 824 825 826 mov v26.16b, v30.16b 827 uMULL v22.4s, v4.4h, v8.4h 828 829 mov v24.16b, v28.16b 830 831// VUZP.16 D24, D25 832 833 UZP1 v19.8h, v24.8h, v24.8h 834 UZP2 v21.8h, v24.8h, v24.8h 835 MOV v24.d[0], v19.d[0] 836 MOV v25.d[0], v21.d[0] 837 838 839// VUZP.16 D26, D27 840 841 UZP1 v19.8h, v26.8h, v26.8h 842 UZP2 v21.8h, v26.8h, v26.8h 843 MOV v26.d[0], v19.d[0] 844 MOV v27.d[0], v21.d[0] 845 846 uMULL v2.4s, v24.4h, v18.4h 847 uMULL v0.4s, v26.4h, v18.4h 848 849 ushR v22.4s, v22.4s, #16 850 sMLAL v22.4s, v5.4h, v8.4h 851 852 ushR v2.4s, v2.4s, #16 853 ushR v0.4s, v0.4s, #16 854 sMLAL v2.4s, v25.4h, v18.4h 855 sMLAL v0.4s, v27.4h, v18.4h 856 857 uMULL v24.4s, v4.4h, v9.4h 858 uMULL v26.4s, v6.4h, v8.4h 859 860 NEG v2.4s, v2.4s 861 ADD v28.4s, v28.4s , v0.4s 862 ADD v30.4s, v30.4s , v2.4s 863 864 uMULL v0.4s, v6.4h, v9.4h 865 sshR v24.4s, v24.4s, #16 866 sMLAL v24.4s, v5.4h, v9.4h 867 sshR v26.4s, v26.4s, #16 868 sshR v0.4s, v0.4s, #16 869 sMLAL v26.4s, v7.4h, v8.4h 870 sMLAL v0.4s, v7.4h, v9.4h 871 872 873 874 ADD v22.4s, v22.4s , v0.4s 875 876 NEG v22.4s, v22.4s 877 SUB v24.4s, v26.4s , v24.4s 878 879 880 //LDR w11, [sp, #120] 881 //sxtw x11,w11 882 MOV w11, w26 883 dup v14.4s, w11 884 SQADD v28.4s, v28.4s , v14.4s 885 //LDR w11, [sp, #116] 886 //sxtw x11,w11 887 MOV w11, w25 888 dup v0.4s, w11 889 sQshL v28.4s, v28.4s, v0.4s 890 891 892 mov 
v0.16b, v22.16b 893 mov v14.16b, v24.16b 894 895// VUZP.16 D24, D25 896 897 UZP1 v19.8h, v24.8h, v24.8h 898 UZP2 v21.8h, v24.8h, v24.8h 899 MOV v24.d[0], v19.d[0] 900 MOV v25.d[0], v21.d[0] 901 902 903// VUZP.16 D22, D23 904 905 UZP1 v19.8h, v22.8h, v22.8h 906 UZP2 v21.8h, v22.8h, v22.8h 907 MOV v22.d[0], v19.d[0] 908 MOV v23.d[0], v21.d[0] 909 910 uMULL v8.4s, v24.4h, v18.4h 911 uMULL v26.4s, v22.4h, v18.4h 912 913 NEG v2.4s, v30.4s 914 915// VUZP.16 D30, D31 916 917 UZP1 v19.8h, v30.8h, v30.8h 918 UZP2 v21.8h, v30.8h, v30.8h 919 MOV v30.d[0], v19.d[0] 920 MOV v30.d[1], v21.d[0] 921 922 923// VUZP.16 D2, D3 924 925 UZP1 v19.8h, v2.8h, v2.8h 926 UZP2 v21.8h, v2.8h, v2.8h 927 MOV v2.d[0], v19.d[0] 928 MOV v3.d[0], v21.d[0] 929 930 uMULL v4.4s, v30.4h, v12.4h 931 uMULL v6.4s, v2.4h, v13.4h 932 933 ushR v8.4s, v8.4s, #16 934 ushR v26.4s, v26.4s, #16 935 936 sMLAL v8.4s, v25.4h, v18.4h 937 sMLAL v26.4s, v23.4h, v18.4h 938 939 ushR v4.4s, v4.4s, #16 940 ushR v6.4s, v6.4s, #16 941 942 MOV v19.d[0], v30.d[1] 943 944 sMLAL v4.4s, v19.4h, v12.4h 945 sMLAL v6.4s, v3.4h, v13.4h 946 947 NEG v8.4s, v8.4s 948 ADD v14.4s, v14.4s , v26.4s 949 ADD v0.4s, v0.4s , v8.4s 950 951 952 953 //LDR w11, [sp, #120] 954 //sxtw x11,w11 955 MOV w11, w26 956 dup v8.4s, w11 957 SQADD v0.4s, v0.4s , v8.4s 958 //LDR w11, [sp, #116] 959 //sxtw x11,w11 960 MOV w11, w25 961 dup v26.4s, w11 962 sQshL v0.4s, v0.4s, v26.4s 963 mov v26.16b, v28.16b 964 965 LD2 { v28.4s, v29.4s}, [x4] 966 MOV v30.16b, v29.16b 967 MOV v29.d[0], v28.d[1] 968// VZIP.32 Q13, Q0 969 970 ZIP1 v19.4s, v26.4s, v0.4s 971 ZIP2 v0.4s, v26.4s, v0.4s 972 MOV v26.16b, v19.16b 973 974 ST1 { v26.4s}, [x4] 975 ADD x4, x4, #16 976 ST1 { v0.4s}, [x4] 977 ADD x4, x4, #16 978 979 movi v1.2s, #0 980// VADDL.S16 Q0, D13, D1 981 SADDL v0.4s, v13.4h, v1.4h 982 MOV v1.d[0], v0.d[1] 983 984 sMULL v26.2d, v28.2s, v0.2s 985 Sqxtn v8.2s, v26.2d 986 sMULL v26.2d, v29.2s, v1.2s 987 Sqxtn v9.2s, v26.2d 988 MOV v8.d[1], v9.d[0] 989 movi v1.2s, #0 990 
//VADDL.S16 Q0, D12, D1 991 SADDL v0.4s, v12.4h, v1.4h 992 MOV v1.d[0], v0.d[1] 993 994 sMULL v24.2d, v28.2s, v0.2s 995 Sqxtn v26.2s, v24.2d 996 sMULL v24.2d, v29.2s, v1.2s 997 Sqxtn v27.2s, v24.2d 998 MOV v26.d[1], v27.d[0] 999 sQshL v4.4s, v4.4s, v16.4s 1000 sQshL v6.4s, v6.4s, v16.4s 1001 1002 1003 1004 SQSUB v4.4s, v4.4s , v8.4s 1005 SQSUB v6.4s, v6.4s , v26.4s 1006 1007 NEG v26.4s, v14.4s 1008// VUZP.16 D26, D27 1009 UZP1 v19.8h, v26.8h, v26.8h 1010 UZP2 v21.8h, v26.8h, v26.8h 1011 MOV v26.d[0], v19.d[0] 1012 MOV v27.d[0], v21.d[0] 1013 1014 movi v1.2s, #0 1015 //VADDL.S16 Q0, D10, D1 1016 SADDL v0.4s, v10.4h, v1.4h 1017 MOV v1.d[0], v0.d[1] 1018 1019 sMULL v22.2d, v30.2s, v0.2s 1020 Sqxtn v24.2s, v22.2d 1021 sMULL2 v22.2d, v30.4s, v0.4s 1022 Sqxtn v25.2s, v22.2d 1023 MOV v24.d[1], v25.d[0] 1024 movi v1.2s, #0 1025 //VADDL.S16 Q0, D11, D1 1026 SADDL v0.4s, v11.4h, v1.4h 1027 1028 sMULL v8.2d, v30.2s, v0.2s 1029 Sqxtn v22.2s, v8.2d 1030 sMULL2 v8.2d, v30.4s, v0.4s 1031 Sqxtn v23.2s, v8.2d 1032 MOV v22.d[1], v23.d[0] 1033 1034// VUZP.16 D14, D15 1035 1036 UZP1 v19.8h, v14.8h, v14.8h 1037 UZP2 v21.8h, v14.8h, v14.8h 1038 MOV v14.d[0], v19.d[0] 1039 MOV v15.d[0], v21.d[0] 1040 1041 uMULL v8.4s, v26.4h, v11.4h 1042 uMULL v30.4s, v14.4h, v10.4h 1043 1044 1045 LD2 { v0.4s, v1.4s}, [x1] 1046 MOV v2.16b, v1.16b 1047 ADD X1, X1, x12 1048 1049// VUZP.16 D0, D1 1050 UZP1 v19.8h, v0.8h, v0.8h 1051 UZP2 v21.8h, v0.8h, v0.8h 1052 MOV v0.d[0], v19.d[0] 1053 MOV v0.d[1], v21.d[0] 1054 1055// VUZP.16 D2, D3 1056 1057 UZP1 v19.8h, v2.8h, v2.8h 1058 UZP2 v21.8h, v2.8h, v2.8h 1059 MOV v2.d[0], v19.d[0] 1060 MOV v2.d[1], v21.d[0] 1061 1062 ushR v8.4s, v8.4s, #16 1063 1064 rev64 v0.8h, v0.8h 1065 MOV v1.d[0], v0.d[1] 1066 ushR v30.4s, v30.4s, #16 1067 1068 rev64 v2.8h, v2.8h 1069 MOV v3.d[0], v2.d[1] 1070 sMLAL v8.4s, v27.4h, v11.4h 1071 1072 sMLAL v30.4s, v15.4h, v10.4h 1073 1074 LD2 { v10.4s, v11.4s}, [x6] 1075 add X6, x6, x12 1076 MOV v12.16b, v11.16b 1077 sQshL v4.4s, v4.4s, #2 
1078 1079 //VUZP.16 D10, D11 1080 1081 UZP1 v19.8h, v10.8h, v10.8h 1082 UZP2 v21.8h, v10.8h, v10.8h 1083 MOV v10.d[0], v19.d[0] 1084 MOV v10.d[1], v21.d[0] 1085 1086 sQshL v6.4s, v6.4s, #2 1087 1088// VUZP.16 D12, D13 1089 1090 UZP1 v19.8h, v12.8h, v12.8h 1091 UZP2 v21.8h, v12.8h, v12.8h 1092 MOV v12.d[0], v19.d[0] 1093 MOV v12.d[1], v21.d[0] 1094 1095 1096 SQADD v14.4s, v4.4s , v20.4s 1097 1098 rev64 v10.8h, v10.8h 1099 MOV v11.d[0], v10.d[1] 1100 SQADD v6.4s, v6.4s , v20.4s 1101 1102 rev64 v12.8h, v12.8h 1103 MOV v13.d[0], v12.d[1] 1104 sshR v14.4s, v14.4s, #16 1105 1106// VUZP.16 D14, D15 1107 1108 UZP1 v19.8h, v14.8h, v14.8h 1109 UZP2 v21.8h, v14.8h, v14.8h 1110 MOV v14.d[0], v19.d[0] 1111 MOV v15.d[0], v21.d[0] 1112 1113 1114 sshR v6.4s, v6.4s, #16 1115 1116// VUZP.16 D6, D7 1117 1118 UZP1 v19.8h, v6.8h, v6.8h 1119 UZP2 v21.8h, v6.8h, v6.8h 1120 MOV v6.d[0], v19.d[0] 1121 MOV v7.d[0], v21.d[0] 1122 1123 1124 mov v15.8b, v6.8b 1125 sQshL v8.4s, v8.4s, v16.4s 1126 1127 LD2 { v4.4s, v5.4s}, [x8] 1128 ADD x8, x8, #32 1129 MOV v6.16b, v5.16b 1130 1131 sQshL v30.4s, v30.4s, v16.4s 1132 1133// VUZP.16 D4, D5 1134 1135 UZP1 v19.8h, v4.8h, v4.8h 1136 UZP2 v21.8h, v4.8h, v4.8h 1137 MOV v4.d[0], v19.d[0] 1138 MOV v5.d[0], v21.d[0] 1139 1140 1141 SQSUB v8.4s, v8.4s , v24.4s 1142 1143// VUZP.16 D6, D7 1144 1145 UZP1 v19.8h, v6.8h, v6.8h 1146 UZP2 v21.8h, v6.8h, v6.8h 1147 MOV v6.d[0], v19.d[0] 1148 MOV v7.d[0], v21.d[0] 1149 1150 1151 SQSUB v22.4s, v30.4s , v22.4s 1152 1153 sQshL v30.4s, v8.4s, #2 1154 1155 LD2 {v8.4h, v9.4h}, [x2] 1156 ADD x2, x2, #16 1157 sQshL v22.4s, v22.4s, #2 1158 1159 SQADD v30.4s, v30.4s , v20.4s 1160 SQADD v22.4s, v22.4s , v20.4s 1161 1162 sshR v30.4s, v30.4s, #16 1163 1164// VUZP.16 D30, D31 1165 1166 UZP1 v19.8h, v30.8h, v30.8h 1167 UZP2 v21.8h, v30.8h, v30.8h 1168 MOV v30.d[0], v19.d[0] 1169 MOV v30.d[1], v21.d[0] 1170 1171 1172 sshR v22.4s, v22.4s, #16 1173 1174 1175// VUZP.16 D22, D23 1176 UZP1 v19.8h, v22.8h, v22.8h 1177 UZP2 v21.8h, v22.8h, 
v22.8h 1178 MOV v22.d[0], v19.d[0] 1179 MOV v23.d[0], v21.d[0] 1180 1181 1182 mov v23.8b, v30.8b 1183 1184 SUBS x3, x3, #1 1185 BNE CORE_LOOP 1186 1187 1188 1189 1190 1191EPILOGUE: 1192 1193 ST1 {v14.h}[0], [x0] 1194 ADD x0, x0, x9 1195 uMULL v30.4s, v0.4h, v9.4h 1196 1197 ST1 {v22.h}[0], [x0] 1198 ADD x0, x0, x9 1199 uMULL v28.4s, v2.4h, v8.4h 1200 1201 ST1 {v14.h}[1], [x0] 1202 ADD x0, x0, x9 1203 uMULL v26.4s, v0.4h, v8.4h 1204 1205 ST1 {v22.h}[1], [x0] 1206 ADD x0, x0, x9 1207 uMULL v24.4s, v2.4h, v9.4h 1208 1209 ST1 {v14.h}[2], [x0] 1210 ADD x0, x0, x9 1211 ushR v30.4s, v30.4s, #16 1212 1213 ST1 {v22.h}[2], [x0] 1214 ADD x0, x0, x9 1215 ushR v28.4s, v28.4s, #16 1216 1217 ST1 {v14.h}[3], [x0] 1218 ADD x0, x0, x9 1219 sMLAL v30.4s, v1.4h, v9.4h 1220 1221 ST1 {v22.h}[3], [x0] 1222 ADD x0, x0, x9 1223 sMLAL v28.4s, v3.4h, v8.4h 1224 1225 ST1 {v15.h}[0], [x5] 1226 ADD x5, x5, x10 1227 ushR v26.4s, v26.4s, #16 1228 1229 ST1 {v23.h}[0], [x5] 1230 ADD x5, x5, x10 1231 ushR v24.4s, v24.4s, #16 1232 1233 ST1 {v15.h}[1], [x5] 1234 ADD x5, x5, x10 1235 sMLAL v26.4s, v1.4h, v8.4h 1236 1237 ST1 {v23.h}[1], [x5] 1238 ADD x5, x5, x10 1239 sMLAL v24.4s, v3.4h, v9.4h 1240 1241 ST1 {v15.h}[2], [x5] 1242 ADD x5, x5, x10 1243 ADD v30.4s, v30.4s , v28.4s 1244 1245 ST1 {v23.h}[2], [x5] 1246 ADD x5, x5, x10 1247 NEG v30.4s, v30.4s 1248 1249 ST1 {v15.h}[3], [x5] 1250 ADD x5, x5, x10 1251 1252 1253 ST1 {v23.h}[3], [x5] 1254 ADD x5, x5, x10 1255 SUB v28.4s, v24.4s , v26.4s 1256 1257 1258 uMULL v22.4s, v4.4h, v8.4h 1259 mov v26.16b, v30.16b 1260 mov v24.16b, v28.16b 1261 1262 mov v26.16b, v30.16b 1263 mov v24.16b, v28.16b 1264 1265 //VUZP.16 D26, D27 1266 1267 UZP1 v19.8h, v26.8h, v26.8h 1268 UZP2 v21.8h, v26.8h, v26.8h 1269 MOV v26.d[0], v19.d[0] 1270 MOV v27.d[0], v21.d[0] 1271 1272// VUZP.16 D24, D25 1273 1274 UZP1 v19.8h, v24.8h, v24.8h 1275 UZP2 v21.8h, v24.8h, v24.8h 1276 MOV v24.d[0], v19.d[0] 1277 MOV v25.d[0], v21.d[0] 1278 1279 uMULL v2.4s, v24.4h, v18.4h 1280 uMULL v0.4s, 
v26.4h, v18.4h 1281 1282 ushR v22.4s, v22.4s, #16 1283 sMLAL v22.4s, v5.4h, v8.4h 1284 1285 ushR v2.4s, v2.4s, #16 1286 ushR v0.4s, v0.4s, #16 1287 sMLAL v2.4s, v25.4h, v18.4h 1288 sMLAL v0.4s, v27.4h, v18.4h 1289 1290 uMULL v24.4s, v4.4h, v9.4h 1291 uMULL v26.4s, v6.4h, v8.4h 1292 1293 NEG v2.4s, v2.4s 1294 ADD v28.4s, v28.4s , v0.4s 1295 ADD v30.4s, v30.4s , v2.4s 1296 1297 uMULL v0.4s, v6.4h, v9.4h 1298 sshR v24.4s, v24.4s, #16 1299 sMLAL v24.4s, v5.4h, v9.4h 1300 sshR v26.4s, v26.4s, #16 1301 sshR v0.4s, v0.4s, #16 1302 sMLAL v26.4s, v7.4h, v8.4h 1303 sMLAL v0.4s, v7.4h, v9.4h 1304 1305 1306 1307 1308 1309 ADD v22.4s, v22.4s , v0.4s 1310 NEG v22.4s, v22.4s 1311 SUB v24.4s, v26.4s , v24.4s 1312 1313 1314 1315 1316 //LDR w11, [sp, #120] 1317 //sxtw x11,w11 1318 MOV w11, w26 1319 dup v14.4s, w11 1320 SQADD v28.4s, v28.4s , v14.4s 1321 //LDR w11, [sp, #116] 1322 //sxtw x11,w11 1323 MOV w11, w25 1324 dup v0.4s, w11 1325 sQshL v28.4s, v28.4s, v0.4s 1326 1327 1328 mov v0.16b, v22.16b 1329 mov v14.16b, v24.16b 1330 1331 1332// VUZP.16 D22, D23 1333 1334 UZP1 v19.8h, v22.8h, v22.8h 1335 UZP2 v21.8h, v22.8h, v22.8h 1336 MOV v22.d[0], v19.d[0] 1337 MOV v23.d[0], v21.d[0] 1338 1339// VUZP.16 D24, D25 1340 1341 UZP1 v19.8h, v24.8h, v24.8h 1342 UZP2 v21.8h, v24.8h, v24.8h 1343 MOV v24.d[0], v19.d[0] 1344 MOV v25.d[0], v21.d[0] 1345 1346 uMULL v8.4s, v24.4h, v18.4h 1347 uMULL v26.4s, v22.4h, v18.4h 1348 1349 NEG v2.4s, v30.4s 1350 1351// VUZP.16 D30, D31 1352 1353 UZP1 v19.8h, v30.8h, v30.8h 1354 UZP2 v21.8h, v30.8h, v30.8h 1355 MOV v30.d[0], v19.d[0] 1356 MOV v30.d[1], v21.d[0] 1357 1358// VUZP.16 D2, D3 1359 1360 UZP1 v19.8h, v2.8h, v2.8h 1361 UZP2 v21.8h, v2.8h, v2.8h 1362 MOV v2.d[0], v19.d[0] 1363 MOV v3.d[0], v21.d[0] 1364 1365 uMULL v4.4s, v30.4h, v12.4h 1366 uMULL v6.4s, v2.4h, v13.4h 1367 1368 ushR v8.4s, v8.4s, #16 1369 ushR v26.4s, v26.4s, #16 1370 1371 sMLAL v8.4s, v25.4h, v18.4h 1372 sMLAL v26.4s, v23.4h, v18.4h 1373 1374 ushR v4.4s, v4.4s, #16 1375 ushR v6.4s, 
v6.4s, #16 1376 1377 MOV v19.d[0], v30.d[1] 1378 1379 sMLAL v4.4s, v19.4h, v12.4h 1380 sMLAL v6.4s, v3.4h, v13.4h 1381 1382 NEG v8.4s, v8.4s 1383 ADD v14.4s, v14.4s , v26.4s 1384 ADD v0.4s, v0.4s , v8.4s 1385 1386 //LDR w11, [sp, #120] 1387 //sxtw x11,w11 1388 MOV w11, w26 1389 dup v8.4s, w11 1390 SQADD v0.4s, v0.4s , v8.4s 1391 //LDR w11, [sp, #116] 1392 //sxtw x11,w11 1393 MOV w11, w25 1394 dup v26.4s, w11 1395 sQshL v0.4s, v0.4s, v26.4s 1396 1397 1398 mov v26.16b, v28.16b 1399 1400 LD2 { v28.4s, v29.4s}, [x4] 1401 MOV v30.16b, v29.16b 1402 MOV v29.d[0], v28.d[1] 1403// VZIP.32 Q13, Q0 1404 1405 ZIP1 v19.4s, v26.4s, v0.4s 1406 ZIP2 v0.4s, v26.4s, v0.4s 1407 MOV v26.16b, v19.16b 1408 1409 ST1 { v26.4s}, [x4], #16 1410 ST1 { v0.4s}, [x4], #16 1411 1412 movi v1.2s, #0 1413// VADDL.S16 Q0, D13, D1 1414 SADDL v0.4s, v13.4h, v1.4h 1415 MOV v1.d[0], v0.d[1] 1416 1417 sMULL v26.2d, v28.2s, v0.2s 1418 Sqxtn v8.2s, v26.2d 1419 sMULL v26.2d, v29.2s, v1.2s 1420 Sqxtn v9.2s, v26.2d 1421 MOV v8.d[1], v9.d[0] 1422 movi v1.2s, #0 1423// VADDL.S16 Q0, D12, D1 1424 SADDL v0.4s, v12.4h, v1.4h 1425 MOV v1.d[0], v0.d[1] 1426 1427 sMULL v24.2d, v28.2s, v0.2s 1428 Sqxtn v26.2s, v24.2d 1429 sMULL v24.2d, v29.2s, v1.2s 1430 Sqxtn v27.2s, v24.2d 1431 MOV v26.d[1], v27.d[0] 1432 1433 sQshL v4.4s, v4.4s, v16.4s 1434 sQshL v6.4s, v6.4s, v16.4s 1435 1436 SQSUB v4.4s, v4.4s , v8.4s 1437 SQSUB v6.4s, v6.4s , v26.4s 1438 1439 NEG v26.4s, v14.4s 1440// VUZP.16 D14, D15 1441 1442 UZP1 v19.8h, v14.8h, v14.8h 1443 UZP2 v21.8h, v14.8h, v14.8h 1444 MOV v14.d[0], v19.d[0] 1445 MOV v15.d[0], v21.d[0] 1446 1447 1448// VUZP.16 D26, D27 1449 1450 UZP1 v19.8h, v26.8h, v26.8h 1451 UZP2 v21.8h, v26.8h, v26.8h 1452 MOV v26.d[0], v19.d[0] 1453 MOV v27.d[0], v21.d[0] 1454 1455 1456 movi v1.2s, #0 1457 //VADDL.S16 Q0, D10, D1 1458 SADDL v0.4s, v10.4h, v1.4h 1459 MOV v1.d[0], v0.d[1] 1460 1461 sMULL v22.2d, v30.2s, v0.2s 1462 Sqxtn v24.2s, v22.2d 1463 sMULL2 v22.2d, v30.4s, v0.4s 1464 Sqxtn v25.2s, v22.2d 1465 MOV 
v24.d[1], v25.d[0] 1466 movi v1.2s, #0 1467 //VADDL.S16 Q0, D11, D1 1468 SADDL v0.4s, v11.4h, v1.4h 1469 MOV v1.d[0], v0.d[1] 1470 1471 sMULL v8.2d, v30.2s, v0.2s 1472 Sqxtn v22.2s, v8.2d 1473 sMULL2 v8.2d, v30.4s, v0.4s 1474 Sqxtn v23.2s, v8.2d 1475 MOV v22.d[1], v23.d[0] 1476 1477 uMULL v8.4s, v26.4h, v11.4h 1478 uMULL v30.4s, v14.4h, v10.4h 1479 1480 ushR v8.4s, v8.4s, #16 1481 1482 ushR v30.4s, v30.4s, #16 1483 1484 sMLAL v8.4s, v27.4h, v11.4h 1485 1486 sMLAL v30.4s, v15.4h, v10.4h 1487 1488 sQshL v4.4s, v4.4s, #2 1489 1490 sQshL v6.4s, v6.4s, #2 1491 1492 SQADD v14.4s, v4.4s , v20.4s 1493 1494 SQADD v6.4s, v6.4s , v20.4s 1495 1496 sshR v14.4s, v14.4s, #16 1497 1498// VUZP.16 D14, D15 1499 1500 UZP1 v19.8h, v14.8h, v14.8h 1501 UZP2 v21.8h, v14.8h, v14.8h 1502 MOV v14.d[0], v19.d[0] 1503 MOV v15.d[0], v21.d[0] 1504 1505 sshR v6.4s, v6.4s, #16 1506 1507// VUZP.16 D6, D7 1508 1509 UZP1 v19.8h, v6.8h, v6.8h 1510 UZP2 v21.8h, v6.8h, v6.8h 1511 MOV v6.d[0], v19.d[0] 1512 MOV v7.d[0], v21.d[0] 1513 1514 mov v15.8b, v6.8b 1515 sQshL v8.4s, v8.4s, v16.4s 1516 1517 sQshL v30.4s, v30.4s, v16.4s 1518 1519 SQSUB v8.4s, v8.4s , v24.4s 1520 1521 SQSUB v22.4s, v30.4s , v22.4s 1522 1523 sQshL v30.4s, v8.4s, #2 1524 1525 sQshL v22.4s, v22.4s, #2 1526 1527 SQADD v30.4s, v30.4s , v20.4s 1528 SQADD v22.4s, v22.4s , v20.4s 1529 1530 sshR v30.4s, v30.4s, #16 1531 1532 //VUZP.16 D30, D31 1533 1534 UZP1 v19.8h, v30.8h, v30.8h 1535 UZP2 v21.8h, v30.8h, v30.8h 1536 MOV v30.d[0], v19.d[0] 1537 MOV v30.d[1], v21.d[0] 1538 1539 sshR v22.4s, v22.4s, #16 1540 1541// VUZP.16 D22, D23 1542 UZP1 v19.8h, v22.8h, v22.8h 1543 UZP2 v21.8h, v22.8h, v22.8h 1544 MOV v22.d[0], v19.d[0] 1545 MOV v23.d[0], v21.d[0] 1546 1547 mov v23.8b, v30.8b 1548 1549 1550 1551 1552 ST1 {v14.h}[0], [x0] 1553 ADD x0, x0, x9 1554 ST1 {v22.h}[0], [x0] 1555 ADD x0, x0, x9 1556 ST1 {v14.h}[1], [x0] 1557 ADD x0, x0, x9 1558 ST1 {v22.h}[1], [x0] 1559 ADD x0, x0, x9 1560 ST1 {v14.h}[2], [x0] 1561 ADD x0, x0, x9 1562 ST1 
{v22.h}[2], [x0] 1563 ADD x0, x0, x9 1564 ST1 {v14.h}[3], [x0] 1565 ADD x0, x0, x9 1566 ST1 {v22.h}[3], [x0] 1567 ADD x0, x0, x9 1568 ST1 {v15.h}[0], [x5] 1569 ADD x5, x5, x10 1570 ST1 {v23.h}[0], [x5] 1571 ADD x5, x5, x10 1572 ST1 {v15.h}[1], [x5] 1573 ADD x5, x5, x10 1574 ST1 {v23.h}[1], [x5] 1575 ADD x5, x5, x10 1576 ST1 {v15.h}[2], [x5] 1577 ADD x5, x5, x10 1578 ST1 {v23.h}[2], [x5] 1579 ADD x5, x5, x10 1580 ST1 {v15.h}[3], [x5] 1581 ADD x5, x5, x10 1582 ST1 {v23.h}[3], [x5] 1583 ADD x5, x5, x10 1584 1585ARM_EPILOGUE: 1586 1587ARM_LOOP: 1588 1589 LD2 { v0.4s, v1.4s}, [x1] 1590 MOV v2.16b, v1.16b 1591 1592 //VUZP.16 D0, D1 1593 UZP1 v19.8h, v0.8h, v0.8h 1594 UZP2 v21.8h, v0.8h, v0.8h 1595 MOV v0.d[0], v19.d[0] 1596 MOV v0.d[1], v21.d[0] 1597 1598 //VUZP.16 D2, D3 1599 UZP1 v19.8h, v2.8h, v2.8h 1600 UZP2 v21.8h, v2.8h, v2.8h 1601 MOV v2.d[0], v19.d[0] 1602 MOV v2.d[1], v21.d[0] 1603 1604 1605 rev64 v0.8h, v0.8h 1606 MOV v1.d[0], v0.d[1] 1607 rev64 v2.8h, v2.8h 1608 MOV v3.d[0], v2.d[1] 1609 1610 LD2 {v8.4h, v9.4h}, [x2] 1611 ADD x2, x2, #16 1612 1613 LD2 {v4.2s, v5.2s}, [x8] 1614 ADD x8, x8, #16 1615 MOV v6.16b, v5.16b 1616 movi v5.2s, #0x00000000 1617 movi v7.2s, #0x00000000 1618 1619 LD1 {v5.s}[0], [x8], #4 1620 LD1 {v7.s}[0], [x8] 1621 1622 MOV x12, #16 1623 MOV v4.d[1], v5.d[0] 1624 MOV v6.d[1], v7.d[0] 1625// VUZP.16 D4, D5 1626 1627 UZP1 v19.8h, v4.8h, v4.8h 1628 UZP2 v21.8h, v4.8h, v4.8h 1629 MOV v4.d[0], v19.d[0] 1630 MOV v5.d[0], v21.d[0] 1631 1632// VUZP.16 D6, D7 1633 1634 UZP1 v19.8h, v6.8h, v6.8h 1635 UZP2 v21.8h, v6.8h, v6.8h 1636 MOV v6.d[0], v19.d[0] 1637 MOV v7.d[0], v21.d[0] 1638 1639 ADD x6, x6, #16 1640 1641 MOV x12, #-4 1642 LD2 {v11.2s, v12.2s}, [x6] 1643 ADD x6, x6, x12 1644 MOV v13.16b, v12.16b 1645 1646 1647 movi v10.2s, #0x00000000 1648 1649 LD1 {v12.s}[1], [x6] 1650 ADD x6, x6, x12 1651 LD1 {v10.s}[1], [x6] 1652 ADD x6, x6, x12 1653 LD1 {v12.s}[0], [x6] 1654 ADD x6, x6, x12 1655 1656 MOV v10.d[1], v11.d[0] 1657 MOV v12.d[1], v13.d[0] 
1658 1659 //VUZP.16 D10, D11 1660 1661 UZP1 v19.8h, v10.8h, v10.8h 1662 UZP2 v21.8h, v10.8h, v10.8h 1663 MOV v10.d[0], v19.d[0] 1664 MOV v10.d[1], v21.d[0] 1665 1666 //VUZP.16 D12, D13 1667 1668 UZP1 v19.8h, v12.8h, v12.8h 1669 UZP2 v21.8h, v12.8h, v12.8h 1670 MOV v12.d[0], v19.d[0] 1671 MOV v12.d[1], v21.d[0] 1672 1673 1674 rev64 v10.8h, v10.8h 1675 MOV v11.d[0], v10.d[1] 1676 rev64 v12.8h, v12.8h 1677 MOV v13.d[0], v12.d[1] 1678 1679 uMULL v30.4s, v0.4h, v9.4h 1680 uMULL v28.4s, v2.4h, v8.4h 1681 uMULL v26.4s, v0.4h, v8.4h 1682 uMULL v24.4s, v2.4h, v9.4h 1683 1684 ushR v30.4s, v30.4s, #16 1685 ushR v28.4s, v28.4s, #16 1686 1687 sMLAL v30.4s, v1.4h, v9.4h 1688 sMLAL v28.4s, v3.4h, v8.4h 1689 1690 ushR v26.4s, v26.4s, #16 1691 ushR v24.4s, v24.4s, #16 1692 1693 sMLAL v26.4s, v1.4h, v8.4h 1694 sMLAL v24.4s, v3.4h, v9.4h 1695 1696 ADD v30.4s, v30.4s , v28.4s 1697 NEG v30.4s, v30.4s 1698 1699 uMULL v22.4s, v4.4h, v8.4h 1700 1701 SUB v28.4s, v24.4s , v26.4s 1702 1703 1704 mov v26.16b, v30.16b 1705 mov v24.16b, v28.16b 1706 1707// VUZP.16 D26, D27 1708 1709 UZP1 v19.8h, v26.8h, v26.8h 1710 UZP2 v21.8h, v26.8h, v26.8h 1711 MOV v26.d[0], v19.d[0] 1712 MOV v27.d[0], v21.d[0] 1713 1714 //VUZP.16 D24, D25 1715 1716 UZP1 v19.8h, v24.8h, v24.8h 1717 UZP2 v21.8h, v24.8h, v24.8h 1718 MOV v24.d[0], v19.d[0] 1719 MOV v25.d[0], v21.d[0] 1720 1721 uMULL v2.4s, v24.4h, v18.4h 1722 uMULL v0.4s, v26.4h, v18.4h 1723 1724 ushR v22.4s, v22.4s, #16 1725 sMLAL v22.4s, v5.4h, v8.4h 1726 1727 ushR v2.4s, v2.4s, #16 1728 ushR v0.4s, v0.4s, #16 1729 sMLAL v2.4s, v25.4h, v18.4h 1730 sMLAL v0.4s, v27.4h, v18.4h 1731 1732 uMULL v24.4s, v4.4h, v9.4h 1733 uMULL v26.4s, v6.4h, v8.4h 1734 1735 NEG v2.4s, v2.4s 1736 ADD v28.4s, v28.4s , v0.4s 1737 ADD v30.4s, v30.4s , v2.4s 1738 1739 uMULL v0.4s, v6.4h, v9.4h 1740 sshR v24.4s, v24.4s, #16 1741 sMLAL v24.4s, v5.4h, v9.4h 1742 sshR v26.4s, v26.4s, #16 1743 sshR v0.4s, v0.4s, #16 1744 sMLAL v26.4s, v7.4h, v8.4h 1745 sMLAL v0.4s, v7.4h, v9.4h 1746 1747 ADD 
v22.4s, v22.4s , v0.4s 1748 NEG v22.4s, v22.4s 1749 SUB v24.4s, v26.4s , v24.4s 1750 1751 //LDR w11, [sp, #120] 1752 //sxtw x11,w11 1753 MOV w11, w26 1754 dup v14.4s, w11 1755 SQADD v28.4s, v28.4s , v14.4s 1756 //LDR w11, [sp, #116] 1757 //sxtw x11,w11 1758 MOV w11, w25 1759 dup v0.4s, w11 1760 sQshL v28.4s, v28.4s, v0.4s 1761 1762 mov v0.16b, v22.16b 1763 mov v14.16b, v24.16b 1764 1765// VUZP.16 D22, D23 1766 1767 UZP1 v19.8h, v22.8h, v22.8h 1768 UZP2 v21.8h, v22.8h, v22.8h 1769 MOV v22.d[0], v19.d[0] 1770 MOV v23.d[0], v21.d[0] 1771 1772// VUZP.16 D24, D25 1773 1774 UZP1 v19.8h, v24.8h, v24.8h 1775 UZP2 v21.8h, v24.8h, v24.8h 1776 MOV v24.d[0], v19.d[0] 1777 MOV v25.d[0], v21.d[0] 1778 1779 uMULL v8.4s, v24.4h, v18.4h 1780 uMULL v26.4s, v22.4h, v18.4h 1781 1782 NEG v2.4s, v30.4s 1783// VUZP.16 D30, D31 1784 1785 UZP1 v19.8h, v30.8h, v30.8h 1786 UZP2 v21.8h, v30.8h, v30.8h 1787 MOV v30.d[0], v19.d[0] 1788 MOV v30.d[1], v21.d[0] 1789 1790// VUZP.16 D2, D3 1791 1792 UZP1 v19.8h, v2.8h, v2.8h 1793 UZP2 v21.8h, v2.8h, v2.8h 1794 MOV v2.d[0], v19.d[0] 1795 MOV v3.d[0], v21.d[0] 1796 1797 uMULL v4.4s, v30.4h, v12.4h 1798 uMULL v6.4s, v2.4h, v13.4h 1799 1800 ushR v8.4s, v8.4s, #16 1801 ushR v26.4s, v26.4s, #16 1802 1803 sMLAL v8.4s, v25.4h, v18.4h 1804 sMLAL v26.4s, v23.4h, v18.4h 1805 1806 ushR v4.4s, v4.4s, #16 1807 ushR v6.4s, v6.4s, #16 1808 1809 MOV v19.d[0], v30.d[1] 1810 1811 sMLAL v4.4s, v19.4h, v12.4h 1812 sMLAL v6.4s, v3.4h, v13.4h 1813 1814 NEG v8.4s, v8.4s 1815 ADD v14.4s, v14.4s , v26.4s 1816 ADD v0.4s, v0.4s , v8.4s 1817 1818 //LDR w11, [sp, #120] 1819 //sxtw x11,w11 1820 MOV w11, w26 1821 dup v8.4s, w11 1822 SQADD v0.4s, v0.4s , v8.4s 1823 //LDR w11, [sp, #116] 1824 //sxtw x11,w11 1825 MOV w11, w25 1826 dup v26.4s, w11 1827 sQshL v0.4s, v0.4s, v26.4s 1828 1829 mov v26.16b, v28.16b 1830 1831 MOV x6, x4 1832 1833 LD1 {v28.2s, v29.2s}, [x4], #16 1834 movi v19.2s, #0x00000000 1835 LD1 {v30.s}[0], [x4], #4 1836 LD1 {v30.s}[1], [x4], #4 1837 LD1 {v19.s}[0], 
[x4], #4 1838 1839 MOV v28.d[1], v29.d[0] 1840 MOV v30.d[1], v19.d[0] 1841 1842 //VUZP.32 Q14, Q15 1843 1844 UZP1 v19.4s, v28.4s, v30.4s 1845 UZP2 v30.4s, v28.4s, v30.4s 1846 MOV v28.16b, v19.16b 1847 MOV v29.d[0], v28.d[1] 1848 1849 ST1 {v26.s}[0], [x6], #4 1850 ST1 {v0.s}[0], [x6], #4 1851 ST1 {v26.s}[1], [x6], #4 1852 ST1 {v0.s}[1], [x6], #4 1853 ST1 {v26.s}[2], [x6], #4 1854 ST1 {v0.s}[2], [x6], #4 1855 ST1 {v26.s}[3], [x6], #4 1856 1857 movi v1.2s, #0 1858 //VADDL.S16 Q0, D13, D1 1859 SADDL v0.4s, v13.4h, v1.4h 1860 MOV v1.d[0], v0.d[1] 1861 1862 sMULL v26.2d, v28.2s, v0.2s 1863 Sqxtn v8.2s, v26.2d 1864 sMULL v26.2d, v29.2s, v1.2s 1865 Sqxtn v9.2s, v26.2d 1866 MOV v8.d[1], v9.d[0] 1867 movi v1.2s, #0 1868 //VADDL.S16 Q0, D12, D1 1869 SADDL v0.4s, v12.4h, v1.4h 1870 MOV v1.d[0], v0.d[1] 1871 1872 sMULL v24.2d, v28.2s, v0.2s 1873 Sqxtn v26.2s, v24.2d 1874 sMULL v24.2d, v29.2s, v1.2s 1875 Sqxtn v27.2s, v24.2d 1876 MOV v26.d[1], v27.d[0] 1877 1878 sQshL v4.4s, v4.4s, v16.4s 1879 sQshL v6.4s, v6.4s, v16.4s 1880 1881 SQSUB v4.4s, v4.4s , v8.4s 1882 SQSUB v6.4s, v6.4s , v26.4s 1883 1884 NEG v26.4s, v14.4s 1885 //VUZP.16 D14, D15 1886 1887 UZP1 v19.8h, v14.8h, v14.8h 1888 UZP2 v21.8h, v14.8h, v14.8h 1889 MOV v14.d[0], v19.d[0] 1890 MOV v15.d[0], v21.d[0] 1891 1892// VUZP.16 D26, D27 1893 1894 UZP1 v19.8h, v26.8h, v26.8h 1895 UZP2 v21.8h, v26.8h, v26.8h 1896 MOV v26.d[0], v19.d[0] 1897 MOV v27.d[0], v21.d[0] 1898 1899 1900 movi v1.2s, #0 1901 //VADDL.S16 Q0, D10, D1 1902 SADDL v0.4s, v10.4h, v1.4h 1903 MOV v1.d[0], v0.d[1] 1904 1905 sMULL v22.2d, v30.2s, v0.2s 1906 Sqxtn v24.2s, v22.2d 1907 sMULL2 v22.2d, v30.4s, v0.4s 1908 Sqxtn v25.2s, v22.2d 1909 MOV v24.d[1], v25.d[0] 1910 1911 movi v1.2s, #0 1912// VADDL.S16 Q0, D11, D1 1913 SADDL v0.4s, v11.4h, v1.4h 1914 MOV v1.d[0], v0.d[1] 1915 1916 sMULL v8.2d, v30.2s, v0.2s 1917 Sqxtn v22.2s, v8.2d 1918 sMULL2 v8.2d, v30.4s, v0.4s 1919 Sqxtn v23.2s, v8.2d 1920 MOV v22.d[1], v23.d[0] 1921 1922 uMULL v8.4s, v26.4h, v11.4h 1923 
uMULL v30.4s, v14.4h, v10.4h 1924 1925 ushR v8.4s, v8.4s, #16 1926 1927 ushR v30.4s, v30.4s, #16 1928 1929 sMLAL v8.4s, v27.4h, v11.4h 1930 1931 sMLAL v30.4s, v15.4h, v10.4h 1932 1933 sQshL v4.4s, v4.4s, #2 1934 1935 sQshL v6.4s, v6.4s, #2 1936 1937 SQADD v14.4s, v4.4s , v20.4s 1938 1939 SQADD v6.4s, v6.4s , v20.4s 1940 1941 sshR v14.4s, v14.4s, #16 1942 1943// VUZP.16 D14, D15 1944 1945 UZP1 v19.8h, v14.8h, v14.8h 1946 UZP2 v21.8h, v14.8h, v14.8h 1947 MOV v14.d[0], v19.d[0] 1948 MOV v15.d[0], v21.d[0] 1949 1950 sshR v6.4s, v6.4s, #16 1951 1952 //VUZP.16 D6, D7 1953 1954 UZP1 v19.8h, v6.8h, v6.8h 1955 UZP2 v21.8h, v6.8h, v6.8h 1956 MOV v6.d[0], v19.d[0] 1957 MOV v7.d[0], v21.d[0] 1958 1959 mov v15.8b, v6.8b 1960 sQshL v8.4s, v8.4s, v16.4s 1961 1962 sQshL v30.4s, v30.4s, v16.4s 1963 1964 SQSUB v8.4s, v8.4s , v24.4s 1965 1966 SQSUB v22.4s, v30.4s , v22.4s 1967 1968 sQshL v30.4s, v8.4s, #2 1969 1970 sQshL v22.4s, v22.4s, #2 1971 1972 SQADD v30.4s, v30.4s , v20.4s 1973 SQADD v22.4s, v22.4s , v20.4s 1974 1975 sshR v30.4s, v30.4s, #16 1976 1977// VUZP.16 D30, D31 1978 1979 UZP1 v19.8h, v30.8h, v30.8h 1980 UZP2 v21.8h, v30.8h, v30.8h 1981 MOV v30.d[0], v19.d[0] 1982 MOV v30.d[1], v21.d[0] 1983 1984 sshR v22.4s, v22.4s, #16 1985 1986// VUZP.16 D22, D23 1987 1988 UZP1 v19.8h, v22.8h, v22.8h 1989 UZP2 v21.8h, v22.8h, v22.8h 1990 MOV v22.d[0], v19.d[0] 1991 MOV v23.d[0], v21.d[0] 1992 1993 mov v23.8b, v30.8b 1994 1995 1996 1997 1998 ST1 {v14.h}[0], [x0] 1999 ADD x0, x0, x9 2000 ST1 {v22.h}[0], [x0] 2001 ADD x0, x0, x9 2002 ST1 {v14.h}[1], [x0] 2003 ADD x0, x0, x9 2004 ST1 {v22.h}[1], [x0] 2005 ADD x0, x0, x9 2006 ST1 {v14.h}[2], [x0] 2007 ADD x0, x0, x9 2008 ST1 {v22.h}[2], [x0] 2009 ADD x0, x0, x9 2010 ST1 {v14.h}[3], [x0] 2011 ADD x0, x0, x9 2012 2013 ST1 {v15.h}[0], [x5] 2014 ADD x5, x5, x10 2015 ST1 {v23.h}[0], [x5] 2016 ADD x5, x5, x10 2017 ST1 {v15.h}[1], [x5] 2018 ADD x5, x5, x10 2019 ST1 {v23.h}[1], [x5] 2020 ADD x5, x5, x10 2021 ST1 {v15.h}[2], [x5] 2022 ADD x5, x5, 
x10 2023 ST1 {v23.h}[2], [x5] 2024 ADD x5, x5, x10 2025 ST1 {v15.h}[3], [x5] 2026 ADD x5, x5, x10 2027 2028 // VPOP {d8 - d15} 2029 // LDMFD sp!, {x4-x12} 2030 //ldp x19, x20,[sp],#16 2031 pop_v_regs 2032 ret 2033 //BX x14 2034