# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

# Under a memory-sanitizer build OPENSSL_NO_ASM is forced on, which makes the
# second guard below false and skips all of the assembly.
#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text

# _GFp_bn_mul_mont
#
# Only exported symbol in the visible source. The byte string at the end of
# the file labels this code as Montgomery multiplication for x86_64.
#
# Registers on entry, as they are used by the code below:
#   rdi  Destination; r9 64-bit words are written to it.
#   rsi  First source operand, read as 64-bit words.
#   rdx  Second source operand, read one 64-bit word per outer iteration.
#   rcx  Third source operand; multiplied in on every row and subtracted
#        from the intermediate result at the end. Presumably the modulus,
#        TODO confirm against the C prototype.
#   r8   Pointer to one 64-bit constant, loaded once and multiplied with the
#        low word of every row. Presumably the Montgomery n0 value,
#        TODO confirm against the C prototype.
#   r9   Word count; only the low 32 bits are used.
# Every path stores 1 in rax before returning and restores rbx, rbp and
# r12-r15 from the six slots pushed below the entry rsp.
#
# Dispatch performed at entry:
#   r9 not a multiple of 4, or r9 < 8      -> L$mul_enter   (one word per step)
#   rdx == rsi and r9 a multiple of 8      -> L$sqr8x_enter
#   otherwise                              -> L$mul4x_enter, which itself
#        branches to L$mulx4x_enter when both bits of mask 0x80100 are set in
#        the 32-bit word at _GFp_ia32cap_P+8 (presumably the BMI2 and ADX
#        feature bits, matching the mulx/adcx/adox use on that path;
#        TODO confirm against the definition of _GFp_ia32cap_P).


.globl _GFp_bn_mul_mont
.private_extern _GFp_bn_mul_mont

.p2align 4
_GFp_bn_mul_mont:

	movl %r9d,%r9d			# 32-bit move clears the upper half of r9
	movq %rsp,%rax			# rax = entry rsp, kept for the epilogue

	testl $3,%r9d
	jnz L$mul_enter			# count not a multiple of 4
	cmpl $8,%r9d
	jb L$mul_enter			# count below 8
	movl _GFp_ia32cap_P+8(%rip),%r11d	# capability word, tested at L$mul4x_enter
	cmpq %rsi,%rdx
	jne L$mul4x_enter		# operands are different pointers
	testl $7,%r9d
	jz L$sqr8x_enter		# same pointer and count a multiple of 8
	jmp L$mul4x_enter

# Generic path: one word of the rsi operand per inner iteration.
.p2align 4
L$mul_enter:
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15

	# Compute the new frame base r10 = (rsp - 16 - 8*r9) rounded down to a
	# multiple of 1024. The frame holds r9+1 temporary words followed by
	# the saved entry rsp.
	negq %r9
	movq %rsp,%r11
	leaq -16(%rsp,%r9,8),%r10
	negq %r9
	andq $-1024,%r10

	# Move rsp to r10 plus a whole number of 4096-byte steps, then walk it
	# down one step at a time, reading a word at each stop (presumably so
	# that every page of the frame is touched in order).
	subq %r10,%r11
	andq $-4096,%r11
	leaq (%r10,%r11,1),%rsp
	movq (%rsp),%r11
	cmpq %r10,%rsp
	ja L$mul_page_walk
	jmp L$mul_page_walk_done

.p2align 4
L$mul_page_walk:
	leaq -4096(%rsp),%rsp
	movq (%rsp),%r11
	cmpq %r10,%rsp
	ja L$mul_page_walk
L$mul_page_walk_done:

	movq %rax,8(%rsp,%r9,8)		# save entry rsp just above the temporaries

L$mul_body:
	# Register roles from here on:
	#   r12 = rdx operand pointer, rbx = its current word
	#   r8  = constant loaded from (r8), rbp = per-row multiplier
	#   r14 = outer index, r15 = inner index
	movq %rdx,%r12
	movq (%r8),%r8
	movq (%r12),%rbx
	movq (%rsi),%rax

	xorq %r14,%r14
	xorq %r15,%r15

	movq %r8,%rbp
	mulq %rbx			# rdx:rax = rsi[0] * rbx
	movq %rax,%r10
	movq (%rcx),%rax

	imulq %r10,%rbp			# rbp = low word of product * constant
	movq %rdx,%r11

	mulq %rbp			# rdx:rax = rcx[0] * rbp
	addq %rax,%r10			# low word is only needed for its carry
	movq 8(%rsi),%rax
	adcq $0,%rdx
	movq %rdx,%r13

	leaq 1(%r15),%r15
	jmp L$1st_enter

# First row: accumulate rsi[j]*rbx + rcx[j]*rbp and store each word one
# position lower, at -16(%rsp,%r15,8).
.p2align 4
L$1st:
	addq %rax,%r13
	movq (%rsi,%r15,8),%rax
	adcq $0,%rdx
	addq %r11,%r13
	movq %r10,%r11
	adcq $0,%rdx
	movq %r13,-16(%rsp,%r15,8)
	movq %rdx,%r13

L$1st_enter:
	mulq %rbx
	addq %rax,%r11
	movq (%rcx,%r15,8),%rax
	adcq $0,%rdx
	leaq 1(%r15),%r15
	movq %rdx,%r10

	mulq %rbp
	cmpq %r9,%r15
	jne L$1st

	addq %rax,%r13
	movq (%rsi),%rax		# reload rsi[0] for the next row
	adcq $0,%rdx
	addq %r11,%r13
	adcq $0,%rdx
	movq %r13,-16(%rsp,%r15,8)
	movq %rdx,%r13
	movq %r10,%r11

	# Store the two top words of the row, including the carry word.
	xorq %rdx,%rdx
	addq %r11,%r13
	adcq $0,%rdx
	movq %r13,-8(%rsp,%r9,8)
	movq %rdx,(%rsp,%r9,8)

	leaq 1(%r14),%r14
	jmp L$outer
# Remaining rows: same as the first row, but each step also adds the
# previous contents of the temporary vector.
.p2align 4
L$outer:
	movq (%r12,%r14,8),%rbx		# next word of the rdx operand
	xorq %r15,%r15
	movq %r8,%rbp
	movq (%rsp),%r10
	mulq %rbx
	addq %rax,%r10
	movq (%rcx),%rax
	adcq $0,%rdx

	imulq %r10,%rbp
	movq %rdx,%r11

	mulq %rbp
	addq %rax,%r10
	movq 8(%rsi),%rax
	adcq $0,%rdx
	movq 8(%rsp),%r10
	movq %rdx,%r13

	leaq 1(%r15),%r15
	jmp L$inner_enter

.p2align 4
L$inner:
	addq %rax,%r13
	movq (%rsi,%r15,8),%rax
	adcq $0,%rdx
	addq %r10,%r13
	movq (%rsp,%r15,8),%r10
	adcq $0,%rdx
	movq %r13,-16(%rsp,%r15,8)
	movq %rdx,%r13

L$inner_enter:
	mulq %rbx
	addq %rax,%r11
	movq (%rcx,%r15,8),%rax
	adcq $0,%rdx
	addq %r11,%r10
	movq %rdx,%r11
	adcq $0,%r11
	leaq 1(%r15),%r15

	mulq %rbp
	cmpq %r9,%r15
	jne L$inner

	addq %rax,%r13
	movq (%rsi),%rax
	adcq $0,%rdx
	addq %r10,%r13
	movq (%rsp,%r15,8),%r10
	adcq $0,%rdx
	movq %r13,-16(%rsp,%r15,8)
	movq %rdx,%r13

	xorq %rdx,%rdx
	addq %r11,%r13
	adcq $0,%rdx
	addq %r10,%r13
	adcq $0,%rdx
	movq %r13,-8(%rsp,%r9,8)
	movq %rdx,(%rsp,%r9,8)

	leaq 1(%r14),%r14
	cmpq %r9,%r14
	jb L$outer

	# Subtract the rcx operand from the temporary vector, writing the
	# difference to rdi. The xorq clears CF, so the first sbbq has no
	# incoming borrow; leaq and decq leave CF untouched inside the loop.
	xorq %r14,%r14
	movq (%rsp),%rax
	movq %r9,%r15

.p2align 4
L$sub: sbbq (%rcx,%r14,8),%rax
	movq %rax,(%rdi,%r14,8)
	movq 8(%rsp,%r14,8),%rax
	leaq 1(%r14),%r14
	decq %r15
	jnz L$sub

	# Here rax holds the top temporary word minus the final borrow. It is
	# used as the AND mask for the stack copy, and rbx = ~rax as the mask
	# for the subtracted copy already in rdi.
	sbbq $0,%rax
	movq $-1,%rbx
	xorq %rax,%rbx
	xorq %r14,%r14
	movq %r9,%r15

# Branch-free select between the two candidates, one word per iteration.
L$copy:
	movq (%rdi,%r14,8),%rcx
	movq (%rsp,%r14,8),%rdx
	andq %rbx,%rcx
	andq %rax,%rdx
	movq %r9,(%rsp,%r14,8)		# overwrite the temporary word with r9
	orq %rcx,%rdx
	movq %rdx,(%rdi,%r14,8)
	leaq 1(%r14),%r14
	subq $1,%r15
	jnz L$copy

	movq 8(%rsp,%r9,8),%rsi		# rsi = entry rsp saved after the page walk

	movq $1,%rax			# return value
	movq -48(%rsi),%r15
	movq -40(%rsi),%r14
	movq -32(%rsi),%r13
	movq -24(%rsi),%r12
	movq -16(%rsi),%rbp
	movq -8(%rsi),%rbx
	leaq (%rsi),%rsp

L$mul_epilogue:
	.byte 0xf3,0xc3			# encodes rep ret



# Four-words-per-iteration path. The bn_mul4x_mont label is not referenced
# anywhere in the visible source; this code is entered at L$mul4x_enter with
# rax = entry rsp and r11d = capability word already set by the dispatcher.
.p2align 4
bn_mul4x_mont:

	movl %r9d,%r9d
	movq %rsp,%rax

L$mul4x_enter:
	andl $0x80100,%r11d
	cmpl $0x80100,%r11d
	je L$mulx4x_enter		# both capability bits set: use the mulx path
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15

	# Frame base r10 = (rsp - 32 - 8*r9) rounded down to a multiple of
	# 1024, followed by the same page walk as on the generic path.
	negq %r9
	movq %rsp,%r11
	leaq -32(%rsp,%r9,8),%r10
	negq %r9
	andq $-1024,%r10

	subq %r10,%r11
	andq $-4096,%r11
	leaq (%r10,%r11,1),%rsp
	movq (%rsp),%r11
	cmpq %r10,%rsp
	ja L$mul4x_page_walk
	jmp L$mul4x_page_walk_done

L$mul4x_page_walk:
	leaq -4096(%rsp),%rsp
	movq (%rsp),%r11
	cmpq %r10,%rsp
	ja L$mul4x_page_walk
L$mul4x_page_walk_done:

	movq %rax,8(%rsp,%r9,8)		# save entry rsp

L$mul4x_body:
	movq %rdi,16(%rsp,%r9,8)	# save destination; rdi is reused as scratch
	movq %rdx,%r12
	movq (%r8),%r8
	movq (%r12),%rbx
	movq (%rsi),%rax

	xorq %r14,%r14
	xorq %r15,%r15

	movq %r8,%rbp
	mulq %rbx
	movq %rax,%r10
	movq (%rcx),%rax

	imulq %r10,%rbp			# per-row multiplier
	movq %rdx,%r11

	mulq %rbp
	addq %rax,%r10
	movq 8(%rsi),%rax
	adcq $0,%rdx
	movq %rdx,%rdi

	mulq %rbx
	addq %rax,%r11
	movq 8(%rcx),%rax
	adcq $0,%rdx
	movq %rdx,%r10

	mulq %rbp
	addq %rax,%rdi
	movq 16(%rsi),%rax
	adcq $0,%rdx
	addq %r11,%rdi
	leaq 4(%r15),%r15		# lea keeps CF intact for the adcq below
	adcq $0,%rdx
	movq %rdi,(%rsp)
	movq %rdx,%r13
	jmp L$1st4x
# First row, four words per iteration. r10/r11 carry the rsi*rbx partial
# products and r13/rdi carry the rcx*rbp partial products.
.p2align 4
L$1st4x:
	mulq %rbx
	addq %rax,%r10
	movq -16(%rcx,%r15,8),%rax
	adcq $0,%rdx
	movq %rdx,%r11

	mulq %rbp
	addq %rax,%r13
	movq -8(%rsi,%r15,8),%rax
	adcq $0,%rdx
	addq %r10,%r13
	adcq $0,%rdx
	movq %r13,-24(%rsp,%r15,8)
	movq %rdx,%rdi

	mulq %rbx
	addq %rax,%r11
	movq -8(%rcx,%r15,8),%rax
	adcq $0,%rdx
	movq %rdx,%r10

	mulq %rbp
	addq %rax,%rdi
	movq (%rsi,%r15,8),%rax
	adcq $0,%rdx
	addq %r11,%rdi
	adcq $0,%rdx
	movq %rdi,-16(%rsp,%r15,8)
	movq %rdx,%r13

	mulq %rbx
	addq %rax,%r10
	movq (%rcx,%r15,8),%rax
	adcq $0,%rdx
	movq %rdx,%r11

	mulq %rbp
	addq %rax,%r13
	movq 8(%rsi,%r15,8),%rax
	adcq $0,%rdx
	addq %r10,%r13
	adcq $0,%rdx
	movq %r13,-8(%rsp,%r15,8)
	movq %rdx,%rdi

	mulq %rbx
	addq %rax,%r11
	movq 8(%rcx,%r15,8),%rax
	adcq $0,%rdx
	leaq 4(%r15),%r15
	movq %rdx,%r10

	mulq %rbp
	addq %rax,%rdi
	movq -16(%rsi,%r15,8),%rax
	adcq $0,%rdx
	addq %r11,%rdi
	adcq $0,%rdx
	movq %rdi,-32(%rsp,%r15,8)
	movq %rdx,%r13
	cmpq %r9,%r15
	jb L$1st4x

	# Tail of the first row: the last two words plus the carry word.
	mulq %rbx
	addq %rax,%r10
	movq -16(%rcx,%r15,8),%rax
	adcq $0,%rdx
	movq %rdx,%r11

	mulq %rbp
	addq %rax,%r13
	movq -8(%rsi,%r15,8),%rax
	adcq $0,%rdx
	addq %r10,%r13
	adcq $0,%rdx
	movq %r13,-24(%rsp,%r15,8)
	movq %rdx,%rdi

	mulq %rbx
	addq %rax,%r11
	movq -8(%rcx,%r15,8),%rax
	adcq $0,%rdx
	movq %rdx,%r10

	mulq %rbp
	addq %rax,%rdi
	movq (%rsi),%rax		# reload rsi[0] for the next row
	adcq $0,%rdx
	addq %r11,%rdi
	adcq $0,%rdx
	movq %rdi,-16(%rsp,%r15,8)
	movq %rdx,%r13

	xorq %rdi,%rdi
	addq %r10,%r13
	adcq $0,%rdi
	movq %r13,-8(%rsp,%r15,8)
	movq %rdi,(%rsp,%r15,8)

	leaq 1(%r14),%r14
# Remaining rows: as the first row, additionally adding the previous
# contents of the temporary vector at each step.
.p2align 2
L$outer4x:
	movq (%r12,%r14,8),%rbx
	xorq %r15,%r15
	movq (%rsp),%r10
	movq %r8,%rbp
	mulq %rbx
	addq %rax,%r10
	movq (%rcx),%rax
	adcq $0,%rdx

	imulq %r10,%rbp
	movq %rdx,%r11

	mulq %rbp
	addq %rax,%r10
	movq 8(%rsi),%rax
	adcq $0,%rdx
	movq %rdx,%rdi

	mulq %rbx
	addq %rax,%r11
	movq 8(%rcx),%rax
	adcq $0,%rdx
	addq 8(%rsp),%r11
	adcq $0,%rdx
	movq %rdx,%r10

	mulq %rbp
	addq %rax,%rdi
	movq 16(%rsi),%rax
	adcq $0,%rdx
	addq %r11,%rdi
	leaq 4(%r15),%r15
	adcq $0,%rdx
	movq %rdi,(%rsp)
	movq %rdx,%r13
	jmp L$inner4x
.p2align 4
L$inner4x:
	mulq %rbx
	addq %rax,%r10
	movq -16(%rcx,%r15,8),%rax
	adcq $0,%rdx
	addq -16(%rsp,%r15,8),%r10
	adcq $0,%rdx
	movq %rdx,%r11

	mulq %rbp
	addq %rax,%r13
	movq -8(%rsi,%r15,8),%rax
	adcq $0,%rdx
	addq %r10,%r13
	adcq $0,%rdx
	movq %r13,-24(%rsp,%r15,8)
	movq %rdx,%rdi

	mulq %rbx
	addq %rax,%r11
	movq -8(%rcx,%r15,8),%rax
	adcq $0,%rdx
	addq -8(%rsp,%r15,8),%r11
	adcq $0,%rdx
	movq %rdx,%r10

	mulq %rbp
	addq %rax,%rdi
	movq (%rsi,%r15,8),%rax
	adcq $0,%rdx
	addq %r11,%rdi
	adcq $0,%rdx
	movq %rdi,-16(%rsp,%r15,8)
	movq %rdx,%r13

	mulq %rbx
	addq %rax,%r10
	movq (%rcx,%r15,8),%rax
	adcq $0,%rdx
	addq (%rsp,%r15,8),%r10
	adcq $0,%rdx
	movq %rdx,%r11

	mulq %rbp
	addq %rax,%r13
	movq 8(%rsi,%r15,8),%rax
	adcq $0,%rdx
	addq %r10,%r13
	adcq $0,%rdx
	movq %r13,-8(%rsp,%r15,8)
	movq %rdx,%rdi

	mulq %rbx
	addq %rax,%r11
	movq 8(%rcx,%r15,8),%rax
	adcq $0,%rdx
	addq 8(%rsp,%r15,8),%r11
	adcq $0,%rdx
	leaq 4(%r15),%r15
	movq %rdx,%r10

	mulq %rbp
	addq %rax,%rdi
	movq -16(%rsi,%r15,8),%rax
	adcq $0,%rdx
	addq %r11,%rdi
	adcq $0,%rdx
	movq %rdi,-32(%rsp,%r15,8)
	movq %rdx,%r13
	cmpq %r9,%r15
	jb L$inner4x

	# Tail of the row.
	mulq %rbx
	addq %rax,%r10
	movq -16(%rcx,%r15,8),%rax
	adcq $0,%rdx
	addq -16(%rsp,%r15,8),%r10
	adcq $0,%rdx
	movq %rdx,%r11

	mulq %rbp
	addq %rax,%r13
	movq -8(%rsi,%r15,8),%rax
	adcq $0,%rdx
	addq %r10,%r13
	adcq $0,%rdx
	movq %r13,-24(%rsp,%r15,8)
	movq %rdx,%rdi

	mulq %rbx
	addq %rax,%r11
	movq -8(%rcx,%r15,8),%rax
	adcq $0,%rdx
	addq -8(%rsp,%r15,8),%r11
	adcq $0,%rdx
	leaq 1(%r14),%r14		# advance the outer index
	movq %rdx,%r10

	mulq %rbp
	addq %rax,%rdi
	movq (%rsi),%rax
	adcq $0,%rdx
	addq %r11,%rdi
	adcq $0,%rdx
	movq %rdi,-16(%rsp,%r15,8)
	movq %rdx,%r13

	xorq %rdi,%rdi
	addq %r10,%r13
	adcq $0,%rdi
	addq (%rsp,%r9,8),%r13		# add the carry word of the previous row
	adcq $0,%rdi
	movq %r13,-8(%rsp,%r15,8)
	movq %rdi,(%rsp,%r15,8)

	cmpq %r9,%r14
	jb L$outer4x

	# Subtract the rcx operand from the temporary vector into rdi, four
	# words per iteration. The subq starts the borrow chain; leaq and decq
	# leave CF untouched inside the loop.
	movq 16(%rsp,%r9,8),%rdi	# restore destination pointer
	leaq -4(%r9),%r15
	movq 0(%rsp),%rax
	movq 8(%rsp),%rdx
	shrq $2,%r15			# loop count = (r9 - 4) / 4
	leaq (%rsp),%rsi
	xorq %r14,%r14

	subq 0(%rcx),%rax
	movq 16(%rsi),%rbx
	movq 24(%rsi),%rbp
	sbbq 8(%rcx),%rdx

L$sub4x:
	movq %rax,0(%rdi,%r14,8)
	movq %rdx,8(%rdi,%r14,8)
	sbbq 16(%rcx,%r14,8),%rbx
	movq 32(%rsi,%r14,8),%rax
	movq 40(%rsi,%r14,8),%rdx
	sbbq 24(%rcx,%r14,8),%rbp
	movq %rbx,16(%rdi,%r14,8)
	movq %rbp,24(%rdi,%r14,8)
	sbbq 32(%rcx,%r14,8),%rax
	movq 48(%rsi,%r14,8),%rbx
	movq 56(%rsi,%r14,8),%rbp
	sbbq 40(%rcx,%r14,8),%rdx
	leaq 4(%r14),%r14
	decq %r15
	jnz L$sub4x

	movq %rax,0(%rdi,%r14,8)
	movq 32(%rsi,%r14,8),%rax	# top (carry) word of the temporary vector
	sbbq 16(%rcx,%r14,8),%rbx
	movq %rdx,8(%rdi,%r14,8)
	sbbq 24(%rcx,%r14,8),%rbp
	movq %rbx,16(%rdi,%r14,8)

	sbbq $0,%rax			# rax = top word minus final borrow
	movq %rbp,24(%rdi,%r14,8)
	pxor %xmm0,%xmm0		# xmm0 = 0, used to wipe the stack copy
.byte 102,72,15,110,224			# encodes movq %rax,%xmm4
	pcmpeqd %xmm5,%xmm5		# xmm5 = all ones
	pshufd $0,%xmm4,%xmm4		# broadcast low dword of rax
	movq %r9,%r15
	pxor %xmm4,%xmm5		# xmm5 = ~xmm4
	shrq $2,%r15			# 32 bytes (4 words) per iteration
	xorl %eax,%eax			# rax = byte offset

	jmp L$copy4x
# Branch-free select: rdi = (stack & xmm4) | (rdi & xmm5), and the stack
# copy is overwritten with zeros.
.p2align 4
L$copy4x:
	movdqa (%rsp,%rax,1),%xmm1
	movdqu (%rdi,%rax,1),%xmm2
	pand %xmm4,%xmm1
	pand %xmm5,%xmm2
	movdqa 16(%rsp,%rax,1),%xmm3
	movdqa %xmm0,(%rsp,%rax,1)
	por %xmm2,%xmm1
	movdqu 16(%rdi,%rax,1),%xmm2
	movdqu %xmm1,(%rdi,%rax,1)
	pand %xmm4,%xmm3
	pand %xmm5,%xmm2
	movdqa %xmm0,16(%rsp,%rax,1)
	por %xmm2,%xmm3
	movdqu %xmm3,16(%rdi,%rax,1)
	leaq 32(%rax),%rax
	decq %r15
	jnz L$copy4x
	movq 8(%rsp,%r9,8),%rsi		# rsi = saved entry rsp

	movq $1,%rax			# return value
	movq -48(%rsi),%r15
	movq -40(%rsi),%r14
	movq -32(%rsi),%r13
	movq -24(%rsi),%r12
	movq -16(%rsi),%rbp
	movq -8(%rsi),%rbx
	leaq (%rsi),%rsp

L$mul4x_epilogue:
	.byte 0xf3,0xc3			# encodes rep ret






# Squaring path, entered at L$sqr8x_enter when rdx == rsi and r9 is a
# multiple of 8. The arithmetic itself is done by one of two helper routines
# that are defined outside the visible source; their register contracts are
# not shown here, so the notes after each call only restate what this code
# reads.
.p2align 5
bn_sqr8x_mont:

	movq %rsp,%rax

L$sqr8x_enter:
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15

L$sqr8x_prologue:

	movl %r9d,%r10d
	shll $3,%r9d			# r9 = byte length (words * 8)
	shlq $3+2,%r10			# r10 = words * 32
	negq %r9			# r9 = -byte length

	# Choose the frame base rbp. r11 is the distance, modulo 4096, between
	# the tentative frame and the rsi operand; the frame is shifted by an
	# amount derived from it (presumably to keep the frame and the input
	# from aliasing modulo the page size, TODO confirm).
	leaq -64(%rsp,%r9,2),%r11
	movq %rsp,%rbp
	movq (%r8),%r8			# r8 = constant loaded from (r8)
	subq %rsi,%r11
	andq $4095,%r11
	cmpq %r11,%r10
	jb L$sqr8x_sp_alt		# taken when r10 < r11 (unsigned)
	subq %r11,%rbp
	leaq -64(%rbp,%r9,2),%rbp
	jmp L$sqr8x_sp_done

.p2align 5
L$sqr8x_sp_alt:
	leaq 4096-64(,%r9,2),%r10
	leaq -64(%rbp,%r9,2),%rbp
	subq %r10,%r11
	movq $0,%r10
	cmovcq %r10,%r11		# clamp to zero if the subtraction borrowed
	subq %r11,%rbp
L$sqr8x_sp_done:
	andq $-64,%rbp			# 64-byte align the frame base
	# Page walk down to rbp, as on the other paths.
	movq %rsp,%r11
	subq %rbp,%r11
	andq $-4096,%r11
	leaq (%r11,%rbp,1),%rsp
	movq (%rsp),%r10
	cmpq %rbp,%rsp
	ja L$sqr8x_page_walk
	jmp L$sqr8x_page_walk_done

.p2align 4
L$sqr8x_page_walk:
	leaq -4096(%rsp),%rsp
	movq (%rsp),%r10
	cmpq %rbp,%rsp
	ja L$sqr8x_page_walk
L$sqr8x_page_walk_done:

	movq %r9,%r10			# r10 = -byte length
	negq %r9			# r9 = byte length

	movq %r8,32(%rsp)		# frame slot: constant loaded from (r8)
	movq %rax,40(%rsp)		# frame slot: entry rsp

L$sqr8x_body:

.byte 102,72,15,110,209			# encodes movq %rcx,%xmm2
	pxor %xmm0,%xmm0
.byte 102,72,15,110,207			# encodes movq %rdi,%xmm1
.byte 102,73,15,110,218			# encodes movq %r10,%xmm3
	movl _GFp_ia32cap_P+8(%rip),%eax
	andl $0x80100,%eax
	cmpl $0x80100,%eax
	jne L$sqr8x_nox			# capability bits not both set

	call _GFp_bn_sqrx8x_internal

	# After this helper the code reads r8, rcx, rbp and rax as helper
	# outputs (NOTE(review): contract not visible here, confirm in the
	# helper source).
	leaq (%r8,%rcx,1),%rbx
	movq %rcx,%r9
	movq %rcx,%rdx
.byte 102,72,15,126,207			# encodes movq %xmm1,%rdi (restore destination)
	sarq $3+2,%rcx			# loop counter in units of 32 bytes
	jmp L$sqr8x_sub

.p2align 5
L$sqr8x_nox:
	call _GFp_bn_sqr8x_internal

	# After this helper the code reads rdi, r9, rbp and rax as helper
	# outputs (NOTE(review): contract not visible here, confirm in the
	# helper source).
	leaq (%rdi,%r9,1),%rbx
	movq %r9,%rcx
	movq %r9,%rdx
.byte 102,72,15,126,207			# encodes movq %xmm1,%rdi (restore destination)
	sarq $3+2,%rcx			# loop counter in units of 32 bytes
	jmp L$sqr8x_sub

# Subtract the words at rbp from the words at rbx into rdi, four words per
# iteration. rcx counts up to zero; incq and leaq leave CF untouched, so the
# borrow chain spans iterations. CF on entry is whatever the preceding sarq
# left.
.p2align 5
L$sqr8x_sub:
	movq 0(%rbx),%r12
	movq 8(%rbx),%r13
	movq 16(%rbx),%r14
	movq 24(%rbx),%r15
	leaq 32(%rbx),%rbx
	sbbq 0(%rbp),%r12
	sbbq 8(%rbp),%r13
	sbbq 16(%rbp),%r14
	sbbq 24(%rbp),%r15
	leaq 32(%rbp),%rbp
	movq %r12,0(%rdi)
	movq %r13,8(%rdi)
	movq %r14,16(%rdi)
	movq %r15,24(%rdi)
	leaq 32(%rdi),%rdi
	incq %rcx
	jnz L$sqr8x_sub

	sbbq $0,%rax			# fold the final borrow into rax (select mask)
	leaq (%rbx,%r9,1),%rbx		# rewind source; r9 is negative here
	leaq (%rdi,%r9,1),%rdi		# rewind destination

.byte 102,72,15,110,200			# encodes movq %rax,%xmm1
	pxor %xmm0,%xmm0
	pshufd $0,%xmm1,%xmm1		# broadcast low dword of the mask
	movq 40(%rsp),%rsi		# rsi = saved entry rsp

	jmp L$sqr8x_cond_copy

# Branch-free select: rdi = (stack & xmm1) | (rdi & mask), where mask is all
# ones in the lanes where xmm1 is zero. Two 32-byte stack regions are zeroed
# per iteration, one at rbx and one at rbx+rdx.
.p2align 5
L$sqr8x_cond_copy:
	movdqa 0(%rbx),%xmm2
	movdqa 16(%rbx),%xmm3
	leaq 32(%rbx),%rbx
	movdqu 0(%rdi),%xmm4
	movdqu 16(%rdi),%xmm5
	leaq 32(%rdi),%rdi
	movdqa %xmm0,-32(%rbx)
	movdqa %xmm0,-16(%rbx)
	movdqa %xmm0,-32(%rbx,%rdx,1)
	movdqa %xmm0,-16(%rbx,%rdx,1)
	pcmpeqd %xmm1,%xmm0		# xmm0 = lanes where xmm1 == 0
	pand %xmm1,%xmm2
	pand %xmm1,%xmm3
	pand %xmm0,%xmm4
	pand %xmm0,%xmm5
	pxor %xmm0,%xmm0		# back to zero for the next iteration
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	movdqu %xmm4,-32(%rdi)
	movdqu %xmm5,-16(%rdi)
	addq $32,%r9			# r9 counts up from -length to zero
	jnz L$sqr8x_cond_copy

	movq $1,%rax			# return value
	movq -48(%rsi),%r15
	movq -40(%rsi),%r14
	movq -32(%rsi),%r13
	movq -24(%rsi),%r12
	movq -16(%rsi),%rbp
	movq -8(%rsi),%rbx
	leaq (%rsi),%rsp

L$sqr8x_epilogue:
	.byte 0xf3,0xc3			# encodes rep ret



# Path using mulx with two independent carry chains (adcx uses CF, adox
# uses OF). Entered at L$mulx4x_enter from L$mul4x_enter.
#
# Frame slots written below:
#    0(%rsp)  byte length (r9 * 8)
#    8(%rsp)  pointer to the next word of the rdx operand
#   16(%rsp)  end pointer of the rdx operand (rdx + byte length)
#   24(%rsp)  constant loaded from (r8)
#   32(%rsp)  destination pointer (rdi)
#   40(%rsp)  entry rsp
#   48(%rsp)  inner loop count, (byte length / 32) - 1
#   64(%rsp)  start of the temporary vector
.p2align 5
bn_mulx4x_mont:

	movq %rsp,%rax

L$mulx4x_enter:
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15

L$mulx4x_prologue:

	shll $3,%r9d			# r9 = byte length
	xorq %r10,%r10
	subq %r9,%r10			# r10 = -byte length
	movq (%r8),%r8
	leaq -72(%rsp,%r10,1),%rbp
	andq $-128,%rbp			# 128-byte align the frame base
	# Page walk down to rbp, as on the other paths.
	movq %rsp,%r11
	subq %rbp,%r11
	andq $-4096,%r11
	leaq (%r11,%rbp,1),%rsp
	movq (%rsp),%r10
	cmpq %rbp,%rsp
	ja L$mulx4x_page_walk
	jmp L$mulx4x_page_walk_done

.p2align 4
L$mulx4x_page_walk:
	leaq -4096(%rsp),%rsp
	movq (%rsp),%r10
	cmpq %rbp,%rsp
	ja L$mulx4x_page_walk
L$mulx4x_page_walk_done:

	leaq (%rdx,%r9,1),%r10		# end of the rdx operand

	movq %r9,0(%rsp)
	shrq $5,%r9
	movq %r10,16(%rsp)
	subq $1,%r9
	movq %r8,24(%rsp)
	movq %rdi,32(%rsp)
	movq %rax,40(%rsp)

	movq %r9,48(%rsp)
	jmp L$mulx4x_body

.p2align 5
L$mulx4x_body:
	leaq 8(%rdx),%rdi		# rdi = pointer to the next multiplier word
	movq (%rdx),%rdx		# rdx = first multiplier word (implicit mulx source)
	leaq 64+32(%rsp),%rbx		# rbx = write cursor into the temporary vector
	movq %rdx,%r9			# keep a copy of the multiplier word

	mulxq 0(%rsi),%r8,%rax
	mulxq 8(%rsi),%r11,%r14
	addq %rax,%r11
	movq %rdi,8(%rsp)
	mulxq 16(%rsi),%r12,%r13
	adcq %r14,%r12
	adcq $0,%r13

	movq %r8,%rdi
	imulq 24(%rsp),%r8		# r8 = low word * constant (per-row multiplier)
	xorq %rbp,%rbp			# rbp = 0; also clears CF and OF

	mulxq 24(%rsi),%rax,%r14
	movq %r8,%rdx			# switch mulx source to the per-row multiplier
	leaq 32(%rsi),%rsi
	adcxq %rax,%r13
	adcxq %rbp,%r14

	mulxq 0(%rcx),%rax,%r10
	adcxq %rax,%rdi
	adoxq %r11,%r10
	mulxq 8(%rcx),%rax,%r11
	adcxq %rax,%r10
	adoxq %r12,%r11
.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# encodes mulxq 16(%rcx),%rax,%r12 with a 32-bit displacement
	movq 48(%rsp),%rdi		# rdi = inner loop count
	movq %r10,-32(%rbx)
	adcxq %rax,%r11
	adoxq %r13,%r12
	mulxq 24(%rcx),%rax,%r15
	movq %r9,%rdx			# back to the multiplier word
	movq %r11,-24(%rbx)
	adcxq %rax,%r12
	adoxq %rbp,%r15
	leaq 32(%rcx),%rcx
	movq %r12,-16(%rbx)

	jmp L$mulx4x_1st

# First row, four words per iteration.
.p2align 5
L$mulx4x_1st:
	adcxq %rbp,%r15
	mulxq 0(%rsi),%r10,%rax
	adcxq %r14,%r10
	mulxq 8(%rsi),%r11,%r14
	adcxq %rax,%r11
	mulxq 16(%rsi),%r12,%rax
	adcxq %r14,%r12
	mulxq 24(%rsi),%r13,%r14
.byte 0x67,0x67				# two address-size prefix bytes ahead of the next instruction
	movq %r8,%rdx
	adcxq %rax,%r13
	adcxq %rbp,%r14
	leaq 32(%rsi),%rsi
	leaq 32(%rbx),%rbx

	adoxq %r15,%r10
	mulxq 0(%rcx),%rax,%r15
	adcxq %rax,%r10
	adoxq %r15,%r11
	mulxq 8(%rcx),%rax,%r15
	adcxq %rax,%r11
	adoxq %r15,%r12
	mulxq 16(%rcx),%rax,%r15
	movq %r10,-40(%rbx)
	adcxq %rax,%r12
	movq %r11,-32(%rbx)
	adoxq %r15,%r13
	mulxq 24(%rcx),%rax,%r15
	movq %r9,%rdx
	movq %r12,-24(%rbx)
	adcxq %rax,%r13
	adoxq %rbp,%r15
	leaq 32(%rcx),%rcx
	movq %r13,-16(%rbx)

	decq %rdi			# decq leaves CF untouched
	jnz L$mulx4x_1st

	movq 0(%rsp),%rax		# rax = byte length
	movq 8(%rsp),%rdi		# rdi = pointer to the next multiplier word
	adcq %rbp,%r15			# rbp is zero: fold CF into r15
	addq %r15,%r14
	sbbq %r15,%r15			# r15 = 0 or -1 from the carry out
	movq %r14,-8(%rbx)
	jmp L$mulx4x_outer

# Remaining rows: as the first row, additionally adding the previous
# contents of the temporary vector.
.p2align 5
L$mulx4x_outer:
	movq (%rdi),%rdx		# next multiplier word
	leaq 8(%rdi),%rdi
	subq %rax,%rsi			# rewind rsi by the byte length
	movq %r15,(%rbx)		# store the top word of the previous row
	leaq 64+32(%rsp),%rbx
	subq %rax,%rcx			# rewind rcx by the byte length

	mulxq 0(%rsi),%r8,%r11
	xorl %ebp,%ebp			# rbp = 0; also clears CF and OF
	movq %rdx,%r9
	mulxq 8(%rsi),%r14,%r12
	adoxq -32(%rbx),%r8
	adcxq %r14,%r11
	mulxq 16(%rsi),%r15,%r13
	adoxq -24(%rbx),%r11
	adcxq %r15,%r12
	adoxq -16(%rbx),%r12
	adcxq %rbp,%r13
	adoxq %rbp,%r13

	movq %rdi,8(%rsp)
	movq %r8,%r15
	imulq 24(%rsp),%r8		# per-row multiplier
	xorl %ebp,%ebp			# restart both carry chains

	mulxq 24(%rsi),%rax,%r14
	movq %r8,%rdx
	adcxq %rax,%r13
	adoxq -8(%rbx),%r13
	adcxq %rbp,%r14
	leaq 32(%rsi),%rsi
	adoxq %rbp,%r14

	mulxq 0(%rcx),%rax,%r10
	adcxq %rax,%r15
	adoxq %r11,%r10
	mulxq 8(%rcx),%rax,%r11
	adcxq %rax,%r10
	adoxq %r12,%r11
	mulxq 16(%rcx),%rax,%r12
	movq %r10,-32(%rbx)
	adcxq %rax,%r11
	adoxq %r13,%r12
	mulxq 24(%rcx),%rax,%r15
	movq %r9,%rdx
	movq %r11,-24(%rbx)
	leaq 32(%rcx),%rcx
	adcxq %rax,%r12
	adoxq %rbp,%r15
	movq 48(%rsp),%rdi		# rdi = inner loop count
	movq %r12,-16(%rbx)

	jmp L$mulx4x_inner

.p2align 5
L$mulx4x_inner:
	mulxq 0(%rsi),%r10,%rax
	adcxq %rbp,%r15
	adoxq %r14,%r10
	mulxq 8(%rsi),%r11,%r14
	adcxq 0(%rbx),%r10
	adoxq %rax,%r11
	mulxq 16(%rsi),%r12,%rax
	adcxq 8(%rbx),%r11
	adoxq %r14,%r12
	mulxq 24(%rsi),%r13,%r14
	movq %r8,%rdx
	adcxq 16(%rbx),%r12
	adoxq %rax,%r13
	adcxq 24(%rbx),%r13
	adoxq %rbp,%r14
	leaq 32(%rsi),%rsi
	leaq 32(%rbx),%rbx
	adcxq %rbp,%r14

	adoxq %r15,%r10
	mulxq 0(%rcx),%rax,%r15
	adcxq %rax,%r10
	adoxq %r15,%r11
	mulxq 8(%rcx),%rax,%r15
	adcxq %rax,%r11
	adoxq %r15,%r12
	mulxq 16(%rcx),%rax,%r15
	movq %r10,-40(%rbx)
	adcxq %rax,%r12
	adoxq %r15,%r13
	mulxq 24(%rcx),%rax,%r15
	movq %r9,%rdx
	movq %r11,-32(%rbx)
	movq %r12,-24(%rbx)
	adcxq %rax,%r13
	adoxq %rbp,%r15
	leaq 32(%rcx),%rcx
	movq %r13,-16(%rbx)

	decq %rdi
	jnz L$mulx4x_inner

	movq 0(%rsp),%rax		# rax = byte length
	movq 8(%rsp),%rdi		# rdi = pointer to the next multiplier word
	adcq %rbp,%r15
	subq 0(%rbx),%rbp		# rbp is zero, so CF is set when the saved top word is non-zero
	adcq %r15,%r14
	sbbq %r15,%r15			# r15 = 0 or -1 from the carry out
	movq %r14,-8(%rbx)

	cmpq 16(%rsp),%rdi
	jne L$mulx4x_outer		# more multiplier words remain

	leaq 64(%rsp),%rbx		# rbx = start of the temporary vector
	subq %rax,%rcx			# rewind rcx by the byte length
	negq %r15			# r15 = 0 or 1
	movq %rax,%rdx			# rdx = byte length, used by the copy loop
	shrq $3+2,%rax			# loop count in units of 32 bytes; CF = bit 4 of the length
	movq 32(%rsp),%rdi		# rdi = destination pointer
	jmp L$mulx4x_sub

# Subtract the rcx operand from the temporary vector into rdi, four words
# per iteration. decq and leaq leave CF untouched, so the borrow chain spans
# iterations.
.p2align 5
L$mulx4x_sub:
	movq 0(%rbx),%r11
	movq 8(%rbx),%r12
	movq 16(%rbx),%r13
	movq 24(%rbx),%r14
	leaq 32(%rbx),%rbx
	sbbq 0(%rcx),%r11
	sbbq 8(%rcx),%r12
	sbbq 16(%rcx),%r13
	sbbq 24(%rcx),%r14
	leaq 32(%rcx),%rcx
	movq %r11,0(%rdi)
	movq %r12,8(%rdi)
	movq %r13,16(%rdi)
	movq %r14,24(%rdi)
	leaq 32(%rdi),%rdi
	decq %rax
	jnz L$mulx4x_sub

	sbbq $0,%r15			# r15 = top word minus final borrow (select mask)
	leaq 64(%rsp),%rbx
	subq %rdx,%rdi			# rewind destination by the byte length

.byte 102,73,15,110,207			# encodes movq %r15,%xmm1
	pxor %xmm0,%xmm0
	pshufd $0,%xmm1,%xmm1		# broadcast low dword of the mask
	movq 40(%rsp),%rsi		# rsi = saved entry rsp

	jmp L$mulx4x_cond_copy

# Branch-free select: rdi = (stack & xmm1) | (rdi & mask), where mask is all
# ones in the lanes where xmm1 is zero. The stack copy is zeroed as it is
# read.
.p2align 5
L$mulx4x_cond_copy:
	movdqa 0(%rbx),%xmm2
	movdqa 16(%rbx),%xmm3
	leaq 32(%rbx),%rbx
	movdqu 0(%rdi),%xmm4
	movdqu 16(%rdi),%xmm5
	leaq 32(%rdi),%rdi
	movdqa %xmm0,-32(%rbx)
	movdqa %xmm0,-16(%rbx)
	pcmpeqd %xmm1,%xmm0		# xmm0 = lanes where xmm1 == 0
	pand %xmm1,%xmm2
	pand %xmm1,%xmm3
	pand %xmm0,%xmm4
	pand %xmm0,%xmm5
	pxor %xmm0,%xmm0		# back to zero for the next iteration
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	movdqu %xmm4,-32(%rdi)
	movdqu %xmm5,-16(%rdi)
	subq $32,%rdx
	jnz L$mulx4x_cond_copy

	movq %rdx,(%rbx)		# rdx is zero here: clear the word after the wiped area

	movq $1,%rax			# return value
	movq -48(%rsi),%r15
	movq -40(%rsi),%r14
	movq -32(%rsi),%r13
	movq -24(%rsi),%r12
	movq -16(%rsi),%rbp
	movq -8(%rsi),%rbx
	leaq (%rsi),%rsp

L$mulx4x_epilogue:
	.byte 0xf3,0xc3			# encodes rep ret


# NUL-terminated ASCII text:
# Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 4
#endif