# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
#
# Contents (x86-64, AT&T operand order, Mach-O style leading underscores):
#   _bn_mul_mont     exported entry point; dispatches on the word count
#   bn_mul4x_mont    four-way unrolled multiply, entered at L$mul4x_enter
#   bn_sqr8x_mont    squaring wrapper, entered at L$sqr8x_enter
#   bn_mulx4x_mont   mulx/adcx/adox multiply, entered at L$mulx4x_enter
# The identification string at the end of the file decodes to
#   text: "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
#
# Register use on entry, as read by the code below:
#   rdi   destination words (written by the final subtract and select loops)
#   rsi   first multiplicand words
#   rdx   second multiplicand words
#   rcx   modulus words (subtracted from the intermediate result at the end)
#   r8    pointer to one 64-bit word that is loaded once and multiplied into
#         every row; presumably the Montgomery n0 constant - TODO confirm
#   r9d   word count, zero-extended to 64 bits by the entry code
# Every path returns 1 in rax and reloads rbx, rbp, r12-r15 and rsp from the
# values saved at entry.
# NOTE(review): the matching C prototype is not visible in this file; confirm
# the argument order against the declaring header.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.globl	_bn_mul_mont
.private_extern _bn_mul_mont

.p2align	4
_bn_mul_mont:

# Zero-extend the word count and keep the entry stack pointer in rax; each
# path stores it in its frame and uses it to unwind.
	movl	%r9d,%r9d
	movq	%rsp,%rax

# Dispatch on the word count:
#   count not a multiple of 4, or count below 8   -> L$mul_enter (generic loop)
#   rdx equal to rsi and count a multiple of 8    -> L$sqr8x_enter
#   anything left                                 -> L$mul4x_enter
# The 32-bit word at _OPENSSL_ia32cap_P+8 travels in r11d to L$mul4x_enter.
	testl	$3,%r9d
	jnz	L$mul_enter
	cmpl	$8,%r9d
	jb	L$mul_enter
	leaq	_OPENSSL_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d
	cmpq	%rsi,%rdx
	jne	L$mul4x_enter
	testl	$7,%r9d
	jz	L$sqr8x_enter
	jmp	L$mul4x_enter

.p2align	4
L$mul_enter:
# Save the callee-saved registers. They sit just below the entry stack
# pointer: rbx at -8 down to r15 at -48.
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

# Compute the frame bottom in r10: room for count+2 words below the pushes,
# rounded down to a multiple of 1024.
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

# Move rsp to the frame bottom in steps of at most 4096 bytes, loading one
# word from each new stack top so that every page is touched on the way down.
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul_page_walk
	jmp	L$mul_page_walk_done

.p2align	4
L$mul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul_page_walk
L$mul_page_walk_done:

# Frame: words 0..count-1 at (%rsp) hold the running result, word count holds
# the top carry, word count+1 holds the entry stack pointer.
	movq	%rax,8(%rsp,%r9,8)

L$mul_body:
# First row. r12 = second multiplicand pointer, r8 = loaded constant,
# rbx = current multiplier word, r14 = row index, r15 = column index,
# rbp = low product word times the constant (the per-row reduction factor).
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$1st_enter

.p2align	4
L$1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	L$1st

# Tail of the first row: fold the last partial sums and store the two top
# words of the frame.
	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	L$outer
.p2align	4
L$outer:
# Remaining rows: same shape as the first row, but each column also adds the
# word already stored in the frame.
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$inner_enter

.p2align	4
L$inner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$inner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	L$inner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	L$outer

# Subtract the modulus from the frame words into the destination. The xorq
# clears CF before the first sbbq; leaq and decq leave CF alone, so the borrow
# chains across iterations.
	xorq	%r14,%r14
	movq	(%rsp),%rax
	movq	%r9,%r15

.p2align	4
L$sub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsp,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	L$sub

# rax = top carry word minus the final borrow; rbx = bitwise complement of rax.
# The loop below writes (destination AND rbx) OR (frame AND rax) without
# branching on the data, and overwrites each frame word with the word count.
	sbbq	$0,%rax
	movq	$-1,%rbx
	xorq	%rax,%rbx
	xorq	%r14,%r14
	movq	%r9,%r15

L$copy:
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r9,(%rsp,%r14,8)
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	L$copy

# Unwind: rsi = entry stack pointer, return value 1, reload saved registers.
	movq	8(%rsp,%r9,8),%rsi

	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp

L$mul_epilogue:
# Raw bytes f3 c3 encode a rep-prefixed ret.
	.byte	0xf3,0xc3


# bn_mul4x_mont: four columns per loop iteration. Within this file it is
# reached through L$mul4x_enter with rax = entry stack pointer and r11d = the
# capability word loaded by _bn_mul_mont.
.p2align	4
bn_mul4x_mont:

	movl	%r9d,%r9d
	movq	%rsp,%rax

L$mul4x_enter:
# When both bits of mask 0x80100 are set in the capability word, use the
# mulx/adcx/adox path. NOTE(review): presumably these are the BMI2 and ADX
# feature bits - confirm against the OPENSSL_ia32cap_P layout.
	andl	$0x80100,%r11d
	cmpl	$0x80100,%r11d
	je	L$mulx4x_enter
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

# Frame bottom in r10: room for count+4 words, rounded down to 1024 bytes,
# then the same page-by-page stack walk as the generic path.
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul4x_page_walk
	jmp	L$mul4x_page_walk_done

L$mul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul4x_page_walk
L$mul4x_page_walk_done:

# Frame: word count+1 holds the entry stack pointer, word count+2 holds the
# destination pointer (rdi is reused as an accumulator below).
	movq	%rax,8(%rsp,%r9,8)

L$mul4x_body:
	movq	%rdi,16(%rsp,%r9,8)
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	L$1st4x
.p2align	4
L$1st4x:
# First row, four columns per iteration; r15 advances by 4 until it reaches
# the word count.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	L$1st4x

# Tail of the first row (last two columns and the top carry words).
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	leaq	1(%r14),%r14
.p2align	2
L$outer4x:
# Remaining rows: as above, with the stored frame word added in every column.
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	L$inner4x
.p2align	4
L$inner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	L$inner4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	L$outer4x

# Subtract the modulus from the frame into the destination, four words per
# iteration. rdi = saved destination, rsi = frame base, r15 = (count-4)/4.
# The subq starts the borrow chain; leaq and decq do not modify CF.
	movq	16(%rsp,%r9,8),%rdi
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdx
	shrq	$2,%r15
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx

L$sub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	L$sub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

# rax = top carry word minus the final borrow. Broadcast its low dword into
# xmm4 and build the complement in xmm5; xmm0 stays zero for wiping the frame.
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	pxor	%xmm0,%xmm0
# Raw bytes: movq %rax,%xmm4
.byte	102,72,15,110,224
	pcmpeqd	%xmm5,%xmm5
	pshufd	$0,%xmm4,%xmm4
	movq	%r9,%r15
	pxor	%xmm4,%xmm5
	shrq	$2,%r15
	xorl	%eax,%eax

	jmp	L$copy4x
.p2align	4
L$copy4x:
# Branch-free select, 32 bytes per iteration:
# destination = (frame AND xmm4) OR (destination AND xmm5); frame zeroed.
	movdqa	(%rsp,%rax,1),%xmm1
	movdqu	(%rdi,%rax,1),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax,1),%xmm3
	movdqa	%xmm0,(%rsp,%rax,1)
	por	%xmm2,%xmm1
	movdqu	16(%rdi,%rax,1),%xmm2
	movdqu	%xmm1,(%rdi,%rax,1)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax,1)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16(%rdi,%rax,1)
	leaq	32(%rax),%rax
	decq	%r15
	jnz	L$copy4x
	movq	8(%rsp,%r9,8),%rsi

	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp

L$mul4x_epilogue:
# Raw bytes f3 c3 encode a rep-prefixed ret.
	.byte	0xf3,0xc3


# bn_sqr8x_mont: squaring wrapper. Within this file it is reached through
# L$sqr8x_enter (rdx equal to rsi, count a multiple of 8) with rax = entry
# stack pointer. The arithmetic is done by _bn_sqrx8x_internal or
# _bn_sqr8x_internal, both defined outside this file.
.p2align	5
bn_sqr8x_mont:

	movq	%rsp,%rax

L$sqr8x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

L$sqr8x_prologue:

# r9 = -(count*8), r10 = count*32.
	movl	%r9d,%r10d
	shll	$3,%r9d
	shlq	$3+2,%r10
	negq	%r9

# Choose the frame position from the low 12 bits of the distance between the
# candidate frame and the input pointer in rsi.
# NOTE(review): presumably this avoids page-offset aliasing between the frame
# and the input - confirm against the generating Perl script.
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$sqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	L$sqr8x_sp_done

.p2align	5
L$sqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
L$sqr8x_sp_done:
# Align the frame to 64 bytes, then walk the stack down page by page.
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$sqr8x_page_walk
	jmp	L$sqr8x_page_walk_done

.p2align	4
L$sqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$sqr8x_page_walk
L$sqr8x_page_walk_done:

# r10 = -(count*8), r9 = count*8. Frame: 32(%rsp) = loaded constant,
# 40(%rsp) = entry stack pointer.
	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)

L$sqr8x_body:

# Park values in xmm registers across the call:
# xmm2 = rcx (modulus pointer), xmm1 = rdi (destination), xmm3 = r10.
# Raw bytes: movq %rcx,%xmm2
.byte	102,72,15,110,209
	pxor	%xmm0,%xmm0
# Raw bytes: movq %rdi,%xmm1
.byte	102,72,15,110,207
# Raw bytes: movq %r10,%xmm3
.byte	102,73,15,110,218
	leaq	_OPENSSL_ia32cap_P(%rip),%rax
	movl	8(%rax),%eax
	andl	$0x80100,%eax
	cmpl	$0x80100,%eax
	jne	L$sqr8x_nox

	call	_bn_sqrx8x_internal

# NOTE(review): the register state after the call (r8, rcx, rbp, rax here;
# rdi, r9, rbp, rax in the other branch) is defined by the callee, which is
# not visible in this file.
	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
# Raw bytes: movq %xmm1,%rdi
.byte	102,72,15,126,207
	sarq	$3+2,%rcx
	jmp	L$sqr8x_sub

.p2align	5
L$sqr8x_nox:
	call	_bn_sqr8x_internal

	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
# Raw bytes: movq %xmm1,%rdi
.byte	102,72,15,126,207
	sarq	$3+2,%rcx
	jmp	L$sqr8x_sub

.p2align	5
L$sqr8x_sub:
# Subtract four words at (%rbp) from four words at (%rbx) into the
# destination per iteration. rcx counts up to zero; leaq and incq leave CF
# alone, so the borrow chains across iterations.
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	L$sqr8x_sub

# Fold the final borrow into rax, rewind rbx and rdi by the byte count in r9,
# and broadcast the low dword of rax into xmm1 as the select mask.
	sbbq	$0,%rax
	leaq	(%rbx,%r9,1),%rbx
	leaq	(%rdi,%r9,1),%rdi

# Raw bytes: movq %rax,%xmm1
.byte	102,72,15,110,200
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	40(%rsp),%rsi

	jmp	L$sqr8x_cond_copy

.p2align	5
L$sqr8x_cond_copy:
# Branch-free select, 32 bytes per iteration. pcmpeqd against the zeroed xmm0
# yields the per-dword complement of the mask in xmm1. The words just read and
# the words rdx bytes away from them are overwritten with zero.
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	L$sqr8x_cond_copy

	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp

L$sqr8x_epilogue:
# Raw bytes f3 c3 encode a rep-prefixed ret.
	.byte	0xf3,0xc3


# bn_mulx4x_mont: multiply using mulx with two independent carry chains
# (adcx on CF, adox on OF). Within this file it is reached through
# L$mulx4x_enter with rax = entry stack pointer.
.p2align	5
bn_mulx4x_mont:

	movq	%rsp,%rax

L$mulx4x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

L$mulx4x_prologue:

# r9 = count*8, r10 = -(count*8). Frame bottom in rbp: 72 bytes plus count*8
# below the pushes, rounded down to 128 bytes, followed by the page walk.
	shll	$3,%r9d
	xorq	%r10,%r10
	subq	%r9,%r10
	movq	(%r8),%r8
	leaq	-72(%rsp,%r10,1),%rbp
	andq	$-128,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$mulx4x_page_walk
	jmp	L$mulx4x_page_walk_done

.p2align	4
L$mulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$mulx4x_page_walk
L$mulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10

# Frame layout relative to rsp:
#   slot +0    count*8
#   slot +8    pointer to the next multiplier word (stored in the body)
#   slot +16   end of the second multiplicand (rdx + count*8)
#   slot +24   loaded constant
#   slot +32   destination pointer
#   slot +40   entry stack pointer
#   slot +48   (count*8 >> 5) - 1, the inner loop trip count
#   slot +64   start of the result words
	movq	%r9,0(%rsp)
	shrq	$5,%r9
	movq	%r10,16(%rsp)
	subq	$1,%r9
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)

	movq	%r9,48(%rsp)
	jmp	L$mulx4x_body

.p2align	5
L$mulx4x_body:
# First row, first four columns. rdx is the implicit mulx source and is
# switched between the multiplier word (kept in r9) and the reduction factor
# (kept in r8).
	leaq	8(%rdx),%rdi
	movq	(%rdx),%rdx
	leaq	64+32(%rsp),%rbx
	movq	%rdx,%r9

	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r14
	addq	%rax,%r11
	movq	%rdi,8(%rsp)
	mulxq	16(%rsi),%r12,%r13
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi
	imulq	24(%rsp),%r8
	xorq	%rbp,%rbp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%rdi
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
# Raw bytes: mulxq 16(%rcx),%rax,%r12 (VEX encoding with a 32-bit displacement)
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
	movq	48(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	L$mulx4x_1st

.p2align	5
L$mulx4x_1st:
# First row, remaining columns, four per iteration; rdi counts down from the
# trip count in slot +48.
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
# Raw bytes: two 0x67 prefixes applied to the next instruction.
# NOTE(review): presumably code-size padding - confirm in the Perl source.
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	L$mulx4x_1st

# Close the first row: rax = count*8, rdi = next multiplier pointer, r15
# becomes 0 or -1 from the carry out of the top word.
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdi
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15
	movq	%r14,-8(%rbx)
	jmp	L$mulx4x_outer

.p2align	5
L$mulx4x_outer:
# Remaining rows. rsi and rcx are rewound by count*8, rbx is reset to the
# start of the result words, and r15 from the previous row is stored above
# the last result word.
	movq	(%rdi),%rdx
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi
	movq	%r15,(%rbx)
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)
	movq	%r8,%r15
	imulq	24(%rsp),%r8
	xorl	%ebp,%ebp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi
	movq	%r12,-16(%rbx)

	jmp	L$mulx4x_inner

.p2align	5
L$mulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	L$mulx4x_inner

# Close the row. rbp is zero here, so the subq sets CF exactly when the word
# stored at (%rbx) by the previous row is non-zero, feeding it into the adcq.
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdi
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp
	adcq	%r15,%r14
	sbbq	%r15,%r15
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi
	jne	L$mulx4x_outer

# Final subtraction setup: rbx = result words, rcx rewound to the start of the
# modulus, rdx = count*8, rax = count*8 >> 5 iterations, rdi = destination.
	leaq	64(%rsp),%rbx
	subq	%rax,%rcx
	negq	%r15
	movq	%rax,%rdx
	shrq	$3+2,%rax
	movq	32(%rsp),%rdi
	jmp	L$mulx4x_sub

.p2align	5
L$mulx4x_sub:
# Four words per iteration; leaq and decq leave CF alone, so the borrow
# chains across iterations.
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	L$mulx4x_sub

# Fold the final borrow into r15, rewind rbx and rdi, and broadcast the low
# dword of r15 into xmm1 as the select mask.
	sbbq	$0,%r15
	leaq	64(%rsp),%rbx
	subq	%rdx,%rdi

# Raw bytes: movq %r15,%xmm1
.byte	102,73,15,110,207
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	40(%rsp),%rsi

	jmp	L$mulx4x_cond_copy

.p2align	5
L$mulx4x_cond_copy:
# Branch-free select, 32 bytes per iteration, zeroing the frame words as they
# are consumed. pcmpeqd against the zeroed xmm0 yields the complement mask.
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	L$mulx4x_cond_copy

# rdx is zero at loop exit; this clears the word just above the result words.
	movq	%rdx,(%rbx)

	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp

L$mulx4x_epilogue:
# Raw bytes f3 c3 encode a rep-prefixed ret.
	.byte	0xf3,0xc3

# NUL-terminated identification string (see the top of the file).
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align	4
#endif