# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
#
# NOTE(review): this copy only restores the one-statement-per-line layout and
# adds comments. Instruction stream, labels, alignment and .byte encodings are
# unchanged; any functional change belongs in the generating script.
#
# Montgomery multiplication for x86_64 (see the identification string at the
# end of the file). AT&T operand order, C-preprocessed, underscore-prefixed
# global symbol with .private_extern visibility, L$ local labels.
#
# Names used in the comments below (documentation only):
#   rp = (%rdi)  destination vector, written by the final subtract/copy
#   ap = (%rsi)  first source vector
#   bp = (%rdx)  second source vector (one word consumed per outer row)
#   np = (%rcx)  vector multiplied by the per-row factor m and subtracted at
#                the end (the modulus, in Montgomery terms)
#   n0 = value loaded from (%r8); m = low64(tp[0]-candidate * n0)
#                NOTE(review): presumably -1/np[0] mod 2^64 - confirm at caller
#   num = %r9d   vector length in 64-bit words (only the low 32 bits are used)
#   tp           temporary vector kept in the stack frame
# All paths return with rax = 1.
#
# Entry points and the paths _bn_mul_mont dispatches to:
#   L$mul_enter     generic one-word-per-step loop
#   L$mul4x_enter   loop unrolled four words per iteration
#   L$sqr8x_enter   squaring via _bn_sqr8x_internal / _bn_sqrx8x_internal
#   L$mulx4x_enter  mulx / adcx / adox variant, reached from L$mul4x_enter
# The labels bn_mul4x_mont, bn_sqr8x_mont and bn_mulx4x_mont are not exported
# and nothing in this file references them; control always arrives through
# the L$..._enter labels after _bn_mul_mont has set rax (and r9, r11).

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#include "ring_core_generated/prefix_symbols_asm.h"
.text

.globl	_bn_mul_mont
.private_extern _bn_mul_mont

.p2align	4
_bn_mul_mont:

	movl	%r9d,%r9d		# zero-extend num (32-bit write clears bits 63:32)
	movq	%rsp,%rax		# rax = entry rsp, used by every epilogue

	testl	$3,%r9d			# num not a multiple of 4 -> generic loop
	jnz	L$mul_enter
	cmpl	$8,%r9d			# num < 8 -> generic loop
	jb	L$mul_enter
	movl	_OPENSSL_ia32cap_P+8(%rip),%r11d	# capability word tested at L$mul4x_enter
	cmpq	%rsi,%rdx		# ap != bp -> 4x multiply
	jne	L$mul4x_enter
	testl	$7,%r9d			# ap == bp and num multiple of 8 -> squaring path
	jz	L$sqr8x_enter
	jmp	L$mul4x_enter

# ---------------------------------------------------------------------------
# Generic path. Frame (r10 = new rsp, 1024-byte aligned):
#   tp[0..num] at (%rsp), saved entry rsp at 8(%rsp,%r9,8).
# ---------------------------------------------------------------------------
.p2align	4
L$mul_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10	# rsp - 8*(num+2)
	negq	%r9			# restore num
	andq	$-1024,%r10		# round frame base down to 1024 bytes

	# Move rsp down to r10 in steps of at most 4096 bytes, reading one word
	# at every step so each page between old and new rsp is touched in order.
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul_page_walk
	jmp	L$mul_page_walk_done

.p2align	4
L$mul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul_page_walk
L$mul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# save entry rsp above tp[num]

L$mul_body:
	movq	%rdx,%r12		# r12 = bp (rdx is clobbered by mulq)
	movq	(%r8),%r8		# r8 = n0
	movq	(%r12),%rbx		# rbx = bp[0]
	movq	(%rsi),%rax		# rax = ap[0]

	xorq	%r14,%r14		# r14 = i (outer row index)
	xorq	%r15,%r15		# r15 = j (inner word index)

	# Row 0: tp = (ap * bp[0] + m * np) >> 64, m = low64(ap[0]*bp[0] * n0).
	movq	%r8,%rbp
	mulq	%rbx			# rdx:rax = ap[0] * bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# rbp = m
	movq	%rdx,%r11

	mulq	%rbp			# rdx:rax = np[0] * m
	addq	%rax,%r10		# low word is discarded, only the carry is kept
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$1st_enter

.p2align	4
L$1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-2] (r15 was already advanced)
	movq	%rdx,%r13

L$1st_enter:
	mulq	%rbx			# ap[j] * bp[0]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp			# np[j] * m
	cmpq	%r9,%r15
	jne	L$1st

	addq	%rax,%r13
	movq	(%rsi),%rax		# preload ap[0] for the next row
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	# tp[num] = top carry

	leaq	1(%r14),%r14
	jmp	L$outer

	# Rows 1..num-1: tp = (tp + ap * bp[i] + m * np) >> 64,
	# m = low64((tp[0] + ap[0]*bp[i]) * n0).
.p2align	4
L$outer:
	movq	(%r12,%r14,8),%rbx	# rbx = bp[i]
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10		# tp[0]
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# rbp = m
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		# low word discarded, carry kept
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		# tp[1]
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	L$inner_enter

.p2align	4
L$inner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

L$inner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	L$inner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	# previous tp[num]
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	L$outer

	# rp = tp - np. The xorq clears CF; movq, leaq and decq leave CF alone,
	# so the borrow carries from one iteration of L$sub to the next.
	xorq	%r14,%r14
	movq	(%rsp),%rax
	movq	%r9,%r15

.p2align	4
L$sub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsp,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	L$sub

	# rax = tp[num] - borrow, used as a select mask (expected to be 0 or
	# all-ones); rbx = ~rax.
	sbbq	$0,%rax
	movq	$-1,%rbx
	xorq	%rax,%rbx
	xorq	%r14,%r14
	movq	%r9,%r15

	# Branch-free select: rp[i] = (rp[i] & ~mask) | (tp[i] & mask).
	# Each tp word is overwritten (with the value of r9) once it is read.
L$copy:
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r9,(%rsp,%r14,8)
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	L$copy

	movq	8(%rsp,%r9,8),%rsi	# rsi = entry rsp

	movq	$1,%rax			# return value
	movq	-48(%rsi),%r15		# reload the six registers pushed at entry
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$mul_epilogue:
	.byte	0xf3,0xc3		# rep ret

# ---------------------------------------------------------------------------
# 4x-unrolled path. Requires rax = entry rsp, r9 = num, and r11d = the
# capability word loaded by _bn_mul_mont. Frame:
#   tp[0..num] at (%rsp), entry rsp at 8(%rsp,%r9,8), rp at 16(%rsp,%r9,8).
# ---------------------------------------------------------------------------
.p2align	4
bn_mul4x_mont:

	movl	%r9d,%r9d
	movq	%rsp,%rax

L$mul4x_enter:
	# 0x80100 = bits 8 and 19. The path they gate uses mulx and adcx/adox,
	# so these are presumably the BMI2 and ADX feature bits - confirm
	# against the capability-vector definition.
	andl	$0x80100,%r11d
	cmpl	$0x80100,%r11d
	je	L$mulx4x_enter
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10	# rsp - 8*(num+4)
	negq	%r9
	andq	$-1024,%r10

	# Page-by-page descent to the new frame, as in the generic path.
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul4x_page_walk
	jmp	L$mul4x_page_walk_done

L$mul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	L$mul4x_page_walk
L$mul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# save entry rsp

L$mul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	# save rp; rdi is reused as an accumulator
	movq	%rdx,%r12		# r12 = bp
	movq	(%r8),%r8		# r8 = n0
	movq	(%r12),%rbx		# rbx = bp[0]
	movq	(%rsi),%rax		# rax = ap[0]

	xorq	%r14,%r14		# i
	xorq	%r15,%r15		# j

	# Row 0, first two words; rdi and r13 alternate as running accumulators.
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# rbp = m
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	L$1st4x

	# Row 0 main loop: four words of ap*bp[0] + m*np per iteration.
.p2align	4
L$1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	L$1st4x

	# Row 0 tail: last two words plus the top carry.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# preload ap[0] for the next row
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	# tp[num] = top carry

	leaq	1(%r14),%r14

	# Rows 1..num-1: same shape as row 0, additionally adding the previous tp.
.p2align	2
L$outer4x:
	movq	(%r12,%r14,8),%rbx	# rbx = bp[i]
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# rbp = m
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	L$inner4x

.p2align	4
L$inner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	L$inner4x

	# Row tail: last two words, then fold in the previous top carry tp[num].
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14		# i++
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	L$outer4x

	# rp = tp - np, four words per iteration. The subq starts the borrow
	# chain; only movq, leaq and decq sit between the sbbq instructions, so
	# CF survives across iterations.
	movq	16(%rsp,%r9,8),%rdi	# rdi = rp (saved at L$mul4x_body)
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdx
	shrq	$2,%r15			# r15 = num/4 - 1 iterations
	leaq	(%rsp),%rsi		# rsi = tp
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx

L$sub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	L$sub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax	# tp[num] (top carry word)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	sbbq	$0,%rax			# rax = select mask (expected 0 or all-ones)
	movq	%rbp,24(%rdi,%r14,8)
	pxor	%xmm0,%xmm0		# xmm0 = 0, used to wipe tp
.byte	102,72,15,110,224		# movq %rax,%xmm4
	pcmpeqd	%xmm5,%xmm5		# xmm5 = all-ones
	pshufd	$0,%xmm4,%xmm4		# broadcast low dword of the mask
	movq	%r9,%r15
	pxor	%xmm4,%xmm5		# xmm5 = ~xmm4
	shrq	$2,%r15			# num/4 iterations of 32 bytes
	xorl	%eax,%eax		# byte offset

	jmp	L$copy4x

	# Branch-free select: rp = (tp & mask) | (rp & ~mask); tp is zeroed as
	# it is consumed.
.p2align	4
L$copy4x:
	movdqa	(%rsp,%rax,1),%xmm1
	movdqu	(%rdi,%rax,1),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax,1),%xmm3
	movdqa	%xmm0,(%rsp,%rax,1)
	por	%xmm2,%xmm1
	movdqu	16(%rdi,%rax,1),%xmm2
	movdqu	%xmm1,(%rdi,%rax,1)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax,1)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16(%rdi,%rax,1)
	leaq	32(%rax),%rax
	decq	%r15
	jnz	L$copy4x
	movq	8(%rsp,%r9,8),%rsi	# rsi = entry rsp

	movq	$1,%rax			# return value
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$mul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret

# ---------------------------------------------------------------------------
# Squaring path (ap == bp, num a multiple of 8). The arithmetic lives in
# _bn_sqr8x_internal / _bn_sqrx8x_internal, which are defined outside this
# file; this wrapper builds the frame, calls one of them, and performs the
# final subtract and masked copy.
# Frame slots written here: 32(%rsp) = n0, 40(%rsp) = entry rsp.
# ---------------------------------------------------------------------------
.p2align	5
bn_sqr8x_mont:

	movq	%rsp,%rax

L$sqr8x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
L$sqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			# r9 = num * 8 (bytes)
	shlq	$3+2,%r10		# r10 = num * 32
	negq	%r9			# r9 = -num * 8

	# Choose the frame base rbp below rsp. The candidate frame address is
	# compared with ap modulo 4096 and the base is shifted accordingly.
	# NOTE(review): presumably this keeps the frame from aliasing the input
	# at page granularity; the rationale is not stated in this file.
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8		# r8 = n0
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	L$sqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	L$sqr8x_sp_done

.p2align	5
L$sqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11		# clamp to 0 when the subtraction borrowed
	subq	%r11,%rbp
L$sqr8x_sp_done:
	andq	$-64,%rbp		# 64-byte align the frame base
	# Page-by-page descent from the current rsp to rbp.
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$sqr8x_page_walk
	jmp	L$sqr8x_page_walk_done

.p2align	4
L$sqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$sqr8x_page_walk
L$sqr8x_page_walk_done:

	movq	%r9,%r10		# r10 = -num * 8
	negq	%r9			# r9 = num * 8

	movq	%r8,32(%rsp)		# n0
	movq	%rax,40(%rsp)		# entry rsp

L$sqr8x_body:

.byte	102,72,15,110,209		# movq %rcx,%xmm2  (np)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		# movq %rdi,%xmm1  (rp)
.byte	102,73,15,110,218		# movq %r10,%xmm3  (-num * 8)
	movl	_OPENSSL_ia32cap_P+8(%rip),%eax
	andl	$0x80100,%eax		# same two capability bits as L$mul4x_enter
	cmpl	$0x80100,%eax
	jne	L$sqr8x_nox

	call	_bn_sqrx8x_internal

	# NOTE(review): the values of r8, rcx, rbp and rax used below are
	# whatever the internal routine leaves behind; that contract is defined
	# with the routine, not here.
	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
.byte	102,72,15,126,207		# movq %xmm1,%rdi  (restore rp)
	sarq	$3+2,%rcx		# bytes -> count of 32-byte groups
	jmp	L$sqr8x_sub

.p2align	5
L$sqr8x_nox:
	call	_bn_sqr8x_internal

	# NOTE(review): as above, rdi, r9, rbp and rax come from the callee.
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207		# movq %xmm1,%rdi  (restore rp)
	sarq	$3+2,%rcx
	jmp	L$sqr8x_sub

	# rp = (%rbx) - (%rbp), four words per iteration; rcx counts up to 0.
	# The first sbbq consumes the CF left by sarq (the last bit shifted
	# out); leaq, movq and incq do not modify CF inside the loop.
.p2align	5
L$sqr8x_sub:
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	L$sqr8x_sub

	sbbq	$0,%rax			# fold the final borrow into rax -> select mask
	leaq	(%rbx,%r9,1),%rbx	# rewind source (r9 is negative here)
	leaq	(%rdi,%r9,1),%rdi	# rewind rp

.byte	102,72,15,110,200		# movq %rax,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# broadcast low dword of the mask
	movq	40(%rsp),%rsi		# rsi = entry rsp

	jmp	L$sqr8x_cond_copy

	# Masked copy: rp = (tp & mask) | (rp & (mask == 0)). Two 32-byte
	# regions of the temporary are zeroed per iteration: the one just read
	# and the one rdx bytes away from it.
.p2align	5
L$sqr8x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0		# xmm0 = (mask == 0) per dword
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0		# back to zero for the next wipe
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	L$sqr8x_cond_copy

	movq	$1,%rax			# return value
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$sqr8x_epilogue:
	.byte	0xf3,0xc3		# rep ret

# ---------------------------------------------------------------------------
# mulx/adcx/adox path. Requires rax = entry rsp and r9d = num.
# Frame slots (offsets from the new rsp):
#   +0   num * 8 (byte length)
#   +8   pointer to the next bp word
#   +16  bp + num * 8 (end of bp, loop bound for the outer loop)
#   +24  n0
#   +32  rp
#   +40  entry rsp
#   +48  inner-loop count = num * 8 / 32 - 1
#   +64  start of tp
# ---------------------------------------------------------------------------
.p2align	5
bn_mulx4x_mont:

	movq	%rsp,%rax

L$mulx4x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
L$mulx4x_prologue:

	shll	$3,%r9d			# r9 = num * 8
	xorq	%r10,%r10
	subq	%r9,%r10		# r10 = -num * 8
	movq	(%r8),%r8		# r8 = n0
	leaq	-72(%rsp,%r10,1),%rbp
	andq	$-128,%rbp		# 128-byte align the frame base
	# Page-by-page descent from the current rsp to rbp.
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$mulx4x_page_walk
	jmp	L$mulx4x_page_walk_done

.p2align	4
L$mulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	L$mulx4x_page_walk
L$mulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10	# r10 = end of bp

	movq	%r9,0(%rsp)
	shrq	$5,%r9
	movq	%r10,16(%rsp)
	subq	$1,%r9
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)

	movq	%r9,48(%rsp)
	jmp	L$mulx4x_body

	# Row 0, first four words. mulx takes its implicit multiplicand from
	# rdx, which alternates between bp[i] (kept in r9) and m (kept in r8).
.p2align	5
L$mulx4x_body:
	leaq	8(%rdx),%rdi		# rdi = address of bp[1]
	movq	(%rdx),%rdx		# rdx = bp[0]
	leaq	64+32(%rsp),%rbx	# rbx = tp write cursor
	movq	%rdx,%r9

	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r14
	addq	%rax,%r11
	movq	%rdi,8(%rsp)
	mulxq	16(%rsi),%r12,%r13
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi
	imulq	24(%rsp),%r8		# r8 = m = low word * n0
	xorq	%rbp,%rbp		# rbp = 0; clears CF and OF for the two carry chains

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%rdi
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulxq 16(%rcx),%rax,%r12 (disp32 form)
	movq	48(%rsp),%rdi		# rdi = inner-loop count
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	L$mulx4x_1st

	# Row 0 main loop: four words per iteration. The CF chain (adcx) and
	# the OF chain (adox) stay live across iterations; leaq, movq and mulx
	# do not modify flags, and decq leaves CF untouched.
.p2align	5
L$mulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67			# two 0x67 prefix bytes on the next instruction (padding, presumably)
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	L$mulx4x_1st

	movq	0(%rsp),%rax		# rax = num * 8
	movq	8(%rsp),%rdi		# rdi = address of the next bp word
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15		# r15 = 0 or -1: top carry of this row
	movq	%r14,-8(%rbx)
	jmp	L$mulx4x_outer

	# Rows 1..num-1.
.p2align	5
L$mulx4x_outer:
	movq	(%rdi),%rdx		# rdx = bp[i]
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi		# rewind ap
	movq	%r15,(%rbx)		# store the previous top carry after tp
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx		# rewind np

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp		# rbp = 0, CF = OF = 0
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)
	movq	%r8,%r15
	imulq	24(%rsp),%r8		# r8 = m
	xorl	%ebp,%ebp		# restart both carry chains

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi		# rdi = inner-loop count
	movq	%r12,-16(%rbx)

	jmp	L$mulx4x_inner

.p2align	5
L$mulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	L$mulx4x_inner

	movq	0(%rsp),%rax		# rax = num * 8
	movq	8(%rsp),%rdi		# rdi = address of the next bp word
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp		# 0 - saved carry word: CF = (saved carry != 0)
	adcq	%r15,%r14
	sbbq	%r15,%r15		# r15 = 0 or -1: new top carry
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi		# reached the end of bp?
	jne	L$mulx4x_outer

	leaq	64(%rsp),%rbx		# rbx = tp
	subq	%rax,%rcx		# rewind np
	negq	%r15			# r15 = 0 or 1; also sets CF for the first sbbq below
	movq	%rax,%rdx		# rdx = num * 8, byte counter for the copy loop
	shrq	$3+2,%rax		# rax = number of 32-byte groups
	movq	32(%rsp),%rdi		# rdi = rp
	jmp	L$mulx4x_sub

	# rp = tp - np, four words per iteration. The first sbbq consumes the
	# CF left by the shrq above (the last bit shifted out); leaq, movq and
	# decq keep CF intact inside the loop.
.p2align	5
L$mulx4x_sub:
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	L$mulx4x_sub

	sbbq	$0,%r15			# top carry minus borrow -> select mask
	leaq	64(%rsp),%rbx		# rbx = tp again
	subq	%rdx,%rdi		# rewind rp

.byte	102,73,15,110,207		# movq %r15,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# broadcast low dword of the mask
	movq	40(%rsp),%rsi		# rsi = entry rsp

	jmp	L$mulx4x_cond_copy

	# Masked copy: rp = (tp & mask) | (rp & (mask == 0)); tp is zeroed as
	# it is consumed.
.p2align	5
L$mulx4x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0		# xmm0 = (mask == 0) per dword
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0		# back to zero for the next wipe
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	L$mulx4x_cond_copy

	movq	%rdx,(%rbx)		# rdx is 0 here: clear the word just past tp

	movq	$1,%rax			# return value
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
L$mulx4x_epilogue:
	.byte	0xf3,0xc3		# rep ret

# NUL-terminated ASCII identification string:
#   Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align	4
#endif