#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
# Copyright (c) 2015 CloudFlare, Inc.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
# (3) CloudFlare, Inc.
#
# Reference:
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
# 256 Bit Primes"

# Further optimization by <appro@openssl.org>:
#
#		this/original	with/without -DECP_NISTZ256_ASM(*)
# Opteron	+15-49%		+150-195%
# Bulldozer	+18-45%		+175-240%
# P4		+24-46%		+100-150%
# Westmere	+18-34%		+87-160%
# Sandy Bridge	+14-35%		+120-185%
# Ivy Bridge	+11-35%		+125-180%
# Haswell	+10-37%		+160-200%
# Broadwell	+24-58%		+210-270%
# Atom		+20-50%		+180-240%
# VIA Nano	+50-160%	+480-480%
#
# (*)	"without -DECP_NISTZ256_ASM" refers to build with
#	"enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. In the "this/original" column the lower coefficient is for
# ECDSA sign; in the "with/without" column the lower coefficient is for
# ECDH key agreement and the higher one for ECDSA sign, the relatively
# fastest server-side operation. Keep in mind that +100% means 2x
# improvement.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$avx = 2;
$addx = 1;

$code.=<<___;
.text
.extern OPENSSL_ia32cap_P

# The polynomial
.section .rodata
.align 64
.Lpoly:
.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

.LOne:
.long 1,1,1,1,1,1,1,1
.LTwo:
.long 2,2,2,2,2,2,2,2
.LThree:
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe

# Constants for computations modulo ord(p256)
.Lord:
.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
.LordK:
.quad 0xccd1c8aaee00bc4f
.text
___

{
my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");

$code.=<<___;

################################################################################
# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
.globl ecp_nistz256_neg
.type ecp_nistz256_neg,\@function,2
.align 32
ecp_nistz256_neg:
.cfi_startproc
	_CET_ENDBR
	push	%r12
.cfi_push %r12
	push	%r13
.cfi_push %r13
.Lneg_body:

	xor	$a0, $a0
	xor	$a1, $a1
	xor	$a2, $a2
	xor	$a3, $a3
	xor	$t4, $t4

	sub	8*0($a_ptr), $a0
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a0, $t0
	sbb	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr
	mov	$a1, $t1
	sbb	\$0, $t4

	add
8*0($a_ptr), $a0 124 mov $a2, $t2 125 adc 8*1($a_ptr), $a1 126 adc 8*2($a_ptr), $a2 127 mov $a3, $t3 128 adc 8*3($a_ptr), $a3 129 test $t4, $t4 130 131 cmovz $t0, $a0 132 cmovz $t1, $a1 133 mov $a0, 8*0($r_ptr) 134 cmovz $t2, $a2 135 mov $a1, 8*1($r_ptr) 136 cmovz $t3, $a3 137 mov $a2, 8*2($r_ptr) 138 mov $a3, 8*3($r_ptr) 139 140 mov 0(%rsp),%r13 141.cfi_restore %r13 142 mov 8(%rsp),%r12 143.cfi_restore %r12 144 lea 16(%rsp),%rsp 145.cfi_adjust_cfa_offset -16 146.Lneg_epilogue: 147 ret 148.cfi_endproc 149.size ecp_nistz256_neg,.-ecp_nistz256_neg 150___ 151} 152{ 153my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); 154my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); 155my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); 156my ($poly1,$poly3)=($acc6,$acc7); 157 158$code.=<<___; 159################################################################################ 160# void ecp_nistz256_ord_mul_mont( 161# uint64_t res[4], 162# uint64_t a[4], 163# uint64_t b[4]); 164 165.globl ecp_nistz256_ord_mul_mont 166.type ecp_nistz256_ord_mul_mont,\@function,3 167.align 32 168ecp_nistz256_ord_mul_mont: 169.cfi_startproc 170 _CET_ENDBR 171___ 172$code.=<<___ if ($addx); 173 leaq OPENSSL_ia32cap_P(%rip), %rcx 174 mov 8(%rcx), %rcx 175 and \$0x80100, %ecx 176 cmp \$0x80100, %ecx 177 je .Lecp_nistz256_ord_mul_montx 178___ 179$code.=<<___; 180 push %rbp 181.cfi_push %rbp 182 push %rbx 183.cfi_push %rbx 184 push %r12 185.cfi_push %r12 186 push %r13 187.cfi_push %r13 188 push %r14 189.cfi_push %r14 190 push %r15 191.cfi_push %r15 192.Lord_mul_body: 193 194 mov 8*0($b_org), %rax 195 mov $b_org, $b_ptr 196 lea .Lord(%rip), %r14 197 mov .LordK(%rip), %r15 198 199 ################################# * b[0] 200 mov %rax, $t0 201 mulq 8*0($a_ptr) 202 mov %rax, $acc0 203 mov $t0, %rax 204 mov %rdx, $acc1 205 206 mulq 8*1($a_ptr) 207 add %rax, $acc1 208 mov $t0, %rax 209 adc \$0, %rdx 210 mov %rdx, $acc2 211 212 mulq 8*2($a_ptr) 213 add %rax, $acc2 214 mov $t0, %rax 215 adc \$0, %rdx 216 217 mov $acc0, $acc5 218 imulq %r15,$acc0 219 220 mov %rdx, $acc3 221 mulq 8*3($a_ptr) 222 add %rax, $acc3 223 mov $acc0, %rax 224 adc \$0, %rdx 225 mov %rdx, $acc4 226 227 ################################# First reduction step 228 mulq 8*0(%r14) 229 mov $acc0, $t1 230 add %rax, $acc5 # guaranteed to be zero 231 mov $acc0, %rax 232 adc \$0, %rdx 233 mov %rdx, $t0 234 235 sub $acc0, $acc2 236 sbb \$0, $acc0 # can't borrow 237 238 mulq 8*1(%r14) 239 add $t0, $acc1 240 adc \$0, %rdx 241 add %rax, $acc1 242 mov $t1, %rax 243 adc %rdx, $acc2 244 mov $t1, %rdx 245 adc \$0, $acc0 # can't overflow 246 247 shl \$32, %rax 248 shr \$32, %rdx 249 sub %rax, $acc3 250 mov 8*1($b_ptr), %rax 251 sbb %rdx, $t1 # can't borrow 252 253 add $acc0, $acc3 254 adc $t1, $acc4 255 adc \$0, $acc5 256 257 ################################# * b[1] 258 mov %rax, $t0 259 mulq 8*0($a_ptr) 260 add %rax, $acc1 261 mov $t0, %rax 262 adc \$0, %rdx 263 mov %rdx, $t1 264 265 mulq 8*1($a_ptr) 266 add $t1, $acc2 267 adc \$0, %rdx 268 add %rax, $acc2 269 mov $t0, %rax 270 adc \$0, %rdx 271 mov %rdx, $t1 272 273 mulq 8*2($a_ptr) 274 add $t1, $acc3 275 adc \$0, %rdx 276 add %rax, $acc3 277 mov $t0, %rax 278 adc \$0, %rdx 279 280 mov $acc1, $t0 281 imulq %r15, $acc1 282 283 mov %rdx, $t1 284 mulq 8*3($a_ptr) 285 add $t1, $acc4 286 adc \$0, %rdx 287 xor $acc0, $acc0 288 add %rax, $acc4 289 mov $acc1, %rax 290 adc %rdx, $acc5 291 adc \$0, $acc0 292 293 ################################# Second reduction step 294 mulq 8*0(%r14) 295 mov 
$acc1, $t1 296 add %rax, $t0 # guaranteed to be zero 297 mov $acc1, %rax 298 adc %rdx, $t0 299 300 sub $acc1, $acc3 301 sbb \$0, $acc1 # can't borrow 302 303 mulq 8*1(%r14) 304 add $t0, $acc2 305 adc \$0, %rdx 306 add %rax, $acc2 307 mov $t1, %rax 308 adc %rdx, $acc3 309 mov $t1, %rdx 310 adc \$0, $acc1 # can't overflow 311 312 shl \$32, %rax 313 shr \$32, %rdx 314 sub %rax, $acc4 315 mov 8*2($b_ptr), %rax 316 sbb %rdx, $t1 # can't borrow 317 318 add $acc1, $acc4 319 adc $t1, $acc5 320 adc \$0, $acc0 321 322 ################################## * b[2] 323 mov %rax, $t0 324 mulq 8*0($a_ptr) 325 add %rax, $acc2 326 mov $t0, %rax 327 adc \$0, %rdx 328 mov %rdx, $t1 329 330 mulq 8*1($a_ptr) 331 add $t1, $acc3 332 adc \$0, %rdx 333 add %rax, $acc3 334 mov $t0, %rax 335 adc \$0, %rdx 336 mov %rdx, $t1 337 338 mulq 8*2($a_ptr) 339 add $t1, $acc4 340 adc \$0, %rdx 341 add %rax, $acc4 342 mov $t0, %rax 343 adc \$0, %rdx 344 345 mov $acc2, $t0 346 imulq %r15, $acc2 347 348 mov %rdx, $t1 349 mulq 8*3($a_ptr) 350 add $t1, $acc5 351 adc \$0, %rdx 352 xor $acc1, $acc1 353 add %rax, $acc5 354 mov $acc2, %rax 355 adc %rdx, $acc0 356 adc \$0, $acc1 357 358 ################################# Third reduction step 359 mulq 8*0(%r14) 360 mov $acc2, $t1 361 add %rax, $t0 # guaranteed to be zero 362 mov $acc2, %rax 363 adc %rdx, $t0 364 365 sub $acc2, $acc4 366 sbb \$0, $acc2 # can't borrow 367 368 mulq 8*1(%r14) 369 add $t0, $acc3 370 adc \$0, %rdx 371 add %rax, $acc3 372 mov $t1, %rax 373 adc %rdx, $acc4 374 mov $t1, %rdx 375 adc \$0, $acc2 # can't overflow 376 377 shl \$32, %rax 378 shr \$32, %rdx 379 sub %rax, $acc5 380 mov 8*3($b_ptr), %rax 381 sbb %rdx, $t1 # can't borrow 382 383 add $acc2, $acc5 384 adc $t1, $acc0 385 adc \$0, $acc1 386 387 ################################# * b[3] 388 mov %rax, $t0 389 mulq 8*0($a_ptr) 390 add %rax, $acc3 391 mov $t0, %rax 392 adc \$0, %rdx 393 mov %rdx, $t1 394 395 mulq 8*1($a_ptr) 396 add $t1, $acc4 397 adc \$0, %rdx 398 add %rax, $acc4 399 mov $t0, %rax 400 adc \$0, %rdx 401 mov %rdx, $t1 402 403 mulq 8*2($a_ptr) 404 add $t1, $acc5 405 adc \$0, %rdx 406 add %rax, $acc5 407 mov $t0, %rax 408 adc \$0, %rdx 409 410 mov $acc3, $t0 411 imulq %r15, $acc3 412 413 mov %rdx, $t1 414 mulq 8*3($a_ptr) 415 add $t1, $acc0 416 adc \$0, %rdx 417 xor $acc2, $acc2 418 add %rax, $acc0 419 mov $acc3, %rax 420 adc %rdx, $acc1 421 adc \$0, $acc2 422 423 ################################# Last reduction step 424 mulq 8*0(%r14) 425 mov $acc3, $t1 426 add %rax, $t0 # guaranteed to be zero 427 mov $acc3, %rax 428 adc %rdx, $t0 429 430 sub $acc3, $acc5 431 sbb \$0, $acc3 # can't borrow 432 433 mulq 8*1(%r14) 434 add $t0, $acc4 435 adc \$0, %rdx 436 add %rax, $acc4 437 mov $t1, %rax 438 adc %rdx, $acc5 439 mov $t1, %rdx 440 adc \$0, $acc3 # can't overflow 441 442 shl \$32, %rax 443 shr \$32, %rdx 444 sub %rax, $acc0 445 sbb %rdx, $t1 # can't borrow 446 447 add $acc3, $acc0 448 adc $t1, $acc1 449 adc \$0, $acc2 450 451 ################################# Subtract ord 452 mov $acc4, $a_ptr 453 sub 8*0(%r14), $acc4 454 mov $acc5, $acc3 455 sbb 8*1(%r14), $acc5 456 mov $acc0, $t0 457 sbb 8*2(%r14), $acc0 458 mov $acc1, $t1 459 sbb 8*3(%r14), $acc1 460 sbb \$0, $acc2 461 462 cmovc $a_ptr, $acc4 463 cmovc $acc3, $acc5 464 cmovc $t0, $acc0 465 cmovc $t1, $acc1 466 467 mov $acc4, 8*0($r_ptr) 468 mov $acc5, 8*1($r_ptr) 469 mov $acc0, 8*2($r_ptr) 470 mov $acc1, 8*3($r_ptr) 471 472 mov 0(%rsp),%r15 473.cfi_restore %r15 474 mov 8(%rsp),%r14 475.cfi_restore %r14 476 mov 16(%rsp),%r13 477.cfi_restore %r13 478 mov 
24(%rsp),%r12 479.cfi_restore %r12 480 mov 32(%rsp),%rbx 481.cfi_restore %rbx 482 mov 40(%rsp),%rbp 483.cfi_restore %rbp 484 lea 48(%rsp),%rsp 485.cfi_adjust_cfa_offset -48 486.Lord_mul_epilogue: 487 ret 488.cfi_endproc 489.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont 490 491################################################################################ 492# void ecp_nistz256_ord_sqr_mont( 493# uint64_t res[4], 494# uint64_t a[4], 495# uint64_t rep); 496 497.globl ecp_nistz256_ord_sqr_mont 498.type ecp_nistz256_ord_sqr_mont,\@function,3 499.align 32 500ecp_nistz256_ord_sqr_mont: 501.cfi_startproc 502 _CET_ENDBR 503___ 504$code.=<<___ if ($addx); 505 leaq OPENSSL_ia32cap_P(%rip), %rcx 506 mov 8(%rcx), %rcx 507 and \$0x80100, %ecx 508 cmp \$0x80100, %ecx 509 je .Lecp_nistz256_ord_sqr_montx 510___ 511$code.=<<___; 512 push %rbp 513.cfi_push %rbp 514 push %rbx 515.cfi_push %rbx 516 push %r12 517.cfi_push %r12 518 push %r13 519.cfi_push %r13 520 push %r14 521.cfi_push %r14 522 push %r15 523.cfi_push %r15 524.Lord_sqr_body: 525 526 mov 8*0($a_ptr), $acc0 527 mov 8*1($a_ptr), %rax 528 mov 8*2($a_ptr), $acc6 529 mov 8*3($a_ptr), $acc7 530 lea .Lord(%rip), $a_ptr # pointer to modulus 531 mov $b_org, $b_ptr 532 jmp .Loop_ord_sqr 533 534.align 32 535.Loop_ord_sqr: 536 ################################# a[1:] * a[0] 537 mov %rax, $t1 # put aside a[1] 538 mul $acc0 # a[1] * a[0] 539 mov %rax, $acc1 540 movq $t1, %xmm1 # offload a[1] 541 mov $acc6, %rax 542 mov %rdx, $acc2 543 544 mul $acc0 # a[2] * a[0] 545 add %rax, $acc2 546 mov $acc7, %rax 547 movq $acc6, %xmm2 # offload a[2] 548 adc \$0, %rdx 549 mov %rdx, $acc3 550 551 mul $acc0 # a[3] * a[0] 552 add %rax, $acc3 553 mov $acc7, %rax 554 movq $acc7, %xmm3 # offload a[3] 555 adc \$0, %rdx 556 mov %rdx, $acc4 557 558 ################################# a[3] * a[2] 559 mul $acc6 # a[3] * a[2] 560 mov %rax, $acc5 561 mov $acc6, %rax 562 mov %rdx, $acc6 563 564 ################################# a[2:] * a[1] 565 mul $t1 # a[2] * a[1] 566 add %rax, $acc3 567 mov $acc7, %rax 568 adc \$0, %rdx 569 mov %rdx, $acc7 570 571 mul $t1 # a[3] * a[1] 572 add %rax, $acc4 573 adc \$0, %rdx 574 575 add $acc7, $acc4 576 adc %rdx, $acc5 577 adc \$0, $acc6 # can't overflow 578 579 ################################# *2 580 xor $acc7, $acc7 581 mov $acc0, %rax 582 add $acc1, $acc1 583 adc $acc2, $acc2 584 adc $acc3, $acc3 585 adc $acc4, $acc4 586 adc $acc5, $acc5 587 adc $acc6, $acc6 588 adc \$0, $acc7 589 590 ################################# Missing products 591 mul %rax # a[0] * a[0] 592 mov %rax, $acc0 593 movq %xmm1, %rax 594 mov %rdx, $t1 595 596 mul %rax # a[1] * a[1] 597 add $t1, $acc1 598 adc %rax, $acc2 599 movq %xmm2, %rax 600 adc \$0, %rdx 601 mov %rdx, $t1 602 603 mul %rax # a[2] * a[2] 604 add $t1, $acc3 605 adc %rax, $acc4 606 movq %xmm3, %rax 607 adc \$0, %rdx 608 mov %rdx, $t1 609 610 mov $acc0, $t0 611 imulq 8*4($a_ptr), $acc0 # *= .LordK 612 613 mul %rax # a[3] * a[3] 614 add $t1, $acc5 615 adc %rax, $acc6 616 mov 8*0($a_ptr), %rax # modulus[0] 617 adc %rdx, $acc7 # can't overflow 618 619 ################################# First reduction step 620 mul $acc0 621 mov $acc0, $t1 622 add %rax, $t0 # guaranteed to be zero 623 mov 8*1($a_ptr), %rax # modulus[1] 624 adc %rdx, $t0 625 626 sub $acc0, $acc2 627 sbb \$0, $t1 # can't borrow 628 629 mul $acc0 630 add $t0, $acc1 631 adc \$0, %rdx 632 add %rax, $acc1 633 mov $acc0, %rax 634 adc %rdx, $acc2 635 mov $acc0, %rdx 636 adc \$0, $t1 # can't overflow 637 638 mov $acc1, $t0 639 imulq 8*4($a_ptr), $acc1 
# *= .LordK 640 641 shl \$32, %rax 642 shr \$32, %rdx 643 sub %rax, $acc3 644 mov 8*0($a_ptr), %rax 645 sbb %rdx, $acc0 # can't borrow 646 647 add $t1, $acc3 648 adc \$0, $acc0 # can't overflow 649 650 ################################# Second reduction step 651 mul $acc1 652 mov $acc1, $t1 653 add %rax, $t0 # guaranteed to be zero 654 mov 8*1($a_ptr), %rax 655 adc %rdx, $t0 656 657 sub $acc1, $acc3 658 sbb \$0, $t1 # can't borrow 659 660 mul $acc1 661 add $t0, $acc2 662 adc \$0, %rdx 663 add %rax, $acc2 664 mov $acc1, %rax 665 adc %rdx, $acc3 666 mov $acc1, %rdx 667 adc \$0, $t1 # can't overflow 668 669 mov $acc2, $t0 670 imulq 8*4($a_ptr), $acc2 # *= .LordK 671 672 shl \$32, %rax 673 shr \$32, %rdx 674 sub %rax, $acc0 675 mov 8*0($a_ptr), %rax 676 sbb %rdx, $acc1 # can't borrow 677 678 add $t1, $acc0 679 adc \$0, $acc1 # can't overflow 680 681 ################################# Third reduction step 682 mul $acc2 683 mov $acc2, $t1 684 add %rax, $t0 # guaranteed to be zero 685 mov 8*1($a_ptr), %rax 686 adc %rdx, $t0 687 688 sub $acc2, $acc0 689 sbb \$0, $t1 # can't borrow 690 691 mul $acc2 692 add $t0, $acc3 693 adc \$0, %rdx 694 add %rax, $acc3 695 mov $acc2, %rax 696 adc %rdx, $acc0 697 mov $acc2, %rdx 698 adc \$0, $t1 # can't overflow 699 700 mov $acc3, $t0 701 imulq 8*4($a_ptr), $acc3 # *= .LordK 702 703 shl \$32, %rax 704 shr \$32, %rdx 705 sub %rax, $acc1 706 mov 8*0($a_ptr), %rax 707 sbb %rdx, $acc2 # can't borrow 708 709 add $t1, $acc1 710 adc \$0, $acc2 # can't overflow 711 712 ################################# Last reduction step 713 mul $acc3 714 mov $acc3, $t1 715 add %rax, $t0 # guaranteed to be zero 716 mov 8*1($a_ptr), %rax 717 adc %rdx, $t0 718 719 sub $acc3, $acc1 720 sbb \$0, $t1 # can't borrow 721 722 mul $acc3 723 add $t0, $acc0 724 adc \$0, %rdx 725 add %rax, $acc0 726 mov $acc3, %rax 727 adc %rdx, $acc1 728 mov $acc3, %rdx 729 adc \$0, $t1 # can't overflow 730 731 shl \$32, %rax 732 shr \$32, %rdx 733 sub %rax, $acc2 734 sbb %rdx, $acc3 # can't borrow 735 736 add $t1, $acc2 737 adc \$0, $acc3 # can't overflow 738 739 ################################# Add bits [511:256] of the sqr result 740 xor %rdx, %rdx 741 add $acc4, $acc0 742 adc $acc5, $acc1 743 mov $acc0, $acc4 744 adc $acc6, $acc2 745 adc $acc7, $acc3 746 mov $acc1, %rax 747 adc \$0, %rdx 748 749 ################################# Compare to modulus 750 sub 8*0($a_ptr), $acc0 751 mov $acc2, $acc6 752 sbb 8*1($a_ptr), $acc1 753 sbb 8*2($a_ptr), $acc2 754 mov $acc3, $acc7 755 sbb 8*3($a_ptr), $acc3 756 sbb \$0, %rdx 757 758 cmovc $acc4, $acc0 759 cmovnc $acc1, %rax 760 cmovnc $acc2, $acc6 761 cmovnc $acc3, $acc7 762 763 dec $b_ptr 764 jnz .Loop_ord_sqr 765 766 mov $acc0, 8*0($r_ptr) 767 mov %rax, 8*1($r_ptr) 768 pxor %xmm1, %xmm1 769 mov $acc6, 8*2($r_ptr) 770 pxor %xmm2, %xmm2 771 mov $acc7, 8*3($r_ptr) 772 pxor %xmm3, %xmm3 773 774 mov 0(%rsp),%r15 775.cfi_restore %r15 776 mov 8(%rsp),%r14 777.cfi_restore %r14 778 mov 16(%rsp),%r13 779.cfi_restore %r13 780 mov 24(%rsp),%r12 781.cfi_restore %r12 782 mov 32(%rsp),%rbx 783.cfi_restore %rbx 784 mov 40(%rsp),%rbp 785.cfi_restore %rbp 786 lea 48(%rsp),%rsp 787.cfi_adjust_cfa_offset -48 788.Lord_sqr_epilogue: 789 ret 790.cfi_endproc 791.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont 792___ 793 794$code.=<<___ if ($addx); 795################################################################################ 796.type ecp_nistz256_ord_mul_montx,\@function,3 797.align 32 798ecp_nistz256_ord_mul_montx: 799.cfi_startproc 800.Lecp_nistz256_ord_mul_montx: 801 push %rbp 
802.cfi_push %rbp 803 push %rbx 804.cfi_push %rbx 805 push %r12 806.cfi_push %r12 807 push %r13 808.cfi_push %r13 809 push %r14 810.cfi_push %r14 811 push %r15 812.cfi_push %r15 813.Lord_mulx_body: 814 815 mov $b_org, $b_ptr 816 mov 8*0($b_org), %rdx 817 mov 8*0($a_ptr), $acc1 818 mov 8*1($a_ptr), $acc2 819 mov 8*2($a_ptr), $acc3 820 mov 8*3($a_ptr), $acc4 821 lea -128($a_ptr), $a_ptr # control u-op density 822 lea .Lord-128(%rip), %r14 823 mov .LordK(%rip), %r15 824 825 ################################# Multiply by b[0] 826 mulx $acc1, $acc0, $acc1 827 mulx $acc2, $t0, $acc2 828 mulx $acc3, $t1, $acc3 829 add $t0, $acc1 830 mulx $acc4, $t0, $acc4 831 mov $acc0, %rdx 832 mulx %r15, %rdx, %rax 833 adc $t1, $acc2 834 adc $t0, $acc3 835 adc \$0, $acc4 836 837 ################################# reduction 838 xor $acc5, $acc5 # $acc5=0, cf=0, of=0 839 mulx 8*0+128(%r14), $t0, $t1 840 adcx $t0, $acc0 # guaranteed to be zero 841 adox $t1, $acc1 842 843 mulx 8*1+128(%r14), $t0, $t1 844 adcx $t0, $acc1 845 adox $t1, $acc2 846 847 mulx 8*2+128(%r14), $t0, $t1 848 adcx $t0, $acc2 849 adox $t1, $acc3 850 851 mulx 8*3+128(%r14), $t0, $t1 852 mov 8*1($b_ptr), %rdx 853 adcx $t0, $acc3 854 adox $t1, $acc4 855 adcx $acc0, $acc4 856 adox $acc0, $acc5 857 adc \$0, $acc5 # cf=0, of=0 858 859 ################################# Multiply by b[1] 860 mulx 8*0+128($a_ptr), $t0, $t1 861 adcx $t0, $acc1 862 adox $t1, $acc2 863 864 mulx 8*1+128($a_ptr), $t0, $t1 865 adcx $t0, $acc2 866 adox $t1, $acc3 867 868 mulx 8*2+128($a_ptr), $t0, $t1 869 adcx $t0, $acc3 870 adox $t1, $acc4 871 872 mulx 8*3+128($a_ptr), $t0, $t1 873 mov $acc1, %rdx 874 mulx %r15, %rdx, %rax 875 adcx $t0, $acc4 876 adox $t1, $acc5 877 878 adcx $acc0, $acc5 879 adox $acc0, $acc0 880 adc \$0, $acc0 # cf=0, of=0 881 882 ################################# reduction 883 mulx 8*0+128(%r14), $t0, $t1 884 adcx $t0, $acc1 # guaranteed to be zero 885 adox $t1, $acc2 886 887 mulx 8*1+128(%r14), $t0, $t1 888 adcx $t0, $acc2 889 adox $t1, $acc3 890 891 mulx 8*2+128(%r14), $t0, $t1 892 adcx $t0, $acc3 893 adox $t1, $acc4 894 895 mulx 8*3+128(%r14), $t0, $t1 896 mov 8*2($b_ptr), %rdx 897 adcx $t0, $acc4 898 adox $t1, $acc5 899 adcx $acc1, $acc5 900 adox $acc1, $acc0 901 adc \$0, $acc0 # cf=0, of=0 902 903 ################################# Multiply by b[2] 904 mulx 8*0+128($a_ptr), $t0, $t1 905 adcx $t0, $acc2 906 adox $t1, $acc3 907 908 mulx 8*1+128($a_ptr), $t0, $t1 909 adcx $t0, $acc3 910 adox $t1, $acc4 911 912 mulx 8*2+128($a_ptr), $t0, $t1 913 adcx $t0, $acc4 914 adox $t1, $acc5 915 916 mulx 8*3+128($a_ptr), $t0, $t1 917 mov $acc2, %rdx 918 mulx %r15, %rdx, %rax 919 adcx $t0, $acc5 920 adox $t1, $acc0 921 922 adcx $acc1, $acc0 923 adox $acc1, $acc1 924 adc \$0, $acc1 # cf=0, of=0 925 926 ################################# reduction 927 mulx 8*0+128(%r14), $t0, $t1 928 adcx $t0, $acc2 # guaranteed to be zero 929 adox $t1, $acc3 930 931 mulx 8*1+128(%r14), $t0, $t1 932 adcx $t0, $acc3 933 adox $t1, $acc4 934 935 mulx 8*2+128(%r14), $t0, $t1 936 adcx $t0, $acc4 937 adox $t1, $acc5 938 939 mulx 8*3+128(%r14), $t0, $t1 940 mov 8*3($b_ptr), %rdx 941 adcx $t0, $acc5 942 adox $t1, $acc0 943 adcx $acc2, $acc0 944 adox $acc2, $acc1 945 adc \$0, $acc1 # cf=0, of=0 946 947 ################################# Multiply by b[3] 948 mulx 8*0+128($a_ptr), $t0, $t1 949 adcx $t0, $acc3 950 adox $t1, $acc4 951 952 mulx 8*1+128($a_ptr), $t0, $t1 953 adcx $t0, $acc4 954 adox $t1, $acc5 955 956 mulx 8*2+128($a_ptr), $t0, $t1 957 adcx $t0, $acc5 958 adox $t1, $acc0 959 960 mulx 
8*3+128($a_ptr), $t0, $t1 961 mov $acc3, %rdx 962 mulx %r15, %rdx, %rax 963 adcx $t0, $acc0 964 adox $t1, $acc1 965 966 adcx $acc2, $acc1 967 adox $acc2, $acc2 968 adc \$0, $acc2 # cf=0, of=0 969 970 ################################# reduction 971 mulx 8*0+128(%r14), $t0, $t1 972 adcx $t0, $acc3 # guranteed to be zero 973 adox $t1, $acc4 974 975 mulx 8*1+128(%r14), $t0, $t1 976 adcx $t0, $acc4 977 adox $t1, $acc5 978 979 mulx 8*2+128(%r14), $t0, $t1 980 adcx $t0, $acc5 981 adox $t1, $acc0 982 983 mulx 8*3+128(%r14), $t0, $t1 984 lea 128(%r14),%r14 985 mov $acc4, $t2 986 adcx $t0, $acc0 987 adox $t1, $acc1 988 mov $acc5, $t3 989 adcx $acc3, $acc1 990 adox $acc3, $acc2 991 adc \$0, $acc2 992 993 ################################# 994 # Branch-less conditional subtraction of P 995 mov $acc0, $t0 996 sub 8*0(%r14), $acc4 997 sbb 8*1(%r14), $acc5 998 sbb 8*2(%r14), $acc0 999 mov $acc1, $t1 1000 sbb 8*3(%r14), $acc1 1001 sbb \$0, $acc2 1002 1003 cmovc $t2, $acc4 1004 cmovc $t3, $acc5 1005 cmovc $t0, $acc0 1006 cmovc $t1, $acc1 1007 1008 mov $acc4, 8*0($r_ptr) 1009 mov $acc5, 8*1($r_ptr) 1010 mov $acc0, 8*2($r_ptr) 1011 mov $acc1, 8*3($r_ptr) 1012 1013 mov 0(%rsp),%r15 1014.cfi_restore %r15 1015 mov 8(%rsp),%r14 1016.cfi_restore %r14 1017 mov 16(%rsp),%r13 1018.cfi_restore %r13 1019 mov 24(%rsp),%r12 1020.cfi_restore %r12 1021 mov 32(%rsp),%rbx 1022.cfi_restore %rbx 1023 mov 40(%rsp),%rbp 1024.cfi_restore %rbp 1025 lea 48(%rsp),%rsp 1026.cfi_adjust_cfa_offset -48 1027.Lord_mulx_epilogue: 1028 ret 1029.cfi_endproc 1030.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx 1031 1032.type ecp_nistz256_ord_sqr_montx,\@function,3 1033.align 32 1034ecp_nistz256_ord_sqr_montx: 1035.cfi_startproc 1036.Lecp_nistz256_ord_sqr_montx: 1037 push %rbp 1038.cfi_push %rbp 1039 push %rbx 1040.cfi_push %rbx 1041 push %r12 1042.cfi_push %r12 1043 push %r13 1044.cfi_push %r13 1045 push %r14 1046.cfi_push %r14 1047 push %r15 1048.cfi_push %r15 1049.Lord_sqrx_body: 1050 1051 mov $b_org, $b_ptr 1052 mov 8*0($a_ptr), %rdx 1053 mov 8*1($a_ptr), $acc6 1054 mov 8*2($a_ptr), $acc7 1055 mov 8*3($a_ptr), $acc0 1056 lea .Lord(%rip), $a_ptr 1057 jmp .Loop_ord_sqrx 1058 1059.align 32 1060.Loop_ord_sqrx: 1061 mulx $acc6, $acc1, $acc2 # a[0]*a[1] 1062 mulx $acc7, $t0, $acc3 # a[0]*a[2] 1063 mov %rdx, %rax # offload a[0] 1064 movq $acc6, %xmm1 # offload a[1] 1065 mulx $acc0, $t1, $acc4 # a[0]*a[3] 1066 mov $acc6, %rdx 1067 add $t0, $acc2 1068 movq $acc7, %xmm2 # offload a[2] 1069 adc $t1, $acc3 1070 adc \$0, $acc4 1071 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 1072 ################################# 1073 mulx $acc7, $t0, $t1 # a[1]*a[2] 1074 adcx $t0, $acc3 1075 adox $t1, $acc4 1076 1077 mulx $acc0, $t0, $t1 # a[1]*a[3] 1078 mov $acc7, %rdx 1079 adcx $t0, $acc4 1080 adox $t1, $acc5 1081 adc \$0, $acc5 1082 ################################# 1083 mulx $acc0, $t0, $acc6 # a[2]*a[3] 1084 mov %rax, %rdx 1085 movq $acc0, %xmm3 # offload a[3] 1086 xor $acc7, $acc7 # $acc7=0,cf=0,of=0 1087 adcx $acc1, $acc1 # acc1:6<<1 1088 adox $t0, $acc5 1089 adcx $acc2, $acc2 1090 adox $acc7, $acc6 # of=0 1091 1092 ################################# a[i]*a[i] 1093 mulx %rdx, $acc0, $t1 1094 movq %xmm1, %rdx 1095 adcx $acc3, $acc3 1096 adox $t1, $acc1 1097 adcx $acc4, $acc4 1098 mulx %rdx, $t0, $t4 1099 movq %xmm2, %rdx 1100 adcx $acc5, $acc5 1101 adox $t0, $acc2 1102 adcx $acc6, $acc6 1103 mulx %rdx, $t0, $t1 1104 .byte 0x67 1105 movq %xmm3, %rdx 1106 adox $t4, $acc3 1107 adcx $acc7, $acc7 1108 adox $t0, $acc4 1109 adox $t1, $acc5 1110 mulx %rdx, $t0, $t4 
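	# note: the two adox below fold a[3]*a[3] into the top limbs, completing
	# the 512-bit square; it is then reduced modulo ord(p256) one 64-bit word
	# at a time using .LordK (-1/ord mod 2^64), so each step clears a low word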
1111 adox $t0, $acc6 1112 adox $t4, $acc7 1113 1114 ################################# reduction 1115 mov $acc0, %rdx 1116 mulx 8*4($a_ptr), %rdx, $t0 1117 1118 xor %rax, %rax # cf=0, of=0 1119 mulx 8*0($a_ptr), $t0, $t1 1120 adcx $t0, $acc0 # guaranteed to be zero 1121 adox $t1, $acc1 1122 mulx 8*1($a_ptr), $t0, $t1 1123 adcx $t0, $acc1 1124 adox $t1, $acc2 1125 mulx 8*2($a_ptr), $t0, $t1 1126 adcx $t0, $acc2 1127 adox $t1, $acc3 1128 mulx 8*3($a_ptr), $t0, $t1 1129 adcx $t0, $acc3 1130 adox $t1, $acc0 # of=0 1131 adcx %rax, $acc0 # cf=0 1132 1133 ################################# 1134 mov $acc1, %rdx 1135 mulx 8*4($a_ptr), %rdx, $t0 1136 1137 mulx 8*0($a_ptr), $t0, $t1 1138 adox $t0, $acc1 # guaranteed to be zero 1139 adcx $t1, $acc2 1140 mulx 8*1($a_ptr), $t0, $t1 1141 adox $t0, $acc2 1142 adcx $t1, $acc3 1143 mulx 8*2($a_ptr), $t0, $t1 1144 adox $t0, $acc3 1145 adcx $t1, $acc0 1146 mulx 8*3($a_ptr), $t0, $t1 1147 adox $t0, $acc0 1148 adcx $t1, $acc1 # cf=0 1149 adox %rax, $acc1 # of=0 1150 1151 ################################# 1152 mov $acc2, %rdx 1153 mulx 8*4($a_ptr), %rdx, $t0 1154 1155 mulx 8*0($a_ptr), $t0, $t1 1156 adcx $t0, $acc2 # guaranteed to be zero 1157 adox $t1, $acc3 1158 mulx 8*1($a_ptr), $t0, $t1 1159 adcx $t0, $acc3 1160 adox $t1, $acc0 1161 mulx 8*2($a_ptr), $t0, $t1 1162 adcx $t0, $acc0 1163 adox $t1, $acc1 1164 mulx 8*3($a_ptr), $t0, $t1 1165 adcx $t0, $acc1 1166 adox $t1, $acc2 # of=0 1167 adcx %rax, $acc2 # cf=0 1168 1169 ################################# 1170 mov $acc3, %rdx 1171 mulx 8*4($a_ptr), %rdx, $t0 1172 1173 mulx 8*0($a_ptr), $t0, $t1 1174 adox $t0, $acc3 # guaranteed to be zero 1175 adcx $t1, $acc0 1176 mulx 8*1($a_ptr), $t0, $t1 1177 adox $t0, $acc0 1178 adcx $t1, $acc1 1179 mulx 8*2($a_ptr), $t0, $t1 1180 adox $t0, $acc1 1181 adcx $t1, $acc2 1182 mulx 8*3($a_ptr), $t0, $t1 1183 adox $t0, $acc2 1184 adcx $t1, $acc3 1185 adox %rax, $acc3 1186 1187 ################################# accumulate upper half 1188 add $acc0, $acc4 # add $acc4, $acc0 1189 adc $acc5, $acc1 1190 mov $acc4, %rdx 1191 adc $acc6, $acc2 1192 adc $acc7, $acc3 1193 mov $acc1, $acc6 1194 adc \$0, %rax 1195 1196 ################################# compare to modulus 1197 sub 8*0($a_ptr), $acc4 1198 mov $acc2, $acc7 1199 sbb 8*1($a_ptr), $acc1 1200 sbb 8*2($a_ptr), $acc2 1201 mov $acc3, $acc0 1202 sbb 8*3($a_ptr), $acc3 1203 sbb \$0, %rax 1204 1205 cmovnc $acc4, %rdx 1206 cmovnc $acc1, $acc6 1207 cmovnc $acc2, $acc7 1208 cmovnc $acc3, $acc0 1209 1210 dec $b_ptr 1211 jnz .Loop_ord_sqrx 1212 1213 mov %rdx, 8*0($r_ptr) 1214 mov $acc6, 8*1($r_ptr) 1215 pxor %xmm1, %xmm1 1216 mov $acc7, 8*2($r_ptr) 1217 pxor %xmm2, %xmm2 1218 mov $acc0, 8*3($r_ptr) 1219 pxor %xmm3, %xmm3 1220 1221 mov 0(%rsp),%r15 1222.cfi_restore %r15 1223 mov 8(%rsp),%r14 1224.cfi_restore %r14 1225 mov 16(%rsp),%r13 1226.cfi_restore %r13 1227 mov 24(%rsp),%r12 1228.cfi_restore %r12 1229 mov 32(%rsp),%rbx 1230.cfi_restore %rbx 1231 mov 40(%rsp),%rbp 1232.cfi_restore %rbp 1233 lea 48(%rsp),%rsp 1234.cfi_adjust_cfa_offset -48 1235.Lord_sqrx_epilogue: 1236 ret 1237.cfi_endproc 1238.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx 1239___ 1240 1241$code.=<<___; 1242################################################################################ 1243# void ecp_nistz256_mul_mont( 1244# uint64_t res[4], 1245# uint64_t a[4], 1246# uint64_t b[4]); 1247 1248.globl ecp_nistz256_mul_mont 1249.type ecp_nistz256_mul_mont,\@function,3 1250.align 32 1251ecp_nistz256_mul_mont: 1252.cfi_startproc 1253 _CET_ENDBR 1254___ 
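# ecp_nistz256_mul_mont computes res = a*b*2^-256 mod p256 (Montgomery
# multiplication with R = 2^256); both code paths below depend on the
# constants emitted into .rodata at the top of this file. The guarded block
# that follows is only a development-time sketch: the ECP_NISTZ256_SELFTEST
# environment variable is illustrative (not an upstream switch) and, when
# set, re-derives two of those constants with Math::BigInt and aborts
# generation on a mismatch.
if ($ENV{ECP_NISTZ256_SELFTEST}) {
	require Math::BigInt;
	my $two64  = Math::BigInt->new(2)->bpow(64);	# 2^64
	my $two256 = Math::BigInt->new(2)->bpow(256);	# 2^256, the Montgomery radix R
	# p256, .LONE_mont, the least significant word of .Lord and .LordK,
	# written as the big-endian equivalents of the little-endian limbs above
	my $p256     = Math::BigInt->from_hex("0xffffffff00000001" . "0"x24 . "f"x24);
	my $one_mont = Math::BigInt->from_hex("0xfffffffe" . "f"x24 . "0"x23 . "1");
	my $ord_lo   = Math::BigInt->from_hex("0xf3b9cac2fc632551");
	my $ord_k    = Math::BigInt->from_hex("0xccd1c8aaee00bc4f");
	# .LONE_mont must be the Montgomery form of 1, i.e. 2^256 mod p256
	die "LONE_mont != 2^256 mod p256"
	    if $one_mont->bcmp($two256->copy->bmod($p256)) != 0;
	# .LordK must be -1/ord mod 2^64, i.e. ord[0]*ordK == 2^64-1 (mod 2^64)
	die "LordK != -1/ord mod 2^64"
	    if $ord_lo->copy->bmul($ord_k)->bmod($two64)->bcmp($two64->copy->bdec) != 0;
}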
1255$code.=<<___ if ($addx); 1256 leaq OPENSSL_ia32cap_P(%rip), %rcx 1257 mov 8(%rcx), %rcx 1258 and \$0x80100, %ecx 1259___ 1260$code.=<<___; 1261.Lmul_mont: 1262 push %rbp 1263.cfi_push %rbp 1264 push %rbx 1265.cfi_push %rbx 1266 push %r12 1267.cfi_push %r12 1268 push %r13 1269.cfi_push %r13 1270 push %r14 1271.cfi_push %r14 1272 push %r15 1273.cfi_push %r15 1274.Lmul_body: 1275___ 1276$code.=<<___ if ($addx); 1277 cmp \$0x80100, %ecx 1278 je .Lmul_montx 1279___ 1280$code.=<<___; 1281 mov $b_org, $b_ptr 1282 mov 8*0($b_org), %rax 1283 mov 8*0($a_ptr), $acc1 1284 mov 8*1($a_ptr), $acc2 1285 mov 8*2($a_ptr), $acc3 1286 mov 8*3($a_ptr), $acc4 1287 1288 call __ecp_nistz256_mul_montq 1289___ 1290$code.=<<___ if ($addx); 1291 jmp .Lmul_mont_done 1292 1293.align 32 1294.Lmul_montx: 1295 mov $b_org, $b_ptr 1296 mov 8*0($b_org), %rdx 1297 mov 8*0($a_ptr), $acc1 1298 mov 8*1($a_ptr), $acc2 1299 mov 8*2($a_ptr), $acc3 1300 mov 8*3($a_ptr), $acc4 1301 lea -128($a_ptr), $a_ptr # control u-op density 1302 1303 call __ecp_nistz256_mul_montx 1304___ 1305$code.=<<___; 1306.Lmul_mont_done: 1307 mov 0(%rsp),%r15 1308.cfi_restore %r15 1309 mov 8(%rsp),%r14 1310.cfi_restore %r14 1311 mov 16(%rsp),%r13 1312.cfi_restore %r13 1313 mov 24(%rsp),%r12 1314.cfi_restore %r12 1315 mov 32(%rsp),%rbx 1316.cfi_restore %rbx 1317 mov 40(%rsp),%rbp 1318.cfi_restore %rbp 1319 lea 48(%rsp),%rsp 1320.cfi_adjust_cfa_offset -48 1321.Lmul_epilogue: 1322 ret 1323.cfi_endproc 1324.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont 1325 1326.type __ecp_nistz256_mul_montq,\@abi-omnipotent 1327.align 32 1328__ecp_nistz256_mul_montq: 1329.cfi_startproc 1330 ######################################################################## 1331 # Multiply a by b[0] 1332 mov %rax, $t1 1333 mulq $acc1 1334 mov .Lpoly+8*1(%rip),$poly1 1335 mov %rax, $acc0 1336 mov $t1, %rax 1337 mov %rdx, $acc1 1338 1339 mulq $acc2 1340 mov .Lpoly+8*3(%rip),$poly3 1341 add %rax, $acc1 1342 mov $t1, %rax 1343 adc \$0, %rdx 1344 mov %rdx, $acc2 1345 1346 mulq $acc3 1347 add %rax, $acc2 1348 mov $t1, %rax 1349 adc \$0, %rdx 1350 mov %rdx, $acc3 1351 1352 mulq $acc4 1353 add %rax, $acc3 1354 mov $acc0, %rax 1355 adc \$0, %rdx 1356 xor $acc5, $acc5 1357 mov %rdx, $acc4 1358 1359 ######################################################################## 1360 # First reduction step 1361 # Basically now we want to multiply acc[0] by p256, 1362 # and add the result to the acc. 
1363 # Due to the special form of p256 we do some optimizations 1364 # 1365 # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] 1366 # then we add acc[0] and get acc[0] x 2^96 1367 1368 mov $acc0, $t1 1369 shl \$32, $acc0 1370 mulq $poly3 1371 shr \$32, $t1 1372 add $acc0, $acc1 # +=acc[0]<<96 1373 adc $t1, $acc2 1374 adc %rax, $acc3 1375 mov 8*1($b_ptr), %rax 1376 adc %rdx, $acc4 1377 adc \$0, $acc5 1378 xor $acc0, $acc0 1379 1380 ######################################################################## 1381 # Multiply by b[1] 1382 mov %rax, $t1 1383 mulq 8*0($a_ptr) 1384 add %rax, $acc1 1385 mov $t1, %rax 1386 adc \$0, %rdx 1387 mov %rdx, $t0 1388 1389 mulq 8*1($a_ptr) 1390 add $t0, $acc2 1391 adc \$0, %rdx 1392 add %rax, $acc2 1393 mov $t1, %rax 1394 adc \$0, %rdx 1395 mov %rdx, $t0 1396 1397 mulq 8*2($a_ptr) 1398 add $t0, $acc3 1399 adc \$0, %rdx 1400 add %rax, $acc3 1401 mov $t1, %rax 1402 adc \$0, %rdx 1403 mov %rdx, $t0 1404 1405 mulq 8*3($a_ptr) 1406 add $t0, $acc4 1407 adc \$0, %rdx 1408 add %rax, $acc4 1409 mov $acc1, %rax 1410 adc %rdx, $acc5 1411 adc \$0, $acc0 1412 1413 ######################################################################## 1414 # Second reduction step 1415 mov $acc1, $t1 1416 shl \$32, $acc1 1417 mulq $poly3 1418 shr \$32, $t1 1419 add $acc1, $acc2 1420 adc $t1, $acc3 1421 adc %rax, $acc4 1422 mov 8*2($b_ptr), %rax 1423 adc %rdx, $acc5 1424 adc \$0, $acc0 1425 xor $acc1, $acc1 1426 1427 ######################################################################## 1428 # Multiply by b[2] 1429 mov %rax, $t1 1430 mulq 8*0($a_ptr) 1431 add %rax, $acc2 1432 mov $t1, %rax 1433 adc \$0, %rdx 1434 mov %rdx, $t0 1435 1436 mulq 8*1($a_ptr) 1437 add $t0, $acc3 1438 adc \$0, %rdx 1439 add %rax, $acc3 1440 mov $t1, %rax 1441 adc \$0, %rdx 1442 mov %rdx, $t0 1443 1444 mulq 8*2($a_ptr) 1445 add $t0, $acc4 1446 adc \$0, %rdx 1447 add %rax, $acc4 1448 mov $t1, %rax 1449 adc \$0, %rdx 1450 mov %rdx, $t0 1451 1452 mulq 8*3($a_ptr) 1453 add $t0, $acc5 1454 adc \$0, %rdx 1455 add %rax, $acc5 1456 mov $acc2, %rax 1457 adc %rdx, $acc0 1458 adc \$0, $acc1 1459 1460 ######################################################################## 1461 # Third reduction step 1462 mov $acc2, $t1 1463 shl \$32, $acc2 1464 mulq $poly3 1465 shr \$32, $t1 1466 add $acc2, $acc3 1467 adc $t1, $acc4 1468 adc %rax, $acc5 1469 mov 8*3($b_ptr), %rax 1470 adc %rdx, $acc0 1471 adc \$0, $acc1 1472 xor $acc2, $acc2 1473 1474 ######################################################################## 1475 # Multiply by b[3] 1476 mov %rax, $t1 1477 mulq 8*0($a_ptr) 1478 add %rax, $acc3 1479 mov $t1, %rax 1480 adc \$0, %rdx 1481 mov %rdx, $t0 1482 1483 mulq 8*1($a_ptr) 1484 add $t0, $acc4 1485 adc \$0, %rdx 1486 add %rax, $acc4 1487 mov $t1, %rax 1488 adc \$0, %rdx 1489 mov %rdx, $t0 1490 1491 mulq 8*2($a_ptr) 1492 add $t0, $acc5 1493 adc \$0, %rdx 1494 add %rax, $acc5 1495 mov $t1, %rax 1496 adc \$0, %rdx 1497 mov %rdx, $t0 1498 1499 mulq 8*3($a_ptr) 1500 add $t0, $acc0 1501 adc \$0, %rdx 1502 add %rax, $acc0 1503 mov $acc3, %rax 1504 adc %rdx, $acc1 1505 adc \$0, $acc2 1506 1507 ######################################################################## 1508 # Final reduction step 1509 mov $acc3, $t1 1510 shl \$32, $acc3 1511 mulq $poly3 1512 shr \$32, $t1 1513 add $acc3, $acc4 1514 adc $t1, $acc5 1515 mov $acc4, $t0 1516 adc %rax, $acc0 1517 adc %rdx, $acc1 1518 mov $acc5, $t1 1519 adc \$0, $acc2 1520 1521 ######################################################################## 1522 # Branch-less conditional subtraction of 
P 1523 sub \$-1, $acc4 # .Lpoly[0] 1524 mov $acc0, $t2 1525 sbb $poly1, $acc5 # .Lpoly[1] 1526 sbb \$0, $acc0 # .Lpoly[2] 1527 mov $acc1, $t3 1528 sbb $poly3, $acc1 # .Lpoly[3] 1529 sbb \$0, $acc2 1530 1531 cmovc $t0, $acc4 1532 cmovc $t1, $acc5 1533 mov $acc4, 8*0($r_ptr) 1534 cmovc $t2, $acc0 1535 mov $acc5, 8*1($r_ptr) 1536 cmovc $t3, $acc1 1537 mov $acc0, 8*2($r_ptr) 1538 mov $acc1, 8*3($r_ptr) 1539 1540 ret 1541.cfi_endproc 1542.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq 1543 1544################################################################################ 1545# void ecp_nistz256_sqr_mont( 1546# uint64_t res[4], 1547# uint64_t a[4]); 1548 1549# we optimize the square according to S.Gueron and V.Krasnov, 1550# "Speeding up Big-Number Squaring" 1551.globl ecp_nistz256_sqr_mont 1552.type ecp_nistz256_sqr_mont,\@function,2 1553.align 32 1554ecp_nistz256_sqr_mont: 1555.cfi_startproc 1556 _CET_ENDBR 1557___ 1558$code.=<<___ if ($addx); 1559 leaq OPENSSL_ia32cap_P(%rip), %rcx 1560 mov 8(%rcx), %rcx 1561 and \$0x80100, %ecx 1562___ 1563$code.=<<___; 1564 push %rbp 1565.cfi_push %rbp 1566 push %rbx 1567.cfi_push %rbx 1568 push %r12 1569.cfi_push %r12 1570 push %r13 1571.cfi_push %r13 1572 push %r14 1573.cfi_push %r14 1574 push %r15 1575.cfi_push %r15 1576.Lsqr_body: 1577___ 1578$code.=<<___ if ($addx); 1579 cmp \$0x80100, %ecx 1580 je .Lsqr_montx 1581___ 1582$code.=<<___; 1583 mov 8*0($a_ptr), %rax 1584 mov 8*1($a_ptr), $acc6 1585 mov 8*2($a_ptr), $acc7 1586 mov 8*3($a_ptr), $acc0 1587 1588 call __ecp_nistz256_sqr_montq 1589___ 1590$code.=<<___ if ($addx); 1591 jmp .Lsqr_mont_done 1592 1593.align 32 1594.Lsqr_montx: 1595 mov 8*0($a_ptr), %rdx 1596 mov 8*1($a_ptr), $acc6 1597 mov 8*2($a_ptr), $acc7 1598 mov 8*3($a_ptr), $acc0 1599 lea -128($a_ptr), $a_ptr # control u-op density 1600 1601 call __ecp_nistz256_sqr_montx 1602___ 1603$code.=<<___; 1604.Lsqr_mont_done: 1605 mov 0(%rsp),%r15 1606.cfi_restore %r15 1607 mov 8(%rsp),%r14 1608.cfi_restore %r14 1609 mov 16(%rsp),%r13 1610.cfi_restore %r13 1611 mov 24(%rsp),%r12 1612.cfi_restore %r12 1613 mov 32(%rsp),%rbx 1614.cfi_restore %rbx 1615 mov 40(%rsp),%rbp 1616.cfi_restore %rbp 1617 lea 48(%rsp),%rsp 1618.cfi_adjust_cfa_offset -48 1619.Lsqr_epilogue: 1620 ret 1621.cfi_endproc 1622.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont 1623 1624.type __ecp_nistz256_sqr_montq,\@abi-omnipotent 1625.align 32 1626__ecp_nistz256_sqr_montq: 1627.cfi_startproc 1628 mov %rax, $acc5 1629 mulq $acc6 # a[1]*a[0] 1630 mov %rax, $acc1 1631 mov $acc7, %rax 1632 mov %rdx, $acc2 1633 1634 mulq $acc5 # a[0]*a[2] 1635 add %rax, $acc2 1636 mov $acc0, %rax 1637 adc \$0, %rdx 1638 mov %rdx, $acc3 1639 1640 mulq $acc5 # a[0]*a[3] 1641 add %rax, $acc3 1642 mov $acc7, %rax 1643 adc \$0, %rdx 1644 mov %rdx, $acc4 1645 1646 ################################# 1647 mulq $acc6 # a[1]*a[2] 1648 add %rax, $acc3 1649 mov $acc0, %rax 1650 adc \$0, %rdx 1651 mov %rdx, $t1 1652 1653 mulq $acc6 # a[1]*a[3] 1654 add %rax, $acc4 1655 mov $acc0, %rax 1656 adc \$0, %rdx 1657 add $t1, $acc4 1658 mov %rdx, $acc5 1659 adc \$0, $acc5 1660 1661 ################################# 1662 mulq $acc7 # a[2]*a[3] 1663 xor $acc7, $acc7 1664 add %rax, $acc5 1665 mov 8*0($a_ptr), %rax 1666 mov %rdx, $acc6 1667 adc \$0, $acc6 1668 1669 add $acc1, $acc1 # acc1:6<<1 1670 adc $acc2, $acc2 1671 adc $acc3, $acc3 1672 adc $acc4, $acc4 1673 adc $acc5, $acc5 1674 adc $acc6, $acc6 1675 adc \$0, $acc7 1676 1677 mulq %rax 1678 mov %rax, $acc0 1679 mov 8*1($a_ptr), %rax 1680 mov %rdx, $t0 1681 1682 mulq 
%rax 1683 add $t0, $acc1 1684 adc %rax, $acc2 1685 mov 8*2($a_ptr), %rax 1686 adc \$0, %rdx 1687 mov %rdx, $t0 1688 1689 mulq %rax 1690 add $t0, $acc3 1691 adc %rax, $acc4 1692 mov 8*3($a_ptr), %rax 1693 adc \$0, %rdx 1694 mov %rdx, $t0 1695 1696 mulq %rax 1697 add $t0, $acc5 1698 adc %rax, $acc6 1699 mov $acc0, %rax 1700 adc %rdx, $acc7 1701 1702 mov .Lpoly+8*1(%rip), $a_ptr 1703 mov .Lpoly+8*3(%rip), $t1 1704 1705 ########################################## 1706 # Now the reduction 1707 # First iteration 1708 mov $acc0, $t0 1709 shl \$32, $acc0 1710 mulq $t1 1711 shr \$32, $t0 1712 add $acc0, $acc1 # +=acc[0]<<96 1713 adc $t0, $acc2 1714 adc %rax, $acc3 1715 mov $acc1, %rax 1716 adc \$0, %rdx 1717 1718 ########################################## 1719 # Second iteration 1720 mov $acc1, $t0 1721 shl \$32, $acc1 1722 mov %rdx, $acc0 1723 mulq $t1 1724 shr \$32, $t0 1725 add $acc1, $acc2 1726 adc $t0, $acc3 1727 adc %rax, $acc0 1728 mov $acc2, %rax 1729 adc \$0, %rdx 1730 1731 ########################################## 1732 # Third iteration 1733 mov $acc2, $t0 1734 shl \$32, $acc2 1735 mov %rdx, $acc1 1736 mulq $t1 1737 shr \$32, $t0 1738 add $acc2, $acc3 1739 adc $t0, $acc0 1740 adc %rax, $acc1 1741 mov $acc3, %rax 1742 adc \$0, %rdx 1743 1744 ########################################### 1745 # Last iteration 1746 mov $acc3, $t0 1747 shl \$32, $acc3 1748 mov %rdx, $acc2 1749 mulq $t1 1750 shr \$32, $t0 1751 add $acc3, $acc0 1752 adc $t0, $acc1 1753 adc %rax, $acc2 1754 adc \$0, %rdx 1755 xor $acc3, $acc3 1756 1757 ############################################ 1758 # Add the rest of the acc 1759 add $acc0, $acc4 1760 adc $acc1, $acc5 1761 mov $acc4, $acc0 1762 adc $acc2, $acc6 1763 adc %rdx, $acc7 1764 mov $acc5, $acc1 1765 adc \$0, $acc3 1766 1767 sub \$-1, $acc4 # .Lpoly[0] 1768 mov $acc6, $acc2 1769 sbb $a_ptr, $acc5 # .Lpoly[1] 1770 sbb \$0, $acc6 # .Lpoly[2] 1771 mov $acc7, $t0 1772 sbb $t1, $acc7 # .Lpoly[3] 1773 sbb \$0, $acc3 1774 1775 cmovc $acc0, $acc4 1776 cmovc $acc1, $acc5 1777 mov $acc4, 8*0($r_ptr) 1778 cmovc $acc2, $acc6 1779 mov $acc5, 8*1($r_ptr) 1780 cmovc $t0, $acc7 1781 mov $acc6, 8*2($r_ptr) 1782 mov $acc7, 8*3($r_ptr) 1783 1784 ret 1785.cfi_endproc 1786.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq 1787___ 1788 1789if ($addx) { 1790$code.=<<___; 1791.type __ecp_nistz256_mul_montx,\@abi-omnipotent 1792.align 32 1793__ecp_nistz256_mul_montx: 1794.cfi_startproc 1795 ######################################################################## 1796 # Multiply by b[0] 1797 mulx $acc1, $acc0, $acc1 1798 mulx $acc2, $t0, $acc2 1799 mov \$32, $poly1 1800 xor $acc5, $acc5 # cf=0 1801 mulx $acc3, $t1, $acc3 1802 mov .Lpoly+8*3(%rip), $poly3 1803 adc $t0, $acc1 1804 mulx $acc4, $t0, $acc4 1805 mov $acc0, %rdx 1806 adc $t1, $acc2 1807 shlx $poly1,$acc0,$t1 1808 adc $t0, $acc3 1809 shrx $poly1,$acc0,$t0 1810 adc \$0, $acc4 1811 1812 ######################################################################## 1813 # First reduction step 1814 add $t1, $acc1 1815 adc $t0, $acc2 1816 1817 mulx $poly3, $t0, $t1 1818 mov 8*1($b_ptr), %rdx 1819 adc $t0, $acc3 1820 adc $t1, $acc4 1821 adc \$0, $acc5 1822 xor $acc0, $acc0 # $acc0=0,cf=0,of=0 1823 1824 ######################################################################## 1825 # Multiply by b[1] 1826 mulx 8*0+128($a_ptr), $t0, $t1 1827 adcx $t0, $acc1 1828 adox $t1, $acc2 1829 1830 mulx 8*1+128($a_ptr), $t0, $t1 1831 adcx $t0, $acc2 1832 adox $t1, $acc3 1833 1834 mulx 8*2+128($a_ptr), $t0, $t1 1835 adcx $t0, $acc3 1836 adox $t1, $acc4 1837 
1838 mulx 8*3+128($a_ptr), $t0, $t1 1839 mov $acc1, %rdx 1840 adcx $t0, $acc4 1841 shlx $poly1, $acc1, $t0 1842 adox $t1, $acc5 1843 shrx $poly1, $acc1, $t1 1844 1845 adcx $acc0, $acc5 1846 adox $acc0, $acc0 1847 adc \$0, $acc0 1848 1849 ######################################################################## 1850 # Second reduction step 1851 add $t0, $acc2 1852 adc $t1, $acc3 1853 1854 mulx $poly3, $t0, $t1 1855 mov 8*2($b_ptr), %rdx 1856 adc $t0, $acc4 1857 adc $t1, $acc5 1858 adc \$0, $acc0 1859 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 1860 1861 ######################################################################## 1862 # Multiply by b[2] 1863 mulx 8*0+128($a_ptr), $t0, $t1 1864 adcx $t0, $acc2 1865 adox $t1, $acc3 1866 1867 mulx 8*1+128($a_ptr), $t0, $t1 1868 adcx $t0, $acc3 1869 adox $t1, $acc4 1870 1871 mulx 8*2+128($a_ptr), $t0, $t1 1872 adcx $t0, $acc4 1873 adox $t1, $acc5 1874 1875 mulx 8*3+128($a_ptr), $t0, $t1 1876 mov $acc2, %rdx 1877 adcx $t0, $acc5 1878 shlx $poly1, $acc2, $t0 1879 adox $t1, $acc0 1880 shrx $poly1, $acc2, $t1 1881 1882 adcx $acc1, $acc0 1883 adox $acc1, $acc1 1884 adc \$0, $acc1 1885 1886 ######################################################################## 1887 # Third reduction step 1888 add $t0, $acc3 1889 adc $t1, $acc4 1890 1891 mulx $poly3, $t0, $t1 1892 mov 8*3($b_ptr), %rdx 1893 adc $t0, $acc5 1894 adc $t1, $acc0 1895 adc \$0, $acc1 1896 xor $acc2, $acc2 # $acc2=0,cf=0,of=0 1897 1898 ######################################################################## 1899 # Multiply by b[3] 1900 mulx 8*0+128($a_ptr), $t0, $t1 1901 adcx $t0, $acc3 1902 adox $t1, $acc4 1903 1904 mulx 8*1+128($a_ptr), $t0, $t1 1905 adcx $t0, $acc4 1906 adox $t1, $acc5 1907 1908 mulx 8*2+128($a_ptr), $t0, $t1 1909 adcx $t0, $acc5 1910 adox $t1, $acc0 1911 1912 mulx 8*3+128($a_ptr), $t0, $t1 1913 mov $acc3, %rdx 1914 adcx $t0, $acc0 1915 shlx $poly1, $acc3, $t0 1916 adox $t1, $acc1 1917 shrx $poly1, $acc3, $t1 1918 1919 adcx $acc2, $acc1 1920 adox $acc2, $acc2 1921 adc \$0, $acc2 1922 1923 ######################################################################## 1924 # Fourth reduction step 1925 add $t0, $acc4 1926 adc $t1, $acc5 1927 1928 mulx $poly3, $t0, $t1 1929 mov $acc4, $t2 1930 mov .Lpoly+8*1(%rip), $poly1 1931 adc $t0, $acc0 1932 mov $acc5, $t3 1933 adc $t1, $acc1 1934 adc \$0, $acc2 1935 1936 ######################################################################## 1937 # Branch-less conditional subtraction of P 1938 xor %eax, %eax 1939 mov $acc0, $t0 1940 sbb \$-1, $acc4 # .Lpoly[0] 1941 sbb $poly1, $acc5 # .Lpoly[1] 1942 sbb \$0, $acc0 # .Lpoly[2] 1943 mov $acc1, $t1 1944 sbb $poly3, $acc1 # .Lpoly[3] 1945 sbb \$0, $acc2 1946 1947 cmovc $t2, $acc4 1948 cmovc $t3, $acc5 1949 mov $acc4, 8*0($r_ptr) 1950 cmovc $t0, $acc0 1951 mov $acc5, 8*1($r_ptr) 1952 cmovc $t1, $acc1 1953 mov $acc0, 8*2($r_ptr) 1954 mov $acc1, 8*3($r_ptr) 1955 1956 ret 1957.cfi_endproc 1958.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx 1959 1960.type __ecp_nistz256_sqr_montx,\@abi-omnipotent 1961.align 32 1962__ecp_nistz256_sqr_montx: 1963.cfi_startproc 1964 mulx $acc6, $acc1, $acc2 # a[0]*a[1] 1965 mulx $acc7, $t0, $acc3 # a[0]*a[2] 1966 xor %eax, %eax 1967 adc $t0, $acc2 1968 mulx $acc0, $t1, $acc4 # a[0]*a[3] 1969 mov $acc6, %rdx 1970 adc $t1, $acc3 1971 adc \$0, $acc4 1972 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 1973 1974 ################################# 1975 mulx $acc7, $t0, $t1 # a[1]*a[2] 1976 adcx $t0, $acc3 1977 adox $t1, $acc4 1978 1979 mulx $acc0, $t0, $t1 # a[1]*a[3] 1980 mov $acc7, 
%rdx 1981 adcx $t0, $acc4 1982 adox $t1, $acc5 1983 adc \$0, $acc5 1984 1985 ################################# 1986 mulx $acc0, $t0, $acc6 # a[2]*a[3] 1987 mov 8*0+128($a_ptr), %rdx 1988 xor $acc7, $acc7 # $acc7=0,cf=0,of=0 1989 adcx $acc1, $acc1 # acc1:6<<1 1990 adox $t0, $acc5 1991 adcx $acc2, $acc2 1992 adox $acc7, $acc6 # of=0 1993 1994 mulx %rdx, $acc0, $t1 1995 mov 8*1+128($a_ptr), %rdx 1996 adcx $acc3, $acc3 1997 adox $t1, $acc1 1998 adcx $acc4, $acc4 1999 mulx %rdx, $t0, $t4 2000 mov 8*2+128($a_ptr), %rdx 2001 adcx $acc5, $acc5 2002 adox $t0, $acc2 2003 adcx $acc6, $acc6 2004 .byte 0x67 2005 mulx %rdx, $t0, $t1 2006 mov 8*3+128($a_ptr), %rdx 2007 adox $t4, $acc3 2008 adcx $acc7, $acc7 2009 adox $t0, $acc4 2010 mov \$32, $a_ptr 2011 adox $t1, $acc5 2012 .byte 0x67,0x67 2013 mulx %rdx, $t0, $t4 2014 mov .Lpoly+8*3(%rip), %rdx 2015 adox $t0, $acc6 2016 shlx $a_ptr, $acc0, $t0 2017 adox $t4, $acc7 2018 shrx $a_ptr, $acc0, $t4 2019 mov %rdx,$t1 2020 2021 # reduction step 1 2022 add $t0, $acc1 2023 adc $t4, $acc2 2024 2025 mulx $acc0, $t0, $acc0 2026 adc $t0, $acc3 2027 shlx $a_ptr, $acc1, $t0 2028 adc \$0, $acc0 2029 shrx $a_ptr, $acc1, $t4 2030 2031 # reduction step 2 2032 add $t0, $acc2 2033 adc $t4, $acc3 2034 2035 mulx $acc1, $t0, $acc1 2036 adc $t0, $acc0 2037 shlx $a_ptr, $acc2, $t0 2038 adc \$0, $acc1 2039 shrx $a_ptr, $acc2, $t4 2040 2041 # reduction step 3 2042 add $t0, $acc3 2043 adc $t4, $acc0 2044 2045 mulx $acc2, $t0, $acc2 2046 adc $t0, $acc1 2047 shlx $a_ptr, $acc3, $t0 2048 adc \$0, $acc2 2049 shrx $a_ptr, $acc3, $t4 2050 2051 # reduction step 4 2052 add $t0, $acc0 2053 adc $t4, $acc1 2054 2055 mulx $acc3, $t0, $acc3 2056 adc $t0, $acc2 2057 adc \$0, $acc3 2058 2059 xor $t3, $t3 2060 add $acc0, $acc4 # accumulate upper half 2061 mov .Lpoly+8*1(%rip), $a_ptr 2062 adc $acc1, $acc5 2063 mov $acc4, $acc0 2064 adc $acc2, $acc6 2065 adc $acc3, $acc7 2066 mov $acc5, $acc1 2067 adc \$0, $t3 2068 2069 sub \$-1, $acc4 # .Lpoly[0] 2070 mov $acc6, $acc2 2071 sbb $a_ptr, $acc5 # .Lpoly[1] 2072 sbb \$0, $acc6 # .Lpoly[2] 2073 mov $acc7, $acc3 2074 sbb $t1, $acc7 # .Lpoly[3] 2075 sbb \$0, $t3 2076 2077 cmovc $acc0, $acc4 2078 cmovc $acc1, $acc5 2079 mov $acc4, 8*0($r_ptr) 2080 cmovc $acc2, $acc6 2081 mov $acc5, 8*1($r_ptr) 2082 cmovc $acc3, $acc7 2083 mov $acc6, 8*2($r_ptr) 2084 mov $acc7, 8*3($r_ptr) 2085 2086 ret 2087.cfi_endproc 2088.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx 2089___ 2090} 2091} 2092{ 2093my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2094my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); 2095my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); 2096my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); 2097 2098$code.=<<___; 2099################################################################################ 2100# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); 2101.globl ecp_nistz256_select_w5 2102.type ecp_nistz256_select_w5,\@abi-omnipotent 2103.align 32 2104ecp_nistz256_select_w5: 2105.cfi_startproc 2106 _CET_ENDBR 2107___ 2108$code.=<<___ if ($avx>1); 2109 leaq OPENSSL_ia32cap_P(%rip), %rax 2110 mov 8(%rax), %rax 2111 test \$`1<<5`, %eax 2112 jnz .Lavx2_select_w5 2113___ 2114$code.=<<___ if ($win64); 2115 lea -0x88(%rsp), %rax 2116.LSEH_begin_ecp_nistz256_select_w5: 2117 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 2118 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 2119 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 2120 .byte 
0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) 2121 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) 2122 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 2123 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 2124 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 2125 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 2126 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 2127 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 2128___ 2129$code.=<<___; 2130 movdqa .LOne(%rip), $ONE 2131 movd $index, $INDEX 2132 2133 pxor $Ra, $Ra 2134 pxor $Rb, $Rb 2135 pxor $Rc, $Rc 2136 pxor $Rd, $Rd 2137 pxor $Re, $Re 2138 pxor $Rf, $Rf 2139 2140 movdqa $ONE, $M0 2141 pshufd \$0, $INDEX, $INDEX 2142 2143 mov \$16, %rax 2144.Lselect_loop_sse_w5: 2145 2146 movdqa $M0, $TMP0 2147 paddd $ONE, $M0 2148 pcmpeqd $INDEX, $TMP0 2149 2150 movdqa 16*0($in_t), $T0a 2151 movdqa 16*1($in_t), $T0b 2152 movdqa 16*2($in_t), $T0c 2153 movdqa 16*3($in_t), $T0d 2154 movdqa 16*4($in_t), $T0e 2155 movdqa 16*5($in_t), $T0f 2156 lea 16*6($in_t), $in_t 2157 2158 pand $TMP0, $T0a 2159 pand $TMP0, $T0b 2160 por $T0a, $Ra 2161 pand $TMP0, $T0c 2162 por $T0b, $Rb 2163 pand $TMP0, $T0d 2164 por $T0c, $Rc 2165 pand $TMP0, $T0e 2166 por $T0d, $Rd 2167 pand $TMP0, $T0f 2168 por $T0e, $Re 2169 por $T0f, $Rf 2170 2171 dec %rax 2172 jnz .Lselect_loop_sse_w5 2173 2174 movdqu $Ra, 16*0($val) 2175 movdqu $Rb, 16*1($val) 2176 movdqu $Rc, 16*2($val) 2177 movdqu $Rd, 16*3($val) 2178 movdqu $Re, 16*4($val) 2179 movdqu $Rf, 16*5($val) 2180___ 2181$code.=<<___ if ($win64); 2182 movaps (%rsp), %xmm6 2183 movaps 0x10(%rsp), %xmm7 2184 movaps 0x20(%rsp), %xmm8 2185 movaps 0x30(%rsp), %xmm9 2186 movaps 0x40(%rsp), %xmm10 2187 movaps 0x50(%rsp), %xmm11 2188 movaps 0x60(%rsp), %xmm12 2189 movaps 0x70(%rsp), %xmm13 2190 movaps 0x80(%rsp), %xmm14 2191 movaps 0x90(%rsp), %xmm15 2192 lea 0xa8(%rsp), %rsp 2193___ 2194$code.=<<___; 2195 ret 2196.cfi_endproc 2197.LSEH_end_ecp_nistz256_select_w5: 2198.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 2199 2200################################################################################ 2201# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); 2202.globl ecp_nistz256_select_w7 2203.type ecp_nistz256_select_w7,\@abi-omnipotent 2204.align 32 2205ecp_nistz256_select_w7: 2206.cfi_startproc 2207 _CET_ENDBR 2208___ 2209$code.=<<___ if ($avx>1); 2210 leaq OPENSSL_ia32cap_P(%rip), %rax 2211 mov 8(%rax), %rax 2212 test \$`1<<5`, %eax 2213 jnz .Lavx2_select_w7 2214___ 2215$code.=<<___ if ($win64); 2216 lea -0x88(%rsp), %rax 2217.LSEH_begin_ecp_nistz256_select_w7: 2218 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 2219 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 2220 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 2221 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) 2222 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) 2223 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 2224 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 2225 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 2226 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 2227 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 2228 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 2229___ 2230$code.=<<___; 2231 movdqa .LOne(%rip), $M0 2232 movd $index, $INDEX 2233 2234 pxor $Ra, $Ra 2235 pxor $Rb, $Rb 2236 pxor $Rc, $Rc 2237 pxor $Rd, $Rd 2238 2239 movdqa $M0, $ONE 2240 pshufd \$0, $INDEX, $INDEX 2241 mov \$64, 
%rax 2242 2243.Lselect_loop_sse_w7: 2244 movdqa $M0, $TMP0 2245 paddd $ONE, $M0 2246 movdqa 16*0($in_t), $T0a 2247 movdqa 16*1($in_t), $T0b 2248 pcmpeqd $INDEX, $TMP0 2249 movdqa 16*2($in_t), $T0c 2250 movdqa 16*3($in_t), $T0d 2251 lea 16*4($in_t), $in_t 2252 2253 pand $TMP0, $T0a 2254 pand $TMP0, $T0b 2255 por $T0a, $Ra 2256 pand $TMP0, $T0c 2257 por $T0b, $Rb 2258 pand $TMP0, $T0d 2259 por $T0c, $Rc 2260 prefetcht0 255($in_t) 2261 por $T0d, $Rd 2262 2263 dec %rax 2264 jnz .Lselect_loop_sse_w7 2265 2266 movdqu $Ra, 16*0($val) 2267 movdqu $Rb, 16*1($val) 2268 movdqu $Rc, 16*2($val) 2269 movdqu $Rd, 16*3($val) 2270___ 2271$code.=<<___ if ($win64); 2272 movaps (%rsp), %xmm6 2273 movaps 0x10(%rsp), %xmm7 2274 movaps 0x20(%rsp), %xmm8 2275 movaps 0x30(%rsp), %xmm9 2276 movaps 0x40(%rsp), %xmm10 2277 movaps 0x50(%rsp), %xmm11 2278 movaps 0x60(%rsp), %xmm12 2279 movaps 0x70(%rsp), %xmm13 2280 movaps 0x80(%rsp), %xmm14 2281 movaps 0x90(%rsp), %xmm15 2282 lea 0xa8(%rsp), %rsp 2283___ 2284$code.=<<___; 2285 ret 2286.cfi_endproc 2287.LSEH_end_ecp_nistz256_select_w7: 2288.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 2289___ 2290} 2291if ($avx>1) { 2292my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2293my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); 2294my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); 2295my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); 2296 2297$code.=<<___; 2298################################################################################ 2299# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index); 2300.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent 2301.align 32 2302ecp_nistz256_avx2_select_w5: 2303.cfi_startproc 2304.Lavx2_select_w5: 2305 vzeroupper 2306___ 2307$code.=<<___ if ($win64); 2308 lea -0x88(%rsp), %rax 2309 mov %rsp,%r11 2310.LSEH_begin_ecp_nistz256_avx2_select_w5: 2311 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp 2312 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) 2313 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) 2314 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) 2315 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) 2316 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) 2317 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) 2318 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) 2319 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) 2320 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) 2321 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) 2322___ 2323$code.=<<___; 2324 vmovdqa .LTwo(%rip), $TWO 2325 2326 vpxor $Ra, $Ra, $Ra 2327 vpxor $Rb, $Rb, $Rb 2328 vpxor $Rc, $Rc, $Rc 2329 2330 vmovdqa .LOne(%rip), $M0 2331 vmovdqa .LTwo(%rip), $M1 2332 2333 vmovd $index, %xmm1 2334 vpermd $INDEX, $Ra, $INDEX 2335 2336 mov \$8, %rax 2337.Lselect_loop_avx2_w5: 2338 2339 vmovdqa 32*0($in_t), $T0a 2340 vmovdqa 32*1($in_t), $T0b 2341 vmovdqa 32*2($in_t), $T0c 2342 2343 vmovdqa 32*3($in_t), $T1a 2344 vmovdqa 32*4($in_t), $T1b 2345 vmovdqa 32*5($in_t), $T1c 2346 2347 vpcmpeqd $INDEX, $M0, $TMP0 2348 vpcmpeqd $INDEX, $M1, $TMP1 2349 2350 vpaddd $TWO, $M0, $M0 2351 vpaddd $TWO, $M1, $M1 2352 lea 32*6($in_t), $in_t 2353 2354 vpand $TMP0, $T0a, $T0a 2355 vpand $TMP0, $T0b, $T0b 2356 vpand $TMP0, $T0c, $T0c 2357 vpand $TMP1, $T1a, $T1a 2358 vpand $TMP1, $T1b, $T1b 2359 vpand $TMP1, $T1c, $T1c 2360 2361 vpxor $T0a, $Ra, $Ra 2362 vpxor $T0b, $Rb, $Rb 2363 vpxor $T0c, $Rc, $Rc 2364 vpxor $T1a, $Ra, $Ra 2365 
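	# constant-time gather: every table entry is read, and at most one of
	# the two compare masks is all-ones per iteration, so xor-accumulating
	# the masked rows selects entry "index" without a secret-dependent
	# memory address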
vpxor $T1b, $Rb, $Rb 2366 vpxor $T1c, $Rc, $Rc 2367 2368 dec %rax 2369 jnz .Lselect_loop_avx2_w5 2370 2371 vmovdqu $Ra, 32*0($val) 2372 vmovdqu $Rb, 32*1($val) 2373 vmovdqu $Rc, 32*2($val) 2374 vzeroupper 2375___ 2376$code.=<<___ if ($win64); 2377 movaps (%rsp), %xmm6 2378 movaps 0x10(%rsp), %xmm7 2379 movaps 0x20(%rsp), %xmm8 2380 movaps 0x30(%rsp), %xmm9 2381 movaps 0x40(%rsp), %xmm10 2382 movaps 0x50(%rsp), %xmm11 2383 movaps 0x60(%rsp), %xmm12 2384 movaps 0x70(%rsp), %xmm13 2385 movaps 0x80(%rsp), %xmm14 2386 movaps 0x90(%rsp), %xmm15 2387 lea (%r11), %rsp 2388___ 2389$code.=<<___; 2390 ret 2391.cfi_endproc 2392.LSEH_end_ecp_nistz256_avx2_select_w5: 2393.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 2394___ 2395} 2396if ($avx>1) { 2397my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2398my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); 2399my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); 2400my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); 2401my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); 2402 2403$code.=<<___; 2404 2405################################################################################ 2406# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index); 2407.globl ecp_nistz256_avx2_select_w7 2408.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent 2409.align 32 2410ecp_nistz256_avx2_select_w7: 2411.cfi_startproc 2412.Lavx2_select_w7: 2413 _CET_ENDBR 2414 vzeroupper 2415___ 2416$code.=<<___ if ($win64); 2417 mov %rsp,%r11 2418 lea -0x88(%rsp), %rax 2419.LSEH_begin_ecp_nistz256_avx2_select_w7: 2420 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp 2421 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) 2422 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) 2423 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) 2424 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) 2425 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) 2426 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) 2427 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) 2428 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) 2429 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) 2430 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) 2431___ 2432$code.=<<___; 2433 vmovdqa .LThree(%rip), $THREE 2434 2435 vpxor $Ra, $Ra, $Ra 2436 vpxor $Rb, $Rb, $Rb 2437 2438 vmovdqa .LOne(%rip), $M0 2439 vmovdqa .LTwo(%rip), $M1 2440 vmovdqa .LThree(%rip), $M2 2441 2442 vmovd $index, %xmm1 2443 vpermd $INDEX, $Ra, $INDEX 2444 # Skip index = 0, because it is implicitly the point at infinity 2445 2446 mov \$21, %rax 2447.Lselect_loop_avx2_w7: 2448 2449 vmovdqa 32*0($in_t), $T0a 2450 vmovdqa 32*1($in_t), $T0b 2451 2452 vmovdqa 32*2($in_t), $T1a 2453 vmovdqa 32*3($in_t), $T1b 2454 2455 vmovdqa 32*4($in_t), $T2a 2456 vmovdqa 32*5($in_t), $T2b 2457 2458 vpcmpeqd $INDEX, $M0, $TMP0 2459 vpcmpeqd $INDEX, $M1, $TMP1 2460 vpcmpeqd $INDEX, $M2, $TMP2 2461 2462 vpaddd $THREE, $M0, $M0 2463 vpaddd $THREE, $M1, $M1 2464 vpaddd $THREE, $M2, $M2 2465 lea 32*6($in_t), $in_t 2466 2467 vpand $TMP0, $T0a, $T0a 2468 vpand $TMP0, $T0b, $T0b 2469 vpand $TMP1, $T1a, $T1a 2470 vpand $TMP1, $T1b, $T1b 2471 vpand $TMP2, $T2a, $T2a 2472 vpand $TMP2, $T2b, $T2b 2473 2474 vpxor $T0a, $Ra, $Ra 2475 vpxor $T0b, $Rb, $Rb 2476 vpxor $T1a, $Ra, $Ra 2477 vpxor $T1b, $Rb, $Rb 2478 vpxor $T2a, $Ra, $Ra 2479 vpxor $T2b, $Rb, $Rb 2480 2481 dec %rax 2482 jnz .Lselect_loop_avx2_w7 2483 2484 2485 vmovdqa 32*0($in_t), $T0a 2486 
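	# the loop above covered table entries 1..63 (three per iteration);
	# this tail handles the 64th entry, for which the M0 counter now
	# holds 64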
	vmovdqa	32*1($in_t), $T0b

	vpcmpeqd	$INDEX, $M0, $TMP0

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb

	vmovdqu	$Ra, 32*0($val)
	vmovdqu	$Rb, 32*1($val)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	(%r11), %rsp
___
$code.=<<___;
	ret
.cfi_endproc
.LSEH_end_ecp_nistz256_avx2_select_w7:
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
} else {
$code.=<<___;
.globl	ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,\@function,3
.align	32
ecp_nistz256_avx2_select_w7:
	_CET_ENDBR
	.byte	0x0f,0x0b	# ud2
	ret
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
}
{{{
########################################################################
# This block implements the higher-level point_double, point_add and
# point_add_affine. The key to performance here is to let the
# out-of-order execution logic overlap computations from the next step
# with tail processing from the current step. By using a tailored
# calling sequence we minimize inter-step overhead and give the
# processor a better shot at overlapping operations...
#
# You will notice that input data is copied to the stack. The trouble
# is that there are no registers to spare for holding the original
# pointers, and reloading those pointers would create undesired
# dependencies on the effective-address calculation paths. In other
# words, the copying too is done to favour the out-of-order execution
# logic.
# <appro@openssl.org>

my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
my ($poly1,$poly3)=($acc6,$acc7);

# load_for_mul/load_for_sqr return assembly snippets that preload an
# operand's limbs into registers and point $a_ptr/$b_ptr at the inputs
# expected by the __ecp_nistz256_mul_mont*/__ecp_nistz256_sqr_mont*
# subroutines; $bias accounts for the different addressing convention
# of the AD*X ("x"-suffixed) code path.
sub load_for_mul () {
my ($a,$b,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;

"	mov	$b, $src0
	lea	$b, $b_ptr
	mov	8*0+$a, $acc1
	mov	8*1+$a, $acc2
	lea	$bias+$a, $a_ptr
	mov	8*2+$a, $acc3
	mov	8*3+$a, $acc4"
}

sub load_for_sqr () {
my ($a,$src0) = @_;
my $bias = $src0 eq "%rax" ?
0 : -128; 2568 2569" mov 8*0+$a, $src0 2570 mov 8*1+$a, $acc6 2571 lea $bias+$a, $a_ptr 2572 mov 8*2+$a, $acc7 2573 mov 8*3+$a, $acc0" 2574} 2575 2576 { 2577######################################################################## 2578# operate in 4-5-0-1 "name space" that matches multiplication output 2579# 2580my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 2581 2582$code.=<<___; 2583.type __ecp_nistz256_add_toq,\@abi-omnipotent 2584.align 32 2585__ecp_nistz256_add_toq: 2586.cfi_startproc 2587 xor $t4,$t4 2588 add 8*0($b_ptr), $a0 2589 adc 8*1($b_ptr), $a1 2590 mov $a0, $t0 2591 adc 8*2($b_ptr), $a2 2592 adc 8*3($b_ptr), $a3 2593 mov $a1, $t1 2594 adc \$0, $t4 2595 2596 sub \$-1, $a0 2597 mov $a2, $t2 2598 sbb $poly1, $a1 2599 sbb \$0, $a2 2600 mov $a3, $t3 2601 sbb $poly3, $a3 2602 sbb \$0, $t4 2603 2604 cmovc $t0, $a0 2605 cmovc $t1, $a1 2606 mov $a0, 8*0($r_ptr) 2607 cmovc $t2, $a2 2608 mov $a1, 8*1($r_ptr) 2609 cmovc $t3, $a3 2610 mov $a2, 8*2($r_ptr) 2611 mov $a3, 8*3($r_ptr) 2612 2613 ret 2614.cfi_endproc 2615.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq 2616 2617.type __ecp_nistz256_sub_fromq,\@abi-omnipotent 2618.align 32 2619__ecp_nistz256_sub_fromq: 2620.cfi_startproc 2621 sub 8*0($b_ptr), $a0 2622 sbb 8*1($b_ptr), $a1 2623 mov $a0, $t0 2624 sbb 8*2($b_ptr), $a2 2625 sbb 8*3($b_ptr), $a3 2626 mov $a1, $t1 2627 sbb $t4, $t4 2628 2629 add \$-1, $a0 2630 mov $a2, $t2 2631 adc $poly1, $a1 2632 adc \$0, $a2 2633 mov $a3, $t3 2634 adc $poly3, $a3 2635 test $t4, $t4 2636 2637 cmovz $t0, $a0 2638 cmovz $t1, $a1 2639 mov $a0, 8*0($r_ptr) 2640 cmovz $t2, $a2 2641 mov $a1, 8*1($r_ptr) 2642 cmovz $t3, $a3 2643 mov $a2, 8*2($r_ptr) 2644 mov $a3, 8*3($r_ptr) 2645 2646 ret 2647.cfi_endproc 2648.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq 2649 2650.type __ecp_nistz256_subq,\@abi-omnipotent 2651.align 32 2652__ecp_nistz256_subq: 2653.cfi_startproc 2654 sub $a0, $t0 2655 sbb $a1, $t1 2656 mov $t0, $a0 2657 sbb $a2, $t2 2658 sbb $a3, $t3 2659 mov $t1, $a1 2660 sbb $t4, $t4 2661 2662 add \$-1, $t0 2663 mov $t2, $a2 2664 adc $poly1, $t1 2665 adc \$0, $t2 2666 mov $t3, $a3 2667 adc $poly3, $t3 2668 test $t4, $t4 2669 2670 cmovnz $t0, $a0 2671 cmovnz $t1, $a1 2672 cmovnz $t2, $a2 2673 cmovnz $t3, $a3 2674 2675 ret 2676.cfi_endproc 2677.size __ecp_nistz256_subq,.-__ecp_nistz256_subq 2678 2679.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent 2680.align 32 2681__ecp_nistz256_mul_by_2q: 2682.cfi_startproc 2683 xor $t4, $t4 2684 add $a0, $a0 # a0:a3+a0:a3 2685 adc $a1, $a1 2686 mov $a0, $t0 2687 adc $a2, $a2 2688 adc $a3, $a3 2689 mov $a1, $t1 2690 adc \$0, $t4 2691 2692 sub \$-1, $a0 2693 mov $a2, $t2 2694 sbb $poly1, $a1 2695 sbb \$0, $a2 2696 mov $a3, $t3 2697 sbb $poly3, $a3 2698 sbb \$0, $t4 2699 2700 cmovc $t0, $a0 2701 cmovc $t1, $a1 2702 mov $a0, 8*0($r_ptr) 2703 cmovc $t2, $a2 2704 mov $a1, 8*1($r_ptr) 2705 cmovc $t3, $a3 2706 mov $a2, 8*2($r_ptr) 2707 mov $a3, 8*3($r_ptr) 2708 2709 ret 2710.cfi_endproc 2711.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q 2712___ 2713 } 2714sub gen_double () { 2715 my $x = shift; 2716 my ($src0,$sfx,$bias); 2717 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); 2718 2719 if ($x ne "x") { 2720 $src0 = "%rax"; 2721 $sfx = ""; 2722 $bias = 0; 2723 2724$code.=<<___; 2725.globl ecp_nistz256_point_double 2726.type ecp_nistz256_point_double,\@function,2 2727.align 32 2728ecp_nistz256_point_double: 2729.cfi_startproc 2730 _CET_ENDBR 2731___ 2732$code.=<<___ if ($addx); 2733 leaq OPENSSL_ia32cap_P(%rip), %rcx 2734 mov 8(%rcx), 
%rcx 2735 and \$0x80100, %ecx 2736 cmp \$0x80100, %ecx 2737 je .Lpoint_doublex 2738___ 2739 } else { 2740 $src0 = "%rdx"; 2741 $sfx = "x"; 2742 $bias = 128; 2743 2744$code.=<<___; 2745.type ecp_nistz256_point_doublex,\@function,2 2746.align 32 2747ecp_nistz256_point_doublex: 2748.cfi_startproc 2749.Lpoint_doublex: 2750___ 2751 } 2752$code.=<<___; 2753 push %rbp 2754.cfi_push %rbp 2755 push %rbx 2756.cfi_push %rbx 2757 push %r12 2758.cfi_push %r12 2759 push %r13 2760.cfi_push %r13 2761 push %r14 2762.cfi_push %r14 2763 push %r15 2764.cfi_push %r15 2765 sub \$32*5+8, %rsp 2766.cfi_adjust_cfa_offset 32*5+8 2767.Lpoint_double${x}_body: 2768 2769.Lpoint_double_shortcut$x: 2770 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x 2771 mov $a_ptr, $b_ptr # backup copy 2772 movdqu 0x10($a_ptr), %xmm1 2773 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order 2774 mov 0x20+8*1($a_ptr), $acc5 2775 mov 0x20+8*2($a_ptr), $acc0 2776 mov 0x20+8*3($a_ptr), $acc1 2777 mov .Lpoly+8*1(%rip), $poly1 2778 mov .Lpoly+8*3(%rip), $poly3 2779 movdqa %xmm0, $in_x(%rsp) 2780 movdqa %xmm1, $in_x+0x10(%rsp) 2781 lea 0x20($r_ptr), $acc2 2782 lea 0x40($r_ptr), $acc3 2783 movq $r_ptr, %xmm0 2784 movq $acc2, %xmm1 2785 movq $acc3, %xmm2 2786 2787 lea $S(%rsp), $r_ptr 2788 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); 2789 2790 mov 0x40+8*0($a_ptr), $src0 2791 mov 0x40+8*1($a_ptr), $acc6 2792 mov 0x40+8*2($a_ptr), $acc7 2793 mov 0x40+8*3($a_ptr), $acc0 2794 lea 0x40-$bias($a_ptr), $a_ptr 2795 lea $Zsqr(%rsp), $r_ptr 2796 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); 2797 2798 `&load_for_sqr("$S(%rsp)", "$src0")` 2799 lea $S(%rsp), $r_ptr 2800 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); 2801 2802 mov 0x20($b_ptr), $src0 # $b_ptr is still valid 2803 mov 0x40+8*0($b_ptr), $acc1 2804 mov 0x40+8*1($b_ptr), $acc2 2805 mov 0x40+8*2($b_ptr), $acc3 2806 mov 0x40+8*3($b_ptr), $acc4 2807 lea 0x40-$bias($b_ptr), $a_ptr 2808 lea 0x20($b_ptr), $b_ptr 2809 movq %xmm2, $r_ptr 2810 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); 2811 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); 2812 2813 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 2814 mov $in_x+8*1(%rsp), $acc5 2815 lea $Zsqr(%rsp), $b_ptr 2816 mov $in_x+8*2(%rsp), $acc0 2817 mov $in_x+8*3(%rsp), $acc1 2818 lea $M(%rsp), $r_ptr 2819 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); 2820 2821 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 2822 mov $in_x+8*1(%rsp), $acc5 2823 lea $Zsqr(%rsp), $b_ptr 2824 mov $in_x+8*2(%rsp), $acc0 2825 mov $in_x+8*3(%rsp), $acc1 2826 lea $Zsqr(%rsp), $r_ptr 2827 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); 2828 2829 `&load_for_sqr("$S(%rsp)", "$src0")` 2830 movq %xmm1, $r_ptr 2831 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); 2832___ 2833{ 2834######## ecp_nistz256_div_by_2(res_y, res_y); ########################## 2835# operate in 4-5-6-7 "name space" that matches squaring output 2836# 2837my ($poly1,$poly3)=($a_ptr,$t1); 2838my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); 2839 2840$code.=<<___; 2841 xor $t4, $t4 2842 mov $a0, $t0 2843 add \$-1, $a0 2844 mov $a1, $t1 2845 adc $poly1, $a1 2846 mov $a2, $t2 2847 adc \$0, $a2 2848 mov $a3, $t3 2849 adc $poly3, $a3 2850 adc \$0, $t4 2851 xor $a_ptr, $a_ptr # borrow $a_ptr 2852 test \$1, $t0 2853 2854 cmovz $t0, $a0 2855 cmovz $t1, $a1 2856 cmovz $t2, $a2 2857 cmovz $t3, $a3 2858 cmovz $a_ptr, $t4 2859 2860 mov $a1, $t0 # a0:a3>>1 2861 shr \$1, $a0 2862 shl \$63, $t0 
2863 mov $a2, $t1 2864 shr \$1, $a1 2865 or $t0, $a0 2866 shl \$63, $t1 2867 mov $a3, $t2 2868 shr \$1, $a2 2869 or $t1, $a1 2870 shl \$63, $t2 2871 mov $a0, 8*0($r_ptr) 2872 shr \$1, $a3 2873 mov $a1, 8*1($r_ptr) 2874 shl \$63, $t4 2875 or $t2, $a2 2876 or $t4, $a3 2877 mov $a2, 8*2($r_ptr) 2878 mov $a3, 8*3($r_ptr) 2879___ 2880} 2881$code.=<<___; 2882 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` 2883 lea $M(%rsp), $r_ptr 2884 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); 2885 2886 lea $tmp0(%rsp), $r_ptr 2887 call __ecp_nistz256_mul_by_2$x 2888 2889 lea $M(%rsp), $b_ptr 2890 lea $M(%rsp), $r_ptr 2891 call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); 2892 2893 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` 2894 lea $S(%rsp), $r_ptr 2895 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); 2896 2897 lea $tmp0(%rsp), $r_ptr 2898 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); 2899 2900 `&load_for_sqr("$M(%rsp)", "$src0")` 2901 movq %xmm0, $r_ptr 2902 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); 2903 2904 lea $tmp0(%rsp), $b_ptr 2905 mov $acc6, $acc0 # harmonize sqr output and sub input 2906 mov $acc7, $acc1 2907 mov $a_ptr, $poly1 2908 mov $t1, $poly3 2909 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); 2910 2911 mov $S+8*0(%rsp), $t0 2912 mov $S+8*1(%rsp), $t1 2913 mov $S+8*2(%rsp), $t2 2914 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order 2915 lea $S(%rsp), $r_ptr 2916 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); 2917 2918 mov $M(%rsp), $src0 2919 lea $M(%rsp), $b_ptr 2920 mov $acc4, $acc6 # harmonize sub output and mul input 2921 xor %ecx, %ecx 2922 mov $acc4, $S+8*0(%rsp) # have to save:-( 2923 mov $acc5, $acc2 2924 mov $acc5, $S+8*1(%rsp) 2925 cmovz $acc0, $acc3 2926 mov $acc0, $S+8*2(%rsp) 2927 lea $S-$bias(%rsp), $a_ptr 2928 cmovz $acc1, $acc4 2929 mov $acc1, $S+8*3(%rsp) 2930 mov $acc6, $acc1 2931 lea $S(%rsp), $r_ptr 2932 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); 2933 2934 movq %xmm1, $b_ptr 2935 movq %xmm1, $r_ptr 2936 call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); 2937 2938 lea 32*5+56(%rsp), %rsi 2939.cfi_def_cfa %rsi,8 2940 mov -48(%rsi),%r15 2941.cfi_restore %r15 2942 mov -40(%rsi),%r14 2943.cfi_restore %r14 2944 mov -32(%rsi),%r13 2945.cfi_restore %r13 2946 mov -24(%rsi),%r12 2947.cfi_restore %r12 2948 mov -16(%rsi),%rbx 2949.cfi_restore %rbx 2950 mov -8(%rsi),%rbp 2951.cfi_restore %rbp 2952 lea (%rsi),%rsp 2953.cfi_def_cfa_register %rsp 2954.Lpoint_double${x}_epilogue: 2955 ret 2956.cfi_endproc 2957.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx 2958___ 2959} 2960&gen_double("q"); 2961 2962sub gen_add () { 2963 my $x = shift; 2964 my ($src0,$sfx,$bias); 2965 my ($H,$Hsqr,$R,$Rsqr,$Hcub, 2966 $U1,$U2,$S1,$S2, 2967 $res_x,$res_y,$res_z, 2968 $in1_x,$in1_y,$in1_z, 2969 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); 2970 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 2971 2972 if ($x ne "x") { 2973 $src0 = "%rax"; 2974 $sfx = ""; 2975 $bias = 0; 2976 2977$code.=<<___; 2978.globl ecp_nistz256_point_add 2979.type ecp_nistz256_point_add,\@function,3 2980.align 32 2981ecp_nistz256_point_add: 2982.cfi_startproc 2983 _CET_ENDBR 2984___ 2985$code.=<<___ if ($addx); 2986 leaq OPENSSL_ia32cap_P(%rip), %rcx 2987 mov 8(%rcx), %rcx 2988 and \$0x80100, %ecx 2989 cmp \$0x80100, %ecx 2990 je .Lpoint_addx 2991___ 2992 } else { 2993 $src0 = "%rdx"; 2994 $sfx = "x"; 2995 $bias = 128; 2996 2997$code.=<<___; 2998.type ecp_nistz256_point_addx,\@function,3 2999.align 32 
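# Reached from ecp_nistz256_point_add via the OPENSSL_ia32cap_P check
# above when BMI2 and ADX are both available; same algorithm, but built
# on the mulx/adcx/adox-based subroutines.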
3000ecp_nistz256_point_addx: 3001.cfi_startproc 3002.Lpoint_addx: 3003___ 3004 } 3005$code.=<<___; 3006 push %rbp 3007.cfi_push %rbp 3008 push %rbx 3009.cfi_push %rbx 3010 push %r12 3011.cfi_push %r12 3012 push %r13 3013.cfi_push %r13 3014 push %r14 3015.cfi_push %r14 3016 push %r15 3017.cfi_push %r15 3018 sub \$32*18+8, %rsp 3019.cfi_adjust_cfa_offset 32*18+8 3020.Lpoint_add${x}_body: 3021 3022 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 3023 movdqu 0x10($a_ptr), %xmm1 3024 movdqu 0x20($a_ptr), %xmm2 3025 movdqu 0x30($a_ptr), %xmm3 3026 movdqu 0x40($a_ptr), %xmm4 3027 movdqu 0x50($a_ptr), %xmm5 3028 mov $a_ptr, $b_ptr # reassign 3029 mov $b_org, $a_ptr # reassign 3030 movdqa %xmm0, $in1_x(%rsp) 3031 movdqa %xmm1, $in1_x+0x10(%rsp) 3032 movdqa %xmm2, $in1_y(%rsp) 3033 movdqa %xmm3, $in1_y+0x10(%rsp) 3034 movdqa %xmm4, $in1_z(%rsp) 3035 movdqa %xmm5, $in1_z+0x10(%rsp) 3036 por %xmm4, %xmm5 3037 3038 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr 3039 pshufd \$0xb1, %xmm5, %xmm3 3040 movdqu 0x10($a_ptr), %xmm1 3041 movdqu 0x20($a_ptr), %xmm2 3042 por %xmm3, %xmm5 3043 movdqu 0x30($a_ptr), %xmm3 3044 mov 0x40+8*0($a_ptr), $src0 # load original in2_z 3045 mov 0x40+8*1($a_ptr), $acc6 3046 mov 0x40+8*2($a_ptr), $acc7 3047 mov 0x40+8*3($a_ptr), $acc0 3048 movdqa %xmm0, $in2_x(%rsp) 3049 pshufd \$0x1e, %xmm5, %xmm4 3050 movdqa %xmm1, $in2_x+0x10(%rsp) 3051 movdqu 0x40($a_ptr),%xmm0 # in2_z again 3052 movdqu 0x50($a_ptr),%xmm1 3053 movdqa %xmm2, $in2_y(%rsp) 3054 movdqa %xmm3, $in2_y+0x10(%rsp) 3055 por %xmm4, %xmm5 3056 pxor %xmm4, %xmm4 3057 por %xmm0, %xmm1 3058 movq $r_ptr, %xmm0 # save $r_ptr 3059 3060 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 3061 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy 3062 mov $acc6, $in2_z+8*1(%rsp) 3063 mov $acc7, $in2_z+8*2(%rsp) 3064 mov $acc0, $in2_z+8*3(%rsp) 3065 lea $Z2sqr(%rsp), $r_ptr # Z2^2 3066 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); 3067 3068 pcmpeqd %xmm4, %xmm5 3069 pshufd \$0xb1, %xmm1, %xmm4 3070 por %xmm1, %xmm4 3071 pshufd \$0, %xmm5, %xmm5 # in1infty 3072 pshufd \$0x1e, %xmm4, %xmm3 3073 por %xmm3, %xmm4 3074 pxor %xmm3, %xmm3 3075 pcmpeqd %xmm3, %xmm4 3076 pshufd \$0, %xmm4, %xmm4 # in2infty 3077 mov 0x40+8*0($b_ptr), $src0 # load original in1_z 3078 mov 0x40+8*1($b_ptr), $acc6 3079 mov 0x40+8*2($b_ptr), $acc7 3080 mov 0x40+8*3($b_ptr), $acc0 3081 movq $b_ptr, %xmm1 3082 3083 lea 0x40-$bias($b_ptr), $a_ptr 3084 lea $Z1sqr(%rsp), $r_ptr # Z1^2 3085 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 3086 3087 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` 3088 lea $S1(%rsp), $r_ptr # S1 = Z2^3 3089 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); 3090 3091 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 3092 lea $S2(%rsp), $r_ptr # S2 = Z1^3 3093 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 3094 3095 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` 3096 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 3097 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); 3098 3099 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 3100 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 3101 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 3102 3103 lea $S1(%rsp), $b_ptr 3104 lea $R(%rsp), $r_ptr # R = S2 - S1 3105 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); 3106 3107 or $acc5, $acc4 # see if result is zero 3108 movdqa %xmm4, %xmm2 3109 or $acc0, $acc4 3110 or $acc1, $acc4 3111 por %xmm5, %xmm2 # in1infty || in2infty 3112 
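	# $acc4 now holds the OR of the four limbs of R = S2 - S1; it is
	# stashed in %xmm3 below so that the "R == 0" test survives the
	# subroutine calls that follow.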
movq $acc4, %xmm3 3113 3114 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` 3115 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 3116 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); 3117 3118 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` 3119 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 3120 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); 3121 3122 lea $U1(%rsp), $b_ptr 3123 lea $H(%rsp), $r_ptr # H = U2 - U1 3124 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); 3125 3126 or $acc5, $acc4 # see if result is zero 3127 or $acc0, $acc4 3128 or $acc1, $acc4 # !is_equal(U1, U2) 3129 3130 movq %xmm2, $acc0 3131 movq %xmm3, $acc1 3132 or $acc0, $acc4 3133 .byte 0x3e # predict taken 3134 jnz .Ladd_proceed$x # !is_equal(U1, U2) || in1infty || in2infty 3135 3136 # We now know A = B or A = -B and neither is infinity. Compare the 3137 # y-coordinates via S1 and S2. 3138 test $acc1, $acc1 3139 jz .Ladd_double$x # is_equal(S1, S2) 3140 3141 # A = -B, so the result is infinity. 3142 # 3143 # TODO(davidben): Does .Ladd_proceed handle this case? It seems to, in 3144 # which case we should eliminate this special-case and simplify the 3145 # timing analysis. 3146 movq %xmm0, $r_ptr # restore $r_ptr 3147 pxor %xmm0, %xmm0 3148 movdqu %xmm0, 0x00($r_ptr) 3149 movdqu %xmm0, 0x10($r_ptr) 3150 movdqu %xmm0, 0x20($r_ptr) 3151 movdqu %xmm0, 0x30($r_ptr) 3152 movdqu %xmm0, 0x40($r_ptr) 3153 movdqu %xmm0, 0x50($r_ptr) 3154 jmp .Ladd_done$x 3155 3156.align 32 3157.Ladd_double$x: 3158 movq %xmm1, $a_ptr # restore $a_ptr 3159 movq %xmm0, $r_ptr # restore $r_ptr 3160 add \$`32*(18-5)`, %rsp # difference in frame sizes 3161.cfi_adjust_cfa_offset `-32*(18-5)` 3162 jmp .Lpoint_double_shortcut$x 3163.cfi_adjust_cfa_offset `32*(18-5)` 3164 3165.align 32 3166.Ladd_proceed$x: 3167 `&load_for_sqr("$R(%rsp)", "$src0")` 3168 lea $Rsqr(%rsp), $r_ptr # R^2 3169 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 3170 3171 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 3172 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3173 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 3174 3175 `&load_for_sqr("$H(%rsp)", "$src0")` 3176 lea $Hsqr(%rsp), $r_ptr # H^2 3177 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 3178 3179 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` 3180 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3181 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); 3182 3183 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` 3184 lea $Hcub(%rsp), $r_ptr # H^3 3185 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 3186 3187 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` 3188 lea $U2(%rsp), $r_ptr # U1*H^2 3189 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); 3190___ 3191{ 3192####################################################################### 3193# operate in 4-5-0-1 "name space" that matches multiplication output 3194# 3195my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3196my ($poly1, $poly3)=($acc6,$acc7); 3197 3198$code.=<<___; 3199 #lea $U2(%rsp), $a_ptr 3200 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 3201 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 3202 3203 xor $t4, $t4 3204 add $acc0, $acc0 # a0:a3+a0:a3 3205 lea $Rsqr(%rsp), $a_ptr 3206 adc $acc1, $acc1 3207 mov $acc0, $t0 3208 adc $acc2, $acc2 3209 adc $acc3, $acc3 3210 mov $acc1, $t1 3211 adc \$0, $t4 3212 3213 sub \$-1, $acc0 3214 mov $acc2, $t2 3215 sbb $poly1, $acc1 3216 sbb \$0, $acc2 3217 mov $acc3, 
$t3 3218 sbb $poly3, $acc3 3219 sbb \$0, $t4 3220 3221 cmovc $t0, $acc0 3222 mov 8*0($a_ptr), $t0 3223 cmovc $t1, $acc1 3224 mov 8*1($a_ptr), $t1 3225 cmovc $t2, $acc2 3226 mov 8*2($a_ptr), $t2 3227 cmovc $t3, $acc3 3228 mov 8*3($a_ptr), $t3 3229 3230 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 3231 3232 lea $Hcub(%rsp), $b_ptr 3233 lea $res_x(%rsp), $r_ptr 3234 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 3235 3236 mov $U2+8*0(%rsp), $t0 3237 mov $U2+8*1(%rsp), $t1 3238 mov $U2+8*2(%rsp), $t2 3239 mov $U2+8*3(%rsp), $t3 3240 lea $res_y(%rsp), $r_ptr 3241 3242 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); 3243 3244 mov $acc0, 8*0($r_ptr) # save the result, as 3245 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 3246 mov $acc2, 8*2($r_ptr) 3247 mov $acc3, 8*3($r_ptr) 3248___ 3249} 3250$code.=<<___; 3251 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` 3252 lea $S2(%rsp), $r_ptr 3253 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); 3254 3255 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` 3256 lea $res_y(%rsp), $r_ptr 3257 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); 3258 3259 lea $S2(%rsp), $b_ptr 3260 lea $res_y(%rsp), $r_ptr 3261 call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); 3262 3263 movq %xmm0, $r_ptr # restore $r_ptr 3264 3265 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); 3266 movdqa %xmm5, %xmm1 3267 pandn $res_z(%rsp), %xmm0 3268 movdqa %xmm5, %xmm2 3269 pandn $res_z+0x10(%rsp), %xmm1 3270 movdqa %xmm5, %xmm3 3271 pand $in2_z(%rsp), %xmm2 3272 pand $in2_z+0x10(%rsp), %xmm3 3273 por %xmm0, %xmm2 3274 por %xmm1, %xmm3 3275 3276 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 3277 movdqa %xmm4, %xmm1 3278 pandn %xmm2, %xmm0 3279 movdqa %xmm4, %xmm2 3280 pandn %xmm3, %xmm1 3281 movdqa %xmm4, %xmm3 3282 pand $in1_z(%rsp), %xmm2 3283 pand $in1_z+0x10(%rsp), %xmm3 3284 por %xmm0, %xmm2 3285 por %xmm1, %xmm3 3286 movdqu %xmm2, 0x40($r_ptr) 3287 movdqu %xmm3, 0x50($r_ptr) 3288 3289 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 3290 movdqa %xmm5, %xmm1 3291 pandn $res_x(%rsp), %xmm0 3292 movdqa %xmm5, %xmm2 3293 pandn $res_x+0x10(%rsp), %xmm1 3294 movdqa %xmm5, %xmm3 3295 pand $in2_x(%rsp), %xmm2 3296 pand $in2_x+0x10(%rsp), %xmm3 3297 por %xmm0, %xmm2 3298 por %xmm1, %xmm3 3299 3300 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 3301 movdqa %xmm4, %xmm1 3302 pandn %xmm2, %xmm0 3303 movdqa %xmm4, %xmm2 3304 pandn %xmm3, %xmm1 3305 movdqa %xmm4, %xmm3 3306 pand $in1_x(%rsp), %xmm2 3307 pand $in1_x+0x10(%rsp), %xmm3 3308 por %xmm0, %xmm2 3309 por %xmm1, %xmm3 3310 movdqu %xmm2, 0x00($r_ptr) 3311 movdqu %xmm3, 0x10($r_ptr) 3312 3313 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 3314 movdqa %xmm5, %xmm1 3315 pandn $res_y(%rsp), %xmm0 3316 movdqa %xmm5, %xmm2 3317 pandn $res_y+0x10(%rsp), %xmm1 3318 movdqa %xmm5, %xmm3 3319 pand $in2_y(%rsp), %xmm2 3320 pand $in2_y+0x10(%rsp), %xmm3 3321 por %xmm0, %xmm2 3322 por %xmm1, %xmm3 3323 3324 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); 3325 movdqa %xmm4, %xmm1 3326 pandn %xmm2, %xmm0 3327 movdqa %xmm4, %xmm2 3328 pandn %xmm3, %xmm1 3329 movdqa %xmm4, %xmm3 3330 pand $in1_y(%rsp), %xmm2 3331 pand $in1_y+0x10(%rsp), %xmm3 3332 por %xmm0, %xmm2 3333 por %xmm1, %xmm3 3334 movdqu %xmm2, 0x20($r_ptr) 3335 movdqu %xmm3, 0x30($r_ptr) 3336 3337.Ladd_done$x: 3338 lea 32*18+56(%rsp), %rsi 3339.cfi_def_cfa %rsi,8 3340 mov -48(%rsi),%r15 3341.cfi_restore %r15 
3342 mov -40(%rsi),%r14 3343.cfi_restore %r14 3344 mov -32(%rsi),%r13 3345.cfi_restore %r13 3346 mov -24(%rsi),%r12 3347.cfi_restore %r12 3348 mov -16(%rsi),%rbx 3349.cfi_restore %rbx 3350 mov -8(%rsi),%rbp 3351.cfi_restore %rbp 3352 lea (%rsi),%rsp 3353.cfi_def_cfa_register %rsp 3354.Lpoint_add${x}_epilogue: 3355 ret 3356.cfi_endproc 3357.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx 3358___ 3359} 3360&gen_add("q"); 3361 3362sub gen_add_affine () { 3363 my $x = shift; 3364 my ($src0,$sfx,$bias); 3365 my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, 3366 $res_x,$res_y,$res_z, 3367 $in1_x,$in1_y,$in1_z, 3368 $in2_x,$in2_y)=map(32*$_,(0..14)); 3369 my $Z1sqr = $S2; 3370 3371 if ($x ne "x") { 3372 $src0 = "%rax"; 3373 $sfx = ""; 3374 $bias = 0; 3375 3376$code.=<<___; 3377.globl ecp_nistz256_point_add_affine 3378.type ecp_nistz256_point_add_affine,\@function,3 3379.align 32 3380ecp_nistz256_point_add_affine: 3381.cfi_startproc 3382 _CET_ENDBR 3383___ 3384$code.=<<___ if ($addx); 3385 leaq OPENSSL_ia32cap_P(%rip), %rcx 3386 mov 8(%rcx), %rcx 3387 and \$0x80100, %ecx 3388 cmp \$0x80100, %ecx 3389 je .Lpoint_add_affinex 3390___ 3391 } else { 3392 $src0 = "%rdx"; 3393 $sfx = "x"; 3394 $bias = 128; 3395 3396$code.=<<___; 3397.type ecp_nistz256_point_add_affinex,\@function,3 3398.align 32 3399ecp_nistz256_point_add_affinex: 3400.cfi_startproc 3401.Lpoint_add_affinex: 3402___ 3403 } 3404$code.=<<___; 3405 push %rbp 3406.cfi_push %rbp 3407 push %rbx 3408.cfi_push %rbx 3409 push %r12 3410.cfi_push %r12 3411 push %r13 3412.cfi_push %r13 3413 push %r14 3414.cfi_push %r14 3415 push %r15 3416.cfi_push %r15 3417 sub \$32*15+8, %rsp 3418.cfi_adjust_cfa_offset 32*15+8 3419.Ladd_affine${x}_body: 3420 3421 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 3422 mov $b_org, $b_ptr # reassign 3423 movdqu 0x10($a_ptr), %xmm1 3424 movdqu 0x20($a_ptr), %xmm2 3425 movdqu 0x30($a_ptr), %xmm3 3426 movdqu 0x40($a_ptr), %xmm4 3427 movdqu 0x50($a_ptr), %xmm5 3428 mov 0x40+8*0($a_ptr), $src0 # load original in1_z 3429 mov 0x40+8*1($a_ptr), $acc6 3430 mov 0x40+8*2($a_ptr), $acc7 3431 mov 0x40+8*3($a_ptr), $acc0 3432 movdqa %xmm0, $in1_x(%rsp) 3433 movdqa %xmm1, $in1_x+0x10(%rsp) 3434 movdqa %xmm2, $in1_y(%rsp) 3435 movdqa %xmm3, $in1_y+0x10(%rsp) 3436 movdqa %xmm4, $in1_z(%rsp) 3437 movdqa %xmm5, $in1_z+0x10(%rsp) 3438 por %xmm4, %xmm5 3439 3440 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr 3441 pshufd \$0xb1, %xmm5, %xmm3 3442 movdqu 0x10($b_ptr), %xmm1 3443 movdqu 0x20($b_ptr), %xmm2 3444 por %xmm3, %xmm5 3445 movdqu 0x30($b_ptr), %xmm3 3446 movdqa %xmm0, $in2_x(%rsp) 3447 pshufd \$0x1e, %xmm5, %xmm4 3448 movdqa %xmm1, $in2_x+0x10(%rsp) 3449 por %xmm0, %xmm1 3450 movq $r_ptr, %xmm0 # save $r_ptr 3451 movdqa %xmm2, $in2_y(%rsp) 3452 movdqa %xmm3, $in2_y+0x10(%rsp) 3453 por %xmm2, %xmm3 3454 por %xmm4, %xmm5 3455 pxor %xmm4, %xmm4 3456 por %xmm1, %xmm3 3457 3458 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 3459 lea $Z1sqr(%rsp), $r_ptr # Z1^2 3460 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 3461 3462 pcmpeqd %xmm4, %xmm5 3463 pshufd \$0xb1, %xmm3, %xmm4 3464 mov 0x00($b_ptr), $src0 # $b_ptr is still valid 3465 #lea 0x00($b_ptr), $b_ptr 3466 mov $acc4, $acc1 # harmonize sqr output and mul input 3467 por %xmm3, %xmm4 3468 pshufd \$0, %xmm5, %xmm5 # in1infty 3469 pshufd \$0x1e, %xmm4, %xmm3 3470 mov $acc5, $acc2 3471 por %xmm3, %xmm4 3472 pxor %xmm3, %xmm3 3473 mov $acc6, $acc3 3474 pcmpeqd %xmm3, %xmm4 3475 pshufd \$0, %xmm4, %xmm4 # in2infty 3476 3477 lea 
$Z1sqr-$bias(%rsp), $a_ptr 3478 mov $acc7, $acc4 3479 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 3480 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); 3481 3482 lea $in1_x(%rsp), $b_ptr 3483 lea $H(%rsp), $r_ptr # H = U2 - U1 3484 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); 3485 3486 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 3487 lea $S2(%rsp), $r_ptr # S2 = Z1^3 3488 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 3489 3490 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 3491 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3492 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 3493 3494 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 3495 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 3496 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 3497 3498 lea $in1_y(%rsp), $b_ptr 3499 lea $R(%rsp), $r_ptr # R = S2 - S1 3500 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); 3501 3502 `&load_for_sqr("$H(%rsp)", "$src0")` 3503 lea $Hsqr(%rsp), $r_ptr # H^2 3504 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 3505 3506 `&load_for_sqr("$R(%rsp)", "$src0")` 3507 lea $Rsqr(%rsp), $r_ptr # R^2 3508 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 3509 3510 `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` 3511 lea $Hcub(%rsp), $r_ptr # H^3 3512 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 3513 3514 `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` 3515 lea $U2(%rsp), $r_ptr # U1*H^2 3516 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); 3517___ 3518{ 3519####################################################################### 3520# operate in 4-5-0-1 "name space" that matches multiplication output 3521# 3522my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3523my ($poly1, $poly3)=($acc6,$acc7); 3524 3525$code.=<<___; 3526 #lea $U2(%rsp), $a_ptr 3527 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 3528 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 3529 3530 xor $t4, $t4 3531 add $acc0, $acc0 # a0:a3+a0:a3 3532 lea $Rsqr(%rsp), $a_ptr 3533 adc $acc1, $acc1 3534 mov $acc0, $t0 3535 adc $acc2, $acc2 3536 adc $acc3, $acc3 3537 mov $acc1, $t1 3538 adc \$0, $t4 3539 3540 sub \$-1, $acc0 3541 mov $acc2, $t2 3542 sbb $poly1, $acc1 3543 sbb \$0, $acc2 3544 mov $acc3, $t3 3545 sbb $poly3, $acc3 3546 sbb \$0, $t4 3547 3548 cmovc $t0, $acc0 3549 mov 8*0($a_ptr), $t0 3550 cmovc $t1, $acc1 3551 mov 8*1($a_ptr), $t1 3552 cmovc $t2, $acc2 3553 mov 8*2($a_ptr), $t2 3554 cmovc $t3, $acc3 3555 mov 8*3($a_ptr), $t3 3556 3557 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 3558 3559 lea $Hcub(%rsp), $b_ptr 3560 lea $res_x(%rsp), $r_ptr 3561 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 3562 3563 mov $U2+8*0(%rsp), $t0 3564 mov $U2+8*1(%rsp), $t1 3565 mov $U2+8*2(%rsp), $t2 3566 mov $U2+8*3(%rsp), $t3 3567 lea $H(%rsp), $r_ptr 3568 3569 call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); 3570 3571 mov $acc0, 8*0($r_ptr) # save the result, as 3572 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 3573 mov $acc2, 8*2($r_ptr) 3574 mov $acc3, 8*3($r_ptr) 3575___ 3576} 3577$code.=<<___; 3578 `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` 3579 lea $S2(%rsp), $r_ptr 3580 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); 3581 3582 `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` 3583 lea $H(%rsp), $r_ptr 3584 call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); 3585 3586 lea $S2(%rsp), $b_ptr 3587 lea 
$res_y(%rsp), $r_ptr 3588 call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); 3589 3590 movq %xmm0, $r_ptr # restore $r_ptr 3591 3592 movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); 3593 movdqa %xmm5, %xmm1 3594 pandn $res_z(%rsp), %xmm0 3595 movdqa %xmm5, %xmm2 3596 pandn $res_z+0x10(%rsp), %xmm1 3597 movdqa %xmm5, %xmm3 3598 pand .LONE_mont(%rip), %xmm2 3599 pand .LONE_mont+0x10(%rip), %xmm3 3600 por %xmm0, %xmm2 3601 por %xmm1, %xmm3 3602 3603 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 3604 movdqa %xmm4, %xmm1 3605 pandn %xmm2, %xmm0 3606 movdqa %xmm4, %xmm2 3607 pandn %xmm3, %xmm1 3608 movdqa %xmm4, %xmm3 3609 pand $in1_z(%rsp), %xmm2 3610 pand $in1_z+0x10(%rsp), %xmm3 3611 por %xmm0, %xmm2 3612 por %xmm1, %xmm3 3613 movdqu %xmm2, 0x40($r_ptr) 3614 movdqu %xmm3, 0x50($r_ptr) 3615 3616 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 3617 movdqa %xmm5, %xmm1 3618 pandn $res_x(%rsp), %xmm0 3619 movdqa %xmm5, %xmm2 3620 pandn $res_x+0x10(%rsp), %xmm1 3621 movdqa %xmm5, %xmm3 3622 pand $in2_x(%rsp), %xmm2 3623 pand $in2_x+0x10(%rsp), %xmm3 3624 por %xmm0, %xmm2 3625 por %xmm1, %xmm3 3626 3627 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 3628 movdqa %xmm4, %xmm1 3629 pandn %xmm2, %xmm0 3630 movdqa %xmm4, %xmm2 3631 pandn %xmm3, %xmm1 3632 movdqa %xmm4, %xmm3 3633 pand $in1_x(%rsp), %xmm2 3634 pand $in1_x+0x10(%rsp), %xmm3 3635 por %xmm0, %xmm2 3636 por %xmm1, %xmm3 3637 movdqu %xmm2, 0x00($r_ptr) 3638 movdqu %xmm3, 0x10($r_ptr) 3639 3640 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 3641 movdqa %xmm5, %xmm1 3642 pandn $res_y(%rsp), %xmm0 3643 movdqa %xmm5, %xmm2 3644 pandn $res_y+0x10(%rsp), %xmm1 3645 movdqa %xmm5, %xmm3 3646 pand $in2_y(%rsp), %xmm2 3647 pand $in2_y+0x10(%rsp), %xmm3 3648 por %xmm0, %xmm2 3649 por %xmm1, %xmm3 3650 3651 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); 3652 movdqa %xmm4, %xmm1 3653 pandn %xmm2, %xmm0 3654 movdqa %xmm4, %xmm2 3655 pandn %xmm3, %xmm1 3656 movdqa %xmm4, %xmm3 3657 pand $in1_y(%rsp), %xmm2 3658 pand $in1_y+0x10(%rsp), %xmm3 3659 por %xmm0, %xmm2 3660 por %xmm1, %xmm3 3661 movdqu %xmm2, 0x20($r_ptr) 3662 movdqu %xmm3, 0x30($r_ptr) 3663 3664 lea 32*15+56(%rsp), %rsi 3665.cfi_def_cfa %rsi,8 3666 mov -48(%rsi),%r15 3667.cfi_restore %r15 3668 mov -40(%rsi),%r14 3669.cfi_restore %r14 3670 mov -32(%rsi),%r13 3671.cfi_restore %r13 3672 mov -24(%rsi),%r12 3673.cfi_restore %r12 3674 mov -16(%rsi),%rbx 3675.cfi_restore %rbx 3676 mov -8(%rsi),%rbp 3677.cfi_restore %rbp 3678 lea (%rsi),%rsp 3679.cfi_def_cfa_register %rsp 3680.Ladd_affine${x}_epilogue: 3681 ret 3682.cfi_endproc 3683.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx 3684___ 3685} 3686&gen_add_affine("q"); 3687 3688######################################################################## 3689# AD*X magic 3690# 3691if ($addx) { { 3692######################################################################## 3693# operate in 4-5-0-1 "name space" that matches multiplication output 3694# 3695my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3696 3697$code.=<<___; 3698.type __ecp_nistz256_add_tox,\@abi-omnipotent 3699.align 32 3700__ecp_nistz256_add_tox: 3701.cfi_startproc 3702 xor $t4, $t4 3703 adc 8*0($b_ptr), $a0 3704 adc 8*1($b_ptr), $a1 3705 mov $a0, $t0 3706 adc 8*2($b_ptr), $a2 3707 adc 8*3($b_ptr), $a3 3708 mov $a1, $t1 3709 adc \$0, $t4 3710 3711 xor $t3, $t3 3712 sbb \$-1, $a0 3713 mov $a2, $t2 3714 sbb $poly1, $a1 3715 sbb \$0, $a2 3716 mov 
$a3, $t3 3717 sbb $poly3, $a3 3718 sbb \$0, $t4 3719 3720 cmovc $t0, $a0 3721 cmovc $t1, $a1 3722 mov $a0, 8*0($r_ptr) 3723 cmovc $t2, $a2 3724 mov $a1, 8*1($r_ptr) 3725 cmovc $t3, $a3 3726 mov $a2, 8*2($r_ptr) 3727 mov $a3, 8*3($r_ptr) 3728 3729 ret 3730.cfi_endproc 3731.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox 3732 3733.type __ecp_nistz256_sub_fromx,\@abi-omnipotent 3734.align 32 3735__ecp_nistz256_sub_fromx: 3736.cfi_startproc 3737 xor $t4, $t4 3738 sbb 8*0($b_ptr), $a0 3739 sbb 8*1($b_ptr), $a1 3740 mov $a0, $t0 3741 sbb 8*2($b_ptr), $a2 3742 sbb 8*3($b_ptr), $a3 3743 mov $a1, $t1 3744 sbb \$0, $t4 3745 3746 xor $t3, $t3 3747 adc \$-1, $a0 3748 mov $a2, $t2 3749 adc $poly1, $a1 3750 adc \$0, $a2 3751 mov $a3, $t3 3752 adc $poly3, $a3 3753 3754 bt \$0, $t4 3755 cmovnc $t0, $a0 3756 cmovnc $t1, $a1 3757 mov $a0, 8*0($r_ptr) 3758 cmovnc $t2, $a2 3759 mov $a1, 8*1($r_ptr) 3760 cmovnc $t3, $a3 3761 mov $a2, 8*2($r_ptr) 3762 mov $a3, 8*3($r_ptr) 3763 3764 ret 3765.cfi_endproc 3766.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx 3767 3768.type __ecp_nistz256_subx,\@abi-omnipotent 3769.align 32 3770__ecp_nistz256_subx: 3771.cfi_startproc 3772 xor $t4, $t4 3773 sbb $a0, $t0 3774 sbb $a1, $t1 3775 mov $t0, $a0 3776 sbb $a2, $t2 3777 sbb $a3, $t3 3778 mov $t1, $a1 3779 sbb \$0, $t4 3780 3781 xor $a3 ,$a3 3782 adc \$-1, $t0 3783 mov $t2, $a2 3784 adc $poly1, $t1 3785 adc \$0, $t2 3786 mov $t3, $a3 3787 adc $poly3, $t3 3788 3789 bt \$0, $t4 3790 cmovc $t0, $a0 3791 cmovc $t1, $a1 3792 cmovc $t2, $a2 3793 cmovc $t3, $a3 3794 3795 ret 3796.cfi_endproc 3797.size __ecp_nistz256_subx,.-__ecp_nistz256_subx 3798 3799.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent 3800.align 32 3801__ecp_nistz256_mul_by_2x: 3802.cfi_startproc 3803 xor $t4, $t4 3804 adc $a0, $a0 # a0:a3+a0:a3 3805 adc $a1, $a1 3806 mov $a0, $t0 3807 adc $a2, $a2 3808 adc $a3, $a3 3809 mov $a1, $t1 3810 adc \$0, $t4 3811 3812 xor $t3, $t3 3813 sbb \$-1, $a0 3814 mov $a2, $t2 3815 sbb $poly1, $a1 3816 sbb \$0, $a2 3817 mov $a3, $t3 3818 sbb $poly3, $a3 3819 sbb \$0, $t4 3820 3821 cmovc $t0, $a0 3822 cmovc $t1, $a1 3823 mov $a0, 8*0($r_ptr) 3824 cmovc $t2, $a2 3825 mov $a1, 8*1($r_ptr) 3826 cmovc $t3, $a3 3827 mov $a2, 8*2($r_ptr) 3828 mov $a3, 8*3($r_ptr) 3829 3830 ret 3831.cfi_endproc 3832.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x 3833___ 3834 } 3835&gen_double("x"); 3836&gen_add("x"); 3837&gen_add_affine("x"); 3838} 3839}}} 3840 3841# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3842# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3843if ($win64) { 3844$rec="%rcx"; 3845$frame="%rdx"; 3846$context="%r8"; 3847$disp="%r9"; 3848 3849$code.=<<___; 3850.extern __imp_RtlVirtualUnwind 3851 3852.type short_handler,\@abi-omnipotent 3853.align 16 3854short_handler: 3855 push %rsi 3856 push %rdi 3857 push %rbx 3858 push %rbp 3859 push %r12 3860 push %r13 3861 push %r14 3862 push %r15 3863 pushfq 3864 sub \$64,%rsp 3865 3866 mov 120($context),%rax # pull context->Rax 3867 mov 248($context),%rbx # pull context->Rip 3868 3869 mov 8($disp),%rsi # disp->ImageBase 3870 mov 56($disp),%r11 # disp->HandlerData 3871 3872 mov 0(%r11),%r10d # HandlerData[0] 3873 lea (%rsi,%r10),%r10 # end of prologue label 3874 cmp %r10,%rbx # context->Rip<end of prologue label 3875 jb .Lcommon_seh_tail 3876 3877 mov 152($context),%rax # pull context->Rsp 3878 3879 mov 4(%r11),%r10d # HandlerData[1] 3880 lea (%rsi,%r10),%r10 # epilogue label 3881 cmp %r10,%rbx # context->Rip>=epilogue label 3882 jae .Lcommon_seh_tail 3883 3884 
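	# Unwinding from within the body: %rax is context->Rsp and still
	# points at the %r12/%r13 pair pushed in the prologue; recover
	# them, then fall through to the common tail.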
lea 16(%rax),%rax 3885 3886 mov -8(%rax),%r12 3887 mov -16(%rax),%r13 3888 mov %r12,216($context) # restore context->R12 3889 mov %r13,224($context) # restore context->R13 3890 3891 jmp .Lcommon_seh_tail 3892.size short_handler,.-short_handler 3893 3894.type full_handler,\@abi-omnipotent 3895.align 16 3896full_handler: 3897 push %rsi 3898 push %rdi 3899 push %rbx 3900 push %rbp 3901 push %r12 3902 push %r13 3903 push %r14 3904 push %r15 3905 pushfq 3906 sub \$64,%rsp 3907 3908 mov 120($context),%rax # pull context->Rax 3909 mov 248($context),%rbx # pull context->Rip 3910 3911 mov 8($disp),%rsi # disp->ImageBase 3912 mov 56($disp),%r11 # disp->HandlerData 3913 3914 mov 0(%r11),%r10d # HandlerData[0] 3915 lea (%rsi,%r10),%r10 # end of prologue label 3916 cmp %r10,%rbx # context->Rip<end of prologue label 3917 jb .Lcommon_seh_tail 3918 3919 mov 152($context),%rax # pull context->Rsp 3920 3921 mov 4(%r11),%r10d # HandlerData[1] 3922 lea (%rsi,%r10),%r10 # epilogue label 3923 cmp %r10,%rbx # context->Rip>=epilogue label 3924 jae .Lcommon_seh_tail 3925 3926 mov 8(%r11),%r10d # HandlerData[2] 3927 lea (%rax,%r10),%rax 3928 3929 mov -8(%rax),%rbp 3930 mov -16(%rax),%rbx 3931 mov -24(%rax),%r12 3932 mov -32(%rax),%r13 3933 mov -40(%rax),%r14 3934 mov -48(%rax),%r15 3935 mov %rbx,144($context) # restore context->Rbx 3936 mov %rbp,160($context) # restore context->Rbp 3937 mov %r12,216($context) # restore context->R12 3938 mov %r13,224($context) # restore context->R13 3939 mov %r14,232($context) # restore context->R14 3940 mov %r15,240($context) # restore context->R15 3941 3942.Lcommon_seh_tail: 3943 mov 8(%rax),%rdi 3944 mov 16(%rax),%rsi 3945 mov %rax,152($context) # restore context->Rsp 3946 mov %rsi,168($context) # restore context->Rsi 3947 mov %rdi,176($context) # restore context->Rdi 3948 3949 mov 40($disp),%rdi # disp->ContextRecord 3950 mov $context,%rsi # context 3951 mov \$154,%ecx # sizeof(CONTEXT) 3952 .long 0xa548f3fc # cld; rep movsq 3953 3954 mov $disp,%rsi 3955 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3956 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3957 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3958 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3959 mov 40(%rsi),%r10 # disp->ContextRecord 3960 lea 56(%rsi),%r11 # &disp->HandlerData 3961 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3962 mov %r10,32(%rsp) # arg5 3963 mov %r11,40(%rsp) # arg6 3964 mov %r12,48(%rsp) # arg7 3965 mov %rcx,56(%rsp) # arg8, (NULL) 3966 call *__imp_RtlVirtualUnwind(%rip) 3967 3968 mov \$1,%eax # ExceptionContinueSearch 3969 add \$64,%rsp 3970 popfq 3971 pop %r15 3972 pop %r14 3973 pop %r13 3974 pop %r12 3975 pop %rbp 3976 pop %rbx 3977 pop %rdi 3978 pop %rsi 3979 ret 3980.size full_handler,.-full_handler 3981 3982.section .pdata 3983.align 4 3984 .rva .LSEH_begin_ecp_nistz256_neg 3985 .rva .LSEH_end_ecp_nistz256_neg 3986 .rva .LSEH_info_ecp_nistz256_neg 3987 3988 .rva .LSEH_begin_ecp_nistz256_ord_mul_mont 3989 .rva .LSEH_end_ecp_nistz256_ord_mul_mont 3990 .rva .LSEH_info_ecp_nistz256_ord_mul_mont 3991 3992 .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont 3993 .rva .LSEH_end_ecp_nistz256_ord_sqr_mont 3994 .rva .LSEH_info_ecp_nistz256_ord_sqr_mont 3995___ 3996$code.=<<___ if ($addx); 3997 .rva .LSEH_begin_ecp_nistz256_ord_mul_montx 3998 .rva .LSEH_end_ecp_nistz256_ord_mul_montx 3999 .rva .LSEH_info_ecp_nistz256_ord_mul_montx 4000 4001 .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx 4002 .rva .LSEH_end_ecp_nistz256_ord_sqr_montx 4003 .rva .LSEH_info_ecp_nistz256_ord_sqr_montx 4004___ 4005$code.=<<___; 4006 .rva 
.LSEH_begin_ecp_nistz256_mul_mont 4007 .rva .LSEH_end_ecp_nistz256_mul_mont 4008 .rva .LSEH_info_ecp_nistz256_mul_mont 4009 4010 .rva .LSEH_begin_ecp_nistz256_sqr_mont 4011 .rva .LSEH_end_ecp_nistz256_sqr_mont 4012 .rva .LSEH_info_ecp_nistz256_sqr_mont 4013 4014 .rva .LSEH_begin_ecp_nistz256_select_w5 4015 .rva .LSEH_end_ecp_nistz256_select_w5 4016 .rva .LSEH_info_ecp_nistz256_select_wX 4017 4018 .rva .LSEH_begin_ecp_nistz256_select_w7 4019 .rva .LSEH_end_ecp_nistz256_select_w7 4020 .rva .LSEH_info_ecp_nistz256_select_wX 4021___ 4022$code.=<<___ if ($avx>1); 4023 .rva .LSEH_begin_ecp_nistz256_avx2_select_w5 4024 .rva .LSEH_end_ecp_nistz256_avx2_select_w5 4025 .rva .LSEH_info_ecp_nistz256_avx2_select_wX 4026 4027 .rva .LSEH_begin_ecp_nistz256_avx2_select_w7 4028 .rva .LSEH_end_ecp_nistz256_avx2_select_w7 4029 .rva .LSEH_info_ecp_nistz256_avx2_select_wX 4030___ 4031$code.=<<___; 4032 .rva .LSEH_begin_ecp_nistz256_point_double 4033 .rva .LSEH_end_ecp_nistz256_point_double 4034 .rva .LSEH_info_ecp_nistz256_point_double 4035 4036 .rva .LSEH_begin_ecp_nistz256_point_add 4037 .rva .LSEH_end_ecp_nistz256_point_add 4038 .rva .LSEH_info_ecp_nistz256_point_add 4039 4040 .rva .LSEH_begin_ecp_nistz256_point_add_affine 4041 .rva .LSEH_end_ecp_nistz256_point_add_affine 4042 .rva .LSEH_info_ecp_nistz256_point_add_affine 4043___ 4044$code.=<<___ if ($addx); 4045 .rva .LSEH_begin_ecp_nistz256_point_doublex 4046 .rva .LSEH_end_ecp_nistz256_point_doublex 4047 .rva .LSEH_info_ecp_nistz256_point_doublex 4048 4049 .rva .LSEH_begin_ecp_nistz256_point_addx 4050 .rva .LSEH_end_ecp_nistz256_point_addx 4051 .rva .LSEH_info_ecp_nistz256_point_addx 4052 4053 .rva .LSEH_begin_ecp_nistz256_point_add_affinex 4054 .rva .LSEH_end_ecp_nistz256_point_add_affinex 4055 .rva .LSEH_info_ecp_nistz256_point_add_affinex 4056___ 4057$code.=<<___; 4058 4059.section .xdata 4060.align 8 4061.LSEH_info_ecp_nistz256_neg: 4062 .byte 9,0,0,0 4063 .rva short_handler 4064 .rva .Lneg_body,.Lneg_epilogue # HandlerData[] 4065.LSEH_info_ecp_nistz256_ord_mul_mont: 4066 .byte 9,0,0,0 4067 .rva full_handler 4068 .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[] 4069 .long 48,0 4070.LSEH_info_ecp_nistz256_ord_sqr_mont: 4071 .byte 9,0,0,0 4072 .rva full_handler 4073 .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[] 4074 .long 48,0 4075___ 4076$code.=<<___ if ($addx); 4077.LSEH_info_ecp_nistz256_ord_mul_montx: 4078 .byte 9,0,0,0 4079 .rva full_handler 4080 .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[] 4081 .long 48,0 4082.LSEH_info_ecp_nistz256_ord_sqr_montx: 4083 .byte 9,0,0,0 4084 .rva full_handler 4085 .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[] 4086 .long 48,0 4087___ 4088$code.=<<___; 4089.LSEH_info_ecp_nistz256_mul_mont: 4090 .byte 9,0,0,0 4091 .rva full_handler 4092 .rva .Lmul_body,.Lmul_epilogue # HandlerData[] 4093 .long 48,0 4094.LSEH_info_ecp_nistz256_sqr_mont: 4095 .byte 9,0,0,0 4096 .rva full_handler 4097 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] 4098 .long 48,0 4099.LSEH_info_ecp_nistz256_select_wX: 4100 .byte 0x01,0x33,0x16,0x00 4101 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 4102 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 4103 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 4104 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 4105 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 4106 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 4107 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 4108 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 4109 .byte 0x0c,0x78,0x01,0x00 #movaps 
0x10(rsp),xmm7 4110 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 4111 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 4112 .align 8 4113___ 4114$code.=<<___ if ($avx>1); 4115.LSEH_info_ecp_nistz256_avx2_select_wX: 4116 .byte 0x01,0x36,0x17,0x0b 4117 .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 4118 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 4119 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 4120 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 4121 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 4122 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 4123 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 4124 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 4125 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 4126 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 4127 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 4128 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 4129 .align 8 4130___ 4131$code.=<<___; 4132.LSEH_info_ecp_nistz256_point_double: 4133 .byte 9,0,0,0 4134 .rva full_handler 4135 .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[] 4136 .long 32*5+56,0 4137.LSEH_info_ecp_nistz256_point_add: 4138 .byte 9,0,0,0 4139 .rva full_handler 4140 .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[] 4141 .long 32*18+56,0 4142.LSEH_info_ecp_nistz256_point_add_affine: 4143 .byte 9,0,0,0 4144 .rva full_handler 4145 .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[] 4146 .long 32*15+56,0 4147___ 4148$code.=<<___ if ($addx); 4149.align 8 4150.LSEH_info_ecp_nistz256_point_doublex: 4151 .byte 9,0,0,0 4152 .rva full_handler 4153 .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[] 4154 .long 32*5+56,0 4155.LSEH_info_ecp_nistz256_point_addx: 4156 .byte 9,0,0,0 4157 .rva full_handler 4158 .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[] 4159 .long 32*18+56,0 4160.LSEH_info_ecp_nistz256_point_add_affinex: 4161 .byte 9,0,0,0 4162 .rva full_handler 4163 .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[] 4164 .long 32*15+56,0 4165___ 4166} 4167 4168$code =~ s/\`([^\`]*)\`/eval $1/gem; 4169print $code; 4170close STDOUT or die "error closing STDOUT: $!"; 4171