#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
# Copyright (c) 2015 CloudFlare, Inc.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
# (3) CloudFlare, Inc.
#
# Reference:
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
# 256 Bit Primes"

# Further optimization by <appro@openssl.org>:
#
#		this/original	with/without -DECP_NISTZ256_ASM(*)
# Opteron	+15-49%		+150-195%
# Bulldozer	+18-45%		+175-240%
# P4		+24-46%		+100-150%
# Westmere	+18-34%		+87-160%
# Sandy Bridge	+14-35%		+120-185%
# Ivy Bridge	+11-35%		+125-180%
# Haswell	+10-37%		+160-200%
# Broadwell	+24-58%		+210-270%
# Atom		+20-50%		+180-240%
# VIA Nano	+50-160%	+480-480%
#
# (*)	"without -DECP_NISTZ256_ASM" refers to build with
#	"enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. In the "this/original" column the lower coefficient is
# for ECDSA sign, while in "with/without" it is for ECDH key agreement;
# the higher one is for ECDSA sign, the relatively fastest server-side
# operation. Keep in mind that +100% means a 2x improvement.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$avx = 2;
$addx = 1;

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

# The polynomial
.align 64
.Lpoly:
.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

.LOne:
.long 1,1,1,1,1,1,1,1
.LTwo:
.long 2,2,2,2,2,2,2,2
.LThree:
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe

# Constants for computations modulo ord(p256)
.Lord:
.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
.LordK:
.quad 0xccd1c8aaee00bc4f
___

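# Notes on the constants above:
#
#   .Lpoly     - the NIST P-256 prime  p = 2^256 - 2^224 + 2^192 + 2^96 - 1
#   .LONE_mont - 1 in Montgomery form, i.e. 2^256 mod p
#   .LOne/.LTwo/.LThree - broadcast dword constants used by the constant-time
#                table-select routines further down
#   .Lord      - the order n of the P-256 group
#   .LordK     - -n^(-1) mod 2^64, the word-wise Montgomery constant for
#                reduction modulo n
#
# An informal way to cross-check .LordK (illustrative sketch only, kept as a
# comment so it has no effect on the generated code; assumes Math::BigInt):
#
#   use Math::BigInt;
#   my $two64 = Math::BigInt->new(2)->bpow(64);
#   my $n     = Math::BigInt->from_hex("ffffffff00000000ffffffffffffffff".
#                                      "bce6faada7179e84f3b9cac2fc632551");
#   my $k     = $two64 - $n->copy->bmodinv($two64);   # -n^(-1) mod 2^64
#   print $k->as_hex(), "\n";                         # expect 0xccd1c8aaee00bc4f
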
{
my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");

$code.=<<___;

################################################################################
# void nistz256_neg(uint64_t res[4], uint64_t a[4]);
.globl	nistz256_neg
.type	nistz256_neg,\@function,2
.align	32
nistz256_neg:
.cfi_startproc
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
.Lneg_body:

	xor	$a0, $a0
	xor	$a1, $a1
	xor	$a2, $a2
	xor	$a3, $a3
	xor	$t4, $t4

	sub	8*0($a_ptr), $a0
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a0, $t0
	sbb	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr
	mov	$a1, $t1
	sbb	\$0, $t4

	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc	8*1($a_ptr), $a1
	adc	8*2($a_ptr), $a2
	mov	$a3, $t3
	adc	8*3($a_ptr), $a3
	test	$t4, $t4

	cmovz	$t0, $a0
	cmovz	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovz	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovz	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	mov	0(%rsp),%r13
.cfi_restore	%r13
	mov	8(%rsp),%r12
.cfi_restore	%r12
	lea	16(%rsp),%rsp
.cfi_adjust_cfa_offset	-16
.Lneg_epilogue:
	ret
.cfi_endproc
.size	nistz256_neg,.-nistz256_neg
___
}
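# nistz256_neg computes res = -a mod p in constant time. Informally, for a
# fully reduced input a:
#
#   t   = 0 - a  (mod 2^256)     # sets the borrow flag unless a == 0
#   res = t + p  (mod 2^256)     # equals p - a whenever a != 0
#   if a == 0: res = t = 0       # selected with cmovz on the saved borrow,
#                                # so there is no secret-dependent branch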
{
my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
my ($poly1,$poly3)=($acc6,$acc7);

$code.=<<___;
################################################################################
# void p256_scalar_mul_mont(
#   uint64_t res[4],
#   uint64_t a[4],
#   uint64_t b[4]);

.globl	p256_scalar_mul_mont
.type	p256_scalar_mul_mont,\@function,3
.align	32
p256_scalar_mul_mont:
.cfi_startproc
___
$code.=<<___	if ($addx);
	leaq	OPENSSL_ia32cap_P(%rip), %rcx
	mov	8(%rcx), %rcx
	and	\$0x80100, %ecx
	cmp	\$0x80100, %ecx
	je	.Lecp_nistz256_ord_mul_montx
___
$code.=<<___;
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lord_mul_body:

	mov	8*0($b_org), %rax
	mov	$b_org, $b_ptr
	lea	.Lord(%rip), %r14
	mov	.LordK(%rip), %r15

	################################# * b[0]
	mov	%rax, $t0
	mulq	8*0($a_ptr)
	mov	%rax, $acc0
	mov	$t0, %rax
	mov	%rdx, $acc1

	mulq	8*1($a_ptr)
	add	%rax, $acc1
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc2

	mulq	8*2($a_ptr)
	add	%rax, $acc2
	mov	$t0, %rax
	adc	\$0, %rdx

	mov	$acc0, $acc5
	imulq	%r15,$acc0

	mov	%rdx, $acc3
	mulq	8*3($a_ptr)
	add	%rax, $acc3
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc4

	################################# First reduction step
	mulq	8*0(%r14)
	mov	$acc0, $t1
	add	%rax, $acc5		# guaranteed to be zero
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	sub	$acc0, $acc2
	sbb	\$0, $acc0		# can't borrow

	mulq	8*1(%r14)
	add	$t0, $acc1
	adc	\$0, %rdx
	add	%rax, $acc1
	mov	$t1, %rax
	adc	%rdx, $acc2
	mov	$t1, %rdx
	adc	\$0, $acc0		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc3
	mov	8*1($b_ptr), %rax
	sbb	%rdx, $t1		# can't borrow

	add	$acc0, $acc3
	adc	$t1, $acc4
	adc	\$0, $acc5

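	#################################
	# The block above is one step of word-by-word Montgomery
	# reduction: with k = .LordK = -ord^(-1) mod 2^64 and
	# m = acc0*k mod 2^64, adding m*ord makes the sum divisible
	# by 2^64, so the lowest limb drops out and the accumulator
	# effectively shifts down by one limb. Because
	#   ord[2] = 2^64 - 1   and   ord[3] = 2^64 - 2^32,
	# the products m*ord[2] and m*ord[3] reduce to the sub/sbb and
	# the shl/shr-by-32 sequence instead of two more mulq's.
	# The same pattern repeats after each of b[1], b[2] and b[3].
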
	################################# * b[1]
	mov	%rax, $t0
	mulq	8*0($a_ptr)
	add	%rax, $acc1
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*1($a_ptr)
	add	$t1, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*2($a_ptr)
	add	$t1, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t0, %rax
	adc	\$0, %rdx

	mov	$acc1, $t0
	imulq	%r15, $acc1

	mov	%rdx, $t1
	mulq	8*3($a_ptr)
	add	$t1, $acc4
	adc	\$0, %rdx
	xor	$acc0, $acc0
	add	%rax, $acc4
	mov	$acc1, %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0

	################################# Second reduction step
	mulq	8*0(%r14)
	mov	$acc1, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	$acc1, %rax
	adc	%rdx, $t0

	sub	$acc1, $acc3
	sbb	\$0, $acc1		# can't borrow

	mulq	8*1(%r14)
	add	$t0, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$t1, %rax
	adc	%rdx, $acc3
	mov	$t1, %rdx
	adc	\$0, $acc1		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc4
	mov	8*2($b_ptr), %rax
	sbb	%rdx, $t1		# can't borrow

	add	$acc1, $acc4
	adc	$t1, $acc5
	adc	\$0, $acc0

	################################# * b[2]
	mov	%rax, $t0
	mulq	8*0($a_ptr)
	add	%rax, $acc2
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*1($a_ptr)
	add	$t1, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*2($a_ptr)
	add	$t1, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t0, %rax
	adc	\$0, %rdx

	mov	$acc2, $t0
	imulq	%r15, $acc2

	mov	%rdx, $t1
	mulq	8*3($a_ptr)
	add	$t1, $acc5
	adc	\$0, %rdx
	xor	$acc1, $acc1
	add	%rax, $acc5
	mov	$acc2, %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1

	################################# Third reduction step
	mulq	8*0(%r14)
	mov	$acc2, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	$acc2, %rax
	adc	%rdx, $t0

	sub	$acc2, $acc4
	sbb	\$0, $acc2		# can't borrow

	mulq	8*1(%r14)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	%rdx, $acc4
	mov	$t1, %rdx
	adc	\$0, $acc2		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc5
	mov	8*3($b_ptr), %rax
	sbb	%rdx, $t1		# can't borrow

	add	$acc2, $acc5
	adc	$t1, $acc0
	adc	\$0, $acc1

	################################# * b[3]
	mov	%rax, $t0
	mulq	8*0($a_ptr)
	add	%rax, $acc3
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*1($a_ptr)
	add	$t1, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*2($a_ptr)
	add	$t1, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$t0, %rax
	adc	\$0, %rdx

	mov	$acc3, $t0
	imulq	%r15, $acc3

	mov	%rdx, $t1
	mulq	8*3($a_ptr)
	add	$t1, $acc0
	adc	\$0, %rdx
	xor	$acc2, $acc2
	add	%rax, $acc0
	mov	$acc3, %rax
	adc	%rdx, $acc1
	adc	\$0, $acc2

	################################# Last reduction step
	mulq	8*0(%r14)
	mov	$acc3, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	$acc3, %rax
	adc	%rdx, $t0

	sub	$acc3, $acc5
	sbb	\$0, $acc3		# can't borrow

	mulq	8*1(%r14)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	%rdx, $acc5
	mov	$t1, %rdx
	adc	\$0, $acc3		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc0
	sbb	%rdx, $t1		# can't borrow

	add	$acc3, $acc0
	adc	$t1, $acc1
	adc	\$0, $acc2

	################################# Subtract ord
	mov	$acc4, $a_ptr
	sub	8*0(%r14), $acc4
	mov	$acc5, $acc3
	sbb	8*1(%r14), $acc5
	mov	$acc0, $t0
	sbb	8*2(%r14), $acc0
	mov	$acc1, $t1
	sbb	8*3(%r14), $acc1
	sbb	\$0, $acc2

	cmovc	$a_ptr, $acc4
	cmovc	$acc3, $acc5
	cmovc	$t0, $acc0
	cmovc	$t1, $acc1

	mov	$acc4, 8*0($r_ptr)
	mov	$acc5, 8*1($r_ptr)
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	mov	40(%rsp),%rbp
.cfi_restore	%rbp
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lord_mul_epilogue:
	ret
.cfi_endproc
.size	p256_scalar_mul_mont,.-p256_scalar_mul_mont

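################################################################################
# Both scalar routines work in the Montgomery domain with R = 2^256:
# p256_scalar_mul_mont returns a*b/R mod ord, so fully reduced Montgomery
# inputs give a Montgomery output. The repeated-squaring routine below
# squares in place, so an input a*R mod ord becomes a^(2^rep)*R mod ord
# after rep iterations; callers typically chain such calls to exponentiate
# modulo the group order (e.g. for inversion) without ever leaving
# Montgomery form.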
################################################################################
# void p256_scalar_sqr_rep_mont(
#   uint64_t res[4],
#   uint64_t a[4],
#   uint64_t rep);

.globl	p256_scalar_sqr_rep_mont
.type	p256_scalar_sqr_rep_mont,\@function,3
.align	32
p256_scalar_sqr_rep_mont:
.cfi_startproc
___
$code.=<<___	if ($addx);
	leaq	OPENSSL_ia32cap_P(%rip), %rcx
	mov	8(%rcx), %rcx
	and	\$0x80100, %ecx
	cmp	\$0x80100, %ecx
	je	.Lecp_nistz256_ord_sqr_montx
___
$code.=<<___;
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lord_sqr_body:

	mov	8*0($a_ptr), $acc0
	mov	8*1($a_ptr), %rax
	mov	8*2($a_ptr), $acc6
	mov	8*3($a_ptr), $acc7
	lea	.Lord(%rip), $a_ptr	# pointer to modulus
	mov	$b_org, $b_ptr
	jmp	.Loop_ord_sqr

.align	32
.Loop_ord_sqr:
	################################# a[1:] * a[0]
	mov	%rax, $t1		# put aside a[1]
	mul	$acc0			# a[1] * a[0]
	mov	%rax, $acc1
	movq	$t1, %xmm1		# offload a[1]
	mov	$acc6, %rax
	mov	%rdx, $acc2

	mul	$acc0			# a[2] * a[0]
	add	%rax, $acc2
	mov	$acc7, %rax
	movq	$acc6, %xmm2		# offload a[2]
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mul	$acc0			# a[3] * a[0]
	add	%rax, $acc3
	mov	$acc7, %rax
	movq	$acc7, %xmm3		# offload a[3]
	adc	\$0, %rdx
	mov	%rdx, $acc4

	################################# a[3] * a[2]
	mul	$acc6			# a[3] * a[2]
	mov	%rax, $acc5
	mov	$acc6, %rax
	mov	%rdx, $acc6

	################################# a[2:] * a[1]
	mul	$t1			# a[2] * a[1]
	add	%rax, $acc3
	mov	$acc7, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc7

	mul	$t1			# a[3] * a[1]
	add	%rax, $acc4
	adc	\$0, %rdx

	add	$acc7, $acc4
	adc	%rdx, $acc5
	adc	\$0, $acc6		# can't overflow

	################################# *2
	xor	$acc7, $acc7
	mov	$acc0, %rax
	add	$acc1, $acc1
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	adc	$acc4, $acc4
	adc	$acc5, $acc5
	adc	$acc6, $acc6
	adc	\$0, $acc7

	################################# Missing products
	mul	%rax			# a[0] * a[0]
	mov	%rax, $acc0
	movq	%xmm1, %rax
	mov	%rdx, $t1

	mul	%rax			# a[1] * a[1]
	add	$t1, $acc1
	adc	%rax, $acc2
	movq	%xmm2, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mul	%rax			# a[2] * a[2]
	add	$t1, $acc3
	adc	%rax, $acc4
	movq	%xmm3, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mov	$acc0, $t0
	imulq	8*4($a_ptr), $acc0	# *= .LordK

	mul	%rax			# a[3] * a[3]
	add	$t1, $acc5
	adc	%rax, $acc6
	mov	8*0($a_ptr), %rax	# modulus[0]
	adc	%rdx, $acc7		# can't overflow

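	#################################
	# The half-products above implement the usual squaring shortcut:
	#   (a[0] + a[1]*2^64 + a[2]*2^128 + a[3]*2^192)^2
	#     = sum_i a[i]^2 * 2^(128*i)
	#       + 2 * sum_{i<j} a[i]*a[j] * 2^(64*(i+j))
	# i.e. the cross products are computed once and doubled, and the
	# squares a[i]^2 ("missing products") are added in afterwards.
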
	################################# First reduction step
	mul	$acc0
	mov	$acc0, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	8*1($a_ptr), %rax	# modulus[1]
	adc	%rdx, $t0

	sub	$acc0, $acc2
	sbb	\$0, $t1		# can't borrow

	mul	$acc0
	add	$t0, $acc1
	adc	\$0, %rdx
	add	%rax, $acc1
	mov	$acc0, %rax
	adc	%rdx, $acc2
	mov	$acc0, %rdx
	adc	\$0, $t1		# can't overflow

	mov	$acc1, $t0
	imulq	8*4($a_ptr), $acc1	# *= .LordK

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc3
	mov	8*0($a_ptr), %rax
	sbb	%rdx, $acc0		# can't borrow

	add	$t1, $acc3
	adc	\$0, $acc0		# can't overflow

	################################# Second reduction step
	mul	$acc1
	mov	$acc1, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	8*1($a_ptr), %rax
	adc	%rdx, $t0

	sub	$acc1, $acc3
	sbb	\$0, $t1		# can't borrow

	mul	$acc1
	add	$t0, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$acc1, %rax
	adc	%rdx, $acc3
	mov	$acc1, %rdx
	adc	\$0, $t1		# can't overflow

	mov	$acc2, $t0
	imulq	8*4($a_ptr), $acc2	# *= .LordK

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc0
	mov	8*0($a_ptr), %rax
	sbb	%rdx, $acc1		# can't borrow

	add	$t1, $acc0
	adc	\$0, $acc1		# can't overflow

	################################# Third reduction step
	mul	$acc2
	mov	$acc2, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	8*1($a_ptr), %rax
	adc	%rdx, $t0

	sub	$acc2, $acc0
	sbb	\$0, $t1		# can't borrow

	mul	$acc2
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$acc2, %rax
	adc	%rdx, $acc0
	mov	$acc2, %rdx
	adc	\$0, $t1		# can't overflow

	mov	$acc3, $t0
	imulq	8*4($a_ptr), $acc3	# *= .LordK

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc1
	mov	8*0($a_ptr), %rax
	sbb	%rdx, $acc2		# can't borrow

	add	$t1, $acc1
	adc	\$0, $acc2		# can't overflow

	################################# Last reduction step
	mul	$acc3
	mov	$acc3, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	8*1($a_ptr), %rax
	adc	%rdx, $t0

	sub	$acc3, $acc1
	sbb	\$0, $t1		# can't borrow

	mul	$acc3
	add	$t0, $acc0
	adc	\$0, %rdx
	add	%rax, $acc0
	mov	$acc3, %rax
	adc	%rdx, $acc1
	mov	$acc3, %rdx
	adc	\$0, $t1		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc2
	sbb	%rdx, $acc3		# can't borrow

	add	$t1, $acc2
	adc	\$0, $acc3		# can't overflow

	################################# Add bits [511:256] of the sqr result
	xor	%rdx, %rdx
	add	$acc4, $acc0
	adc	$acc5, $acc1
	mov	$acc0, $acc4
	adc	$acc6, $acc2
	adc	$acc7, $acc3
	mov	$acc1, %rax
	adc	\$0, %rdx

	################################# Compare to modulus
	sub	8*0($a_ptr), $acc0
	mov	$acc2, $acc6
	sbb	8*1($a_ptr), $acc1
	sbb	8*2($a_ptr), $acc2
	mov	$acc3, $acc7
	sbb	8*3($a_ptr), $acc3
	sbb	\$0, %rdx

	cmovc	$acc4, $acc0
	cmovnc	$acc1, %rax
	cmovnc	$acc2, $acc6
	cmovnc	$acc3, $acc7

	dec	$b_ptr
	jnz	.Loop_ord_sqr

	mov	$acc0, 8*0($r_ptr)
	mov	%rax, 8*1($r_ptr)
	pxor	%xmm1, %xmm1
	mov	$acc6, 8*2($r_ptr)
	pxor	%xmm2, %xmm2
	mov	$acc7, 8*3($r_ptr)
	pxor	%xmm3, %xmm3

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	mov	40(%rsp),%rbp
.cfi_restore	%rbp
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lord_sqr_epilogue:
	ret
.cfi_endproc
.size	p256_scalar_sqr_rep_mont,.-p256_scalar_sqr_rep_mont
___

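# The montx variants below are emitted only when $addx is set and are taken
# at run time when the extended-feature word of OPENSSL_ia32cap_P (loaded
# from offset 8 above) has both bits of the 0x80100 mask set, i.e. when the
# CPU advertises BMI2 (mulx) and ADX (adcx/adox).
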
$code.=<<___	if ($addx);
################################################################################
.type	ecp_nistz256_ord_mul_montx,\@function,3
.align	32
ecp_nistz256_ord_mul_montx:
.cfi_startproc
.Lecp_nistz256_ord_mul_montx:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lord_mulx_body:

	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rdx
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4
	lea	-128($a_ptr), $a_ptr	# control u-op density
	lea	.Lord-128(%rip), %r14
	mov	.LordK(%rip), %r15

	################################# Multiply by b[0]
	mulx	$acc1, $acc0, $acc1
	mulx	$acc2, $t0, $acc2
	mulx	$acc3, $t1, $acc3
	add	$t0, $acc1
	mulx	$acc4, $t0, $acc4
	mov	$acc0, %rdx
	mulx	%r15, %rdx, %rax
	adc	$t1, $acc2
	adc	$t0, $acc3
	adc	\$0, $acc4

	################################# reduction
	xor	$acc5, $acc5		# $acc5=0, cf=0, of=0
	mulx	8*0+128(%r14), $t0, $t1
	adcx	$t0, $acc0		# guaranteed to be zero
	adox	$t1, $acc1

	mulx	8*1+128(%r14), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2

	mulx	8*2+128(%r14), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*3+128(%r14), $t0, $t1
	mov	8*1($b_ptr), %rdx
	adcx	$t0, $acc3
	adox	$t1, $acc4
	adcx	$acc0, $acc4
	adox	$acc0, $acc5
	adc	\$0, $acc5		# cf=0, of=0

	################################# Multiply by b[1]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc1, %rdx
	mulx	%r15, %rdx, %rax
	adcx	$t0, $acc4
	adox	$t1, $acc5

	adcx	$acc0, $acc5
	adox	$acc0, $acc0
	adc	\$0, $acc0		# cf=0, of=0

	################################# reduction
	mulx	8*0+128(%r14), $t0, $t1
	adcx	$t0, $acc1		# guaranteed to be zero
	adox	$t1, $acc2

	mulx	8*1+128(%r14), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*2+128(%r14), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*3+128(%r14), $t0, $t1
	mov	8*2($b_ptr), %rdx
	adcx	$t0, $acc4
	adox	$t1, $acc5
	adcx	$acc1, $acc5
	adox	$acc1, $acc0
	adc	\$0, $acc0		# cf=0, of=0

	################################# Multiply by b[2]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc2, %rdx
	mulx	%r15, %rdx, %rax
	adcx	$t0, $acc5
	adox	$t1, $acc0

	adcx	$acc1, $acc0
	adox	$acc1, $acc1
	adc	\$0, $acc1		# cf=0, of=0

	################################# reduction
	mulx	8*0+128(%r14), $t0, $t1
	adcx	$t0, $acc2		# guaranteed to be zero
	adox	$t1, $acc3

	mulx	8*1+128(%r14), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*2+128(%r14), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*3+128(%r14), $t0, $t1
	mov	8*3($b_ptr), %rdx
	adcx	$t0, $acc5
	adox	$t1, $acc0
	adcx	$acc2, $acc0
	adox	$acc2, $acc1
	adc	\$0, $acc1		# cf=0, of=0

	################################# Multiply by b[3]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc5
	adox	$t1, $acc0

	mulx	8*3+128($a_ptr), $t0, $t1
	mov	$acc3, %rdx
	mulx	%r15, %rdx, %rax
	adcx	$t0, $acc0
	adox	$t1, $acc1

	adcx	$acc2, $acc1
	adox	$acc2, $acc2
	adc	\$0, $acc2		# cf=0, of=0

	################################# reduction
	mulx	8*0+128(%r14), $t0, $t1
	adcx	$t0, $acc3		# guaranteed to be zero
	adox	$t1, $acc4

	mulx	8*1+128(%r14), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*2+128(%r14), $t0, $t1
	adcx	$t0, $acc5
	adox	$t1, $acc0

	mulx	8*3+128(%r14), $t0, $t1
	lea	128(%r14),%r14
	mov	$acc4, $t2
	adcx	$t0, $acc0
	adox	$t1, $acc1
	mov	$acc5, $t3
	adcx	$acc3, $acc1
	adox	$acc3, $acc2
	adc	\$0, $acc2

	#################################
	# Branch-less conditional subtraction of the modulus
	mov	$acc0, $t0
	sub	8*0(%r14), $acc4
	sbb	8*1(%r14), $acc5
	sbb	8*2(%r14), $acc0
	mov	$acc1, $t1
	sbb	8*3(%r14), $acc1
	sbb	\$0, $acc2

	cmovc	$t2, $acc4
	cmovc	$t3, $acc5
	cmovc	$t0, $acc0
	cmovc	$t1, $acc1

	mov	$acc4, 8*0($r_ptr)
	mov	$acc5, 8*1($r_ptr)
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	mov	40(%rsp),%rbp
.cfi_restore	%rbp
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lord_mulx_epilogue:
	ret
.cfi_endproc
.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx

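################################################################################
# The montx code relies on BMI2/ADX: mulx does not touch the flags, and
# adcx/adox maintain two independent carry chains (CF and OF), so the
# multiply-accumulate and the reduction interleave without the flag
# shuffling the mulq path needs. The -128 bias applied to the input pointer
# (together with the +128 in every displacement) does not change which
# bytes are addressed; it is only an encoding trick to "control u-op
# density", as the original author puts it.
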
.type	ecp_nistz256_ord_sqr_montx,\@function,3
.align	32
ecp_nistz256_ord_sqr_montx:
.cfi_startproc
.Lecp_nistz256_ord_sqr_montx:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lord_sqrx_body:

	mov	$b_org, $b_ptr
	mov	8*0($a_ptr), %rdx
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0
	lea	.Lord(%rip), $a_ptr
	jmp	.Loop_ord_sqrx

.align	32
.Loop_ord_sqrx:
	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
	mov	%rdx, %rax		# offload a[0]
	movq	$acc6, %xmm1		# offload a[1]
	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
	mov	$acc6, %rdx
	add	$t0, $acc2
	movq	$acc7, %xmm2		# offload a[2]
	adc	$t1, $acc3
	adc	\$0, $acc4
	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
	#################################
	mulx	$acc7, $t0, $t1		# a[1]*a[2]
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	$acc0, $t0, $t1		# a[1]*a[3]
	mov	$acc7, %rdx
	adcx	$t0, $acc4
	adox	$t1, $acc5
	adc	\$0, $acc5
	#################################
	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
	mov	%rax, %rdx
	movq	$acc0, %xmm3		# offload a[3]
	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
	adcx	$acc1, $acc1		# acc1:6<<1
	adox	$t0, $acc5
	adcx	$acc2, $acc2
	adox	$acc7, $acc6		# of=0

	################################# a[i]*a[i]
	mulx	%rdx, $acc0, $t1
	movq	%xmm1, %rdx
	adcx	$acc3, $acc3
	adox	$t1, $acc1
	adcx	$acc4, $acc4
	mulx	%rdx, $t0, $t4
	movq	%xmm2, %rdx
	adcx	$acc5, $acc5
	adox	$t0, $acc2
	adcx	$acc6, $acc6
	mulx	%rdx, $t0, $t1
	.byte	0x67
	movq	%xmm3, %rdx
	adox	$t4, $acc3
	adcx	$acc7, $acc7
	adox	$t0, $acc4
	adox	$t1, $acc5
	mulx	%rdx, $t0, $t4
	adox	$t0, $acc6
	adox	$t4, $acc7

	################################# reduction
	mov	$acc0, %rdx
	mulx	8*4($a_ptr), %rdx, $t0

	xor	%rax, %rax		# cf=0, of=0
	mulx	8*0($a_ptr), $t0, $t1
	adcx	$t0, $acc0		# guaranteed to be zero
	adox	$t1, $acc1
	mulx	8*1($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2
	mulx	8*2($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3
	mulx	8*3($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc0		# of=0
	adcx	%rax, $acc0		# cf=0

	#################################
	mov	$acc1, %rdx
	mulx	8*4($a_ptr), %rdx, $t0

	mulx	8*0($a_ptr), $t0, $t1
	adox	$t0, $acc1		# guaranteed to be zero
	adcx	$t1, $acc2
	mulx	8*1($a_ptr), $t0, $t1
	adox	$t0, $acc2
	adcx	$t1, $acc3
	mulx	8*2($a_ptr), $t0, $t1
	adox	$t0, $acc3
	adcx	$t1, $acc0
	mulx	8*3($a_ptr), $t0, $t1
	adox	$t0, $acc0
	adcx	$t1, $acc1		# cf=0
	adox	%rax, $acc1		# of=0

	#################################
	mov	$acc2, %rdx
	mulx	8*4($a_ptr), %rdx, $t0

	mulx	8*0($a_ptr), $t0, $t1
	adcx	$t0, $acc2		# guaranteed to be zero
	adox	$t1, $acc3
	mulx	8*1($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc0
	mulx	8*2($a_ptr), $t0, $t1
	adcx	$t0, $acc0
	adox	$t1, $acc1
	mulx	8*3($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2		# of=0
	adcx	%rax, $acc2		# cf=0

	#################################
	mov	$acc3, %rdx
	mulx	8*4($a_ptr), %rdx, $t0

	mulx	8*0($a_ptr), $t0, $t1
	adox	$t0, $acc3		# guaranteed to be zero
	adcx	$t1, $acc0
	mulx	8*1($a_ptr), $t0, $t1
	adox	$t0, $acc0
	adcx	$t1, $acc1
	mulx	8*2($a_ptr), $t0, $t1
	adox	$t0, $acc1
	adcx	$t1, $acc2
	mulx	8*3($a_ptr), $t0, $t1
	adox	$t0, $acc2
	adcx	$t1, $acc3
	adox	%rax, $acc3

	################################# accumulate upper half
	add	$acc0, $acc4		# add $acc4, $acc0
	adc	$acc5, $acc1
	mov	$acc4, %rdx
	adc	$acc6, $acc2
	adc	$acc7, $acc3
	mov	$acc1, $acc6
	adc	\$0, %rax

	################################# compare to modulus
	sub	8*0($a_ptr), $acc4
	mov	$acc2, $acc7
	sbb	8*1($a_ptr), $acc1
	sbb	8*2($a_ptr), $acc2
	mov	$acc3, $acc0
	sbb	8*3($a_ptr), $acc3
	sbb	\$0, %rax

	cmovnc	$acc4, %rdx
	cmovnc	$acc1, $acc6
	cmovnc	$acc2, $acc7
	cmovnc	$acc3, $acc0

	dec	$b_ptr
	jnz	.Loop_ord_sqrx

	mov	%rdx, 8*0($r_ptr)
	mov	$acc6, 8*1($r_ptr)
	pxor	%xmm1, %xmm1
	mov	$acc7, 8*2($r_ptr)
	pxor	%xmm2, %xmm2
	mov	$acc0, 8*3($r_ptr)
	pxor	%xmm3, %xmm3

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	mov	40(%rsp),%rbp
.cfi_restore	%rbp
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lord_sqrx_epilogue:
	ret
.cfi_endproc
.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
___

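# The field routines below (p256_mul_mont, p256_sqr_mont and helpers) use
# the same word-by-word Montgomery reduction, but modulo the field prime
#   p = 2^256 - 2^224 + 2^192 + 2^96 - 1.
# Since p = -1 mod 2^64, the Montgomery constant -p^(-1) mod 2^64 is 1, so
# no per-limb multiplication by a magic constant is needed, and because
#   p[0] + p[1]*2^64 = 2^96 - 1
# each reduction step needs only shifts, adds and one mulq by p[3].
# A reference model of what the multiplication computes (illustrative
# sketch only, kept as a comment so it has no effect on the generated code;
# assumes Math::BigInt):
#
#   use Math::BigInt;
#   sub p256_mont_mul_ref {                   # a*b*R^(-1) mod p, R = 2^256
#       my ($a, $b) = @_;                     # Math::BigInt operands
#       my $p = Math::BigInt->from_hex("ffffffff00000001"."0000000000000000".
#                                      "00000000ffffffff"."ffffffffffffffff");
#       my $Rinv = Math::BigInt->new(2)->bpow(256)->bmodinv($p);
#       return $a * $b * $Rinv % $p;
#   }
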
$code.=<<___;
################################################################################
# void p256_mul_mont(
#   uint64_t res[4],
#   uint64_t a[4],
#   uint64_t b[4]);

.globl	p256_mul_mont
.type	p256_mul_mont,\@function,3
.align	32
p256_mul_mont:
.cfi_startproc
___
$code.=<<___	if ($addx);
	leaq	OPENSSL_ia32cap_P(%rip), %rcx
	mov	8(%rcx), %rcx
	and	\$0x80100, %ecx
___
$code.=<<___;
.Lmul_mont:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lmul_body:
___
$code.=<<___	if ($addx);
	cmp	\$0x80100, %ecx
	je	.Lmul_montx
___
$code.=<<___;
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rax
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4

	call	__ecp_nistz256_mul_montq
___
$code.=<<___	if ($addx);
	jmp	.Lmul_mont_done

.align	32
.Lmul_montx:
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rdx
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4
	lea	-128($a_ptr), $a_ptr	# control u-op density

	call	__ecp_nistz256_mul_montx
___
$code.=<<___;
.Lmul_mont_done:
	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	mov	40(%rsp),%rbp
.cfi_restore	%rbp
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lmul_epilogue:
	ret
.cfi_endproc
.size	p256_mul_mont,.-p256_mul_mont

.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_montq:
.cfi_startproc
	########################################################################
	# Multiply a by b[0]
	mov	%rax, $t1
	mulq	$acc1
	mov	.Lpoly+8*1(%rip),$poly1
	mov	%rax, $acc0
	mov	$t1, %rax
	mov	%rdx, $acc1

	mulq	$acc2
	mov	.Lpoly+8*3(%rip),$poly3
	add	%rax, $acc1
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc2

	mulq	$acc3
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mulq	$acc4
	add	%rax, $acc3
	mov	$acc0, %rax
	adc	\$0, %rdx
	xor	$acc5, $acc5
	mov	%rdx, $acc4

	########################################################################
	# First reduction step
	# Basically now we want to multiply acc[0] by p256,
	# and add the result to the acc.
1357 # Due to the special form of p256 we do some optimizations 1358 # 1359 # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] 1360 # then we add acc[0] and get acc[0] x 2^96 1361 1362 mov $acc0, $t1 1363 shl \$32, $acc0 1364 mulq $poly3 1365 shr \$32, $t1 1366 add $acc0, $acc1 # +=acc[0]<<96 1367 adc $t1, $acc2 1368 adc %rax, $acc3 1369 mov 8*1($b_ptr), %rax 1370 adc %rdx, $acc4 1371 adc \$0, $acc5 1372 xor $acc0, $acc0 1373 1374 ######################################################################## 1375 # Multiply by b[1] 1376 mov %rax, $t1 1377 mulq 8*0($a_ptr) 1378 add %rax, $acc1 1379 mov $t1, %rax 1380 adc \$0, %rdx 1381 mov %rdx, $t0 1382 1383 mulq 8*1($a_ptr) 1384 add $t0, $acc2 1385 adc \$0, %rdx 1386 add %rax, $acc2 1387 mov $t1, %rax 1388 adc \$0, %rdx 1389 mov %rdx, $t0 1390 1391 mulq 8*2($a_ptr) 1392 add $t0, $acc3 1393 adc \$0, %rdx 1394 add %rax, $acc3 1395 mov $t1, %rax 1396 adc \$0, %rdx 1397 mov %rdx, $t0 1398 1399 mulq 8*3($a_ptr) 1400 add $t0, $acc4 1401 adc \$0, %rdx 1402 add %rax, $acc4 1403 mov $acc1, %rax 1404 adc %rdx, $acc5 1405 adc \$0, $acc0 1406 1407 ######################################################################## 1408 # Second reduction step 1409 mov $acc1, $t1 1410 shl \$32, $acc1 1411 mulq $poly3 1412 shr \$32, $t1 1413 add $acc1, $acc2 1414 adc $t1, $acc3 1415 adc %rax, $acc4 1416 mov 8*2($b_ptr), %rax 1417 adc %rdx, $acc5 1418 adc \$0, $acc0 1419 xor $acc1, $acc1 1420 1421 ######################################################################## 1422 # Multiply by b[2] 1423 mov %rax, $t1 1424 mulq 8*0($a_ptr) 1425 add %rax, $acc2 1426 mov $t1, %rax 1427 adc \$0, %rdx 1428 mov %rdx, $t0 1429 1430 mulq 8*1($a_ptr) 1431 add $t0, $acc3 1432 adc \$0, %rdx 1433 add %rax, $acc3 1434 mov $t1, %rax 1435 adc \$0, %rdx 1436 mov %rdx, $t0 1437 1438 mulq 8*2($a_ptr) 1439 add $t0, $acc4 1440 adc \$0, %rdx 1441 add %rax, $acc4 1442 mov $t1, %rax 1443 adc \$0, %rdx 1444 mov %rdx, $t0 1445 1446 mulq 8*3($a_ptr) 1447 add $t0, $acc5 1448 adc \$0, %rdx 1449 add %rax, $acc5 1450 mov $acc2, %rax 1451 adc %rdx, $acc0 1452 adc \$0, $acc1 1453 1454 ######################################################################## 1455 # Third reduction step 1456 mov $acc2, $t1 1457 shl \$32, $acc2 1458 mulq $poly3 1459 shr \$32, $t1 1460 add $acc2, $acc3 1461 adc $t1, $acc4 1462 adc %rax, $acc5 1463 mov 8*3($b_ptr), %rax 1464 adc %rdx, $acc0 1465 adc \$0, $acc1 1466 xor $acc2, $acc2 1467 1468 ######################################################################## 1469 # Multiply by b[3] 1470 mov %rax, $t1 1471 mulq 8*0($a_ptr) 1472 add %rax, $acc3 1473 mov $t1, %rax 1474 adc \$0, %rdx 1475 mov %rdx, $t0 1476 1477 mulq 8*1($a_ptr) 1478 add $t0, $acc4 1479 adc \$0, %rdx 1480 add %rax, $acc4 1481 mov $t1, %rax 1482 adc \$0, %rdx 1483 mov %rdx, $t0 1484 1485 mulq 8*2($a_ptr) 1486 add $t0, $acc5 1487 adc \$0, %rdx 1488 add %rax, $acc5 1489 mov $t1, %rax 1490 adc \$0, %rdx 1491 mov %rdx, $t0 1492 1493 mulq 8*3($a_ptr) 1494 add $t0, $acc0 1495 adc \$0, %rdx 1496 add %rax, $acc0 1497 mov $acc3, %rax 1498 adc %rdx, $acc1 1499 adc \$0, $acc2 1500 1501 ######################################################################## 1502 # Final reduction step 1503 mov $acc3, $t1 1504 shl \$32, $acc3 1505 mulq $poly3 1506 shr \$32, $t1 1507 add $acc3, $acc4 1508 adc $t1, $acc5 1509 mov $acc4, $t0 1510 adc %rax, $acc0 1511 adc %rdx, $acc1 1512 mov $acc5, $t1 1513 adc \$0, $acc2 1514 1515 ######################################################################## 1516 # Branch-less conditional subtraction of 
P 1517 sub \$-1, $acc4 # .Lpoly[0] 1518 mov $acc0, $t2 1519 sbb $poly1, $acc5 # .Lpoly[1] 1520 sbb \$0, $acc0 # .Lpoly[2] 1521 mov $acc1, $t3 1522 sbb $poly3, $acc1 # .Lpoly[3] 1523 sbb \$0, $acc2 1524 1525 cmovc $t0, $acc4 1526 cmovc $t1, $acc5 1527 mov $acc4, 8*0($r_ptr) 1528 cmovc $t2, $acc0 1529 mov $acc5, 8*1($r_ptr) 1530 cmovc $t3, $acc1 1531 mov $acc0, 8*2($r_ptr) 1532 mov $acc1, 8*3($r_ptr) 1533 1534 ret 1535.cfi_endproc 1536.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq 1537 1538################################################################################ 1539# void p256_sqr_mont( 1540# uint64_t res[4], 1541# uint64_t a[4]); 1542 1543# we optimize the square according to S.Gueron and V.Krasnov, 1544# "Speeding up Big-Number Squaring" 1545.globl p256_sqr_mont 1546.type p256_sqr_mont,\@function,2 1547.align 32 1548p256_sqr_mont: 1549.cfi_startproc 1550___ 1551$code.=<<___ if ($addx); 1552 leaq OPENSSL_ia32cap_P(%rip), %rcx 1553 mov 8(%rcx), %rcx 1554 and \$0x80100, %ecx 1555___ 1556$code.=<<___; 1557 push %rbp 1558.cfi_push %rbp 1559 push %rbx 1560.cfi_push %rbx 1561 push %r12 1562.cfi_push %r12 1563 push %r13 1564.cfi_push %r13 1565 push %r14 1566.cfi_push %r14 1567 push %r15 1568.cfi_push %r15 1569.Lsqr_body: 1570___ 1571$code.=<<___ if ($addx); 1572 cmp \$0x80100, %ecx 1573 je .Lsqr_montx 1574___ 1575$code.=<<___; 1576 mov 8*0($a_ptr), %rax 1577 mov 8*1($a_ptr), $acc6 1578 mov 8*2($a_ptr), $acc7 1579 mov 8*3($a_ptr), $acc0 1580 1581 call __ecp_nistz256_sqr_montq 1582___ 1583$code.=<<___ if ($addx); 1584 jmp .Lsqr_mont_done 1585 1586.align 32 1587.Lsqr_montx: 1588 mov 8*0($a_ptr), %rdx 1589 mov 8*1($a_ptr), $acc6 1590 mov 8*2($a_ptr), $acc7 1591 mov 8*3($a_ptr), $acc0 1592 lea -128($a_ptr), $a_ptr # control u-op density 1593 1594 call __ecp_nistz256_sqr_montx 1595___ 1596$code.=<<___; 1597.Lsqr_mont_done: 1598 mov 0(%rsp),%r15 1599.cfi_restore %r15 1600 mov 8(%rsp),%r14 1601.cfi_restore %r14 1602 mov 16(%rsp),%r13 1603.cfi_restore %r13 1604 mov 24(%rsp),%r12 1605.cfi_restore %r12 1606 mov 32(%rsp),%rbx 1607.cfi_restore %rbx 1608 mov 40(%rsp),%rbp 1609.cfi_restore %rbp 1610 lea 48(%rsp),%rsp 1611.cfi_adjust_cfa_offset -48 1612.Lsqr_epilogue: 1613 ret 1614.cfi_endproc 1615.size p256_sqr_mont,.-p256_sqr_mont 1616 1617.type __ecp_nistz256_sqr_montq,\@abi-omnipotent 1618.align 32 1619__ecp_nistz256_sqr_montq: 1620.cfi_startproc 1621 mov %rax, $acc5 1622 mulq $acc6 # a[1]*a[0] 1623 mov %rax, $acc1 1624 mov $acc7, %rax 1625 mov %rdx, $acc2 1626 1627 mulq $acc5 # a[0]*a[2] 1628 add %rax, $acc2 1629 mov $acc0, %rax 1630 adc \$0, %rdx 1631 mov %rdx, $acc3 1632 1633 mulq $acc5 # a[0]*a[3] 1634 add %rax, $acc3 1635 mov $acc7, %rax 1636 adc \$0, %rdx 1637 mov %rdx, $acc4 1638 1639 ################################# 1640 mulq $acc6 # a[1]*a[2] 1641 add %rax, $acc3 1642 mov $acc0, %rax 1643 adc \$0, %rdx 1644 mov %rdx, $t1 1645 1646 mulq $acc6 # a[1]*a[3] 1647 add %rax, $acc4 1648 mov $acc0, %rax 1649 adc \$0, %rdx 1650 add $t1, $acc4 1651 mov %rdx, $acc5 1652 adc \$0, $acc5 1653 1654 ################################# 1655 mulq $acc7 # a[2]*a[3] 1656 xor $acc7, $acc7 1657 add %rax, $acc5 1658 mov 8*0($a_ptr), %rax 1659 mov %rdx, $acc6 1660 adc \$0, $acc6 1661 1662 add $acc1, $acc1 # acc1:6<<1 1663 adc $acc2, $acc2 1664 adc $acc3, $acc3 1665 adc $acc4, $acc4 1666 adc $acc5, $acc5 1667 adc $acc6, $acc6 1668 adc \$0, $acc7 1669 1670 mulq %rax 1671 mov %rax, $acc0 1672 mov 8*1($a_ptr), %rax 1673 mov %rdx, $t0 1674 1675 mulq %rax 1676 add $t0, $acc1 1677 adc %rax, $acc2 1678 mov 
8*2($a_ptr), %rax 1679 adc \$0, %rdx 1680 mov %rdx, $t0 1681 1682 mulq %rax 1683 add $t0, $acc3 1684 adc %rax, $acc4 1685 mov 8*3($a_ptr), %rax 1686 adc \$0, %rdx 1687 mov %rdx, $t0 1688 1689 mulq %rax 1690 add $t0, $acc5 1691 adc %rax, $acc6 1692 mov $acc0, %rax 1693 adc %rdx, $acc7 1694 1695 mov .Lpoly+8*1(%rip), $a_ptr 1696 mov .Lpoly+8*3(%rip), $t1 1697 1698 ########################################## 1699 # Now the reduction 1700 # First iteration 1701 mov $acc0, $t0 1702 shl \$32, $acc0 1703 mulq $t1 1704 shr \$32, $t0 1705 add $acc0, $acc1 # +=acc[0]<<96 1706 adc $t0, $acc2 1707 adc %rax, $acc3 1708 mov $acc1, %rax 1709 adc \$0, %rdx 1710 1711 ########################################## 1712 # Second iteration 1713 mov $acc1, $t0 1714 shl \$32, $acc1 1715 mov %rdx, $acc0 1716 mulq $t1 1717 shr \$32, $t0 1718 add $acc1, $acc2 1719 adc $t0, $acc3 1720 adc %rax, $acc0 1721 mov $acc2, %rax 1722 adc \$0, %rdx 1723 1724 ########################################## 1725 # Third iteration 1726 mov $acc2, $t0 1727 shl \$32, $acc2 1728 mov %rdx, $acc1 1729 mulq $t1 1730 shr \$32, $t0 1731 add $acc2, $acc3 1732 adc $t0, $acc0 1733 adc %rax, $acc1 1734 mov $acc3, %rax 1735 adc \$0, %rdx 1736 1737 ########################################### 1738 # Last iteration 1739 mov $acc3, $t0 1740 shl \$32, $acc3 1741 mov %rdx, $acc2 1742 mulq $t1 1743 shr \$32, $t0 1744 add $acc3, $acc0 1745 adc $t0, $acc1 1746 adc %rax, $acc2 1747 adc \$0, %rdx 1748 xor $acc3, $acc3 1749 1750 ############################################ 1751 # Add the rest of the acc 1752 add $acc0, $acc4 1753 adc $acc1, $acc5 1754 mov $acc4, $acc0 1755 adc $acc2, $acc6 1756 adc %rdx, $acc7 1757 mov $acc5, $acc1 1758 adc \$0, $acc3 1759 1760 sub \$-1, $acc4 # .Lpoly[0] 1761 mov $acc6, $acc2 1762 sbb $a_ptr, $acc5 # .Lpoly[1] 1763 sbb \$0, $acc6 # .Lpoly[2] 1764 mov $acc7, $t0 1765 sbb $t1, $acc7 # .Lpoly[3] 1766 sbb \$0, $acc3 1767 1768 cmovc $acc0, $acc4 1769 cmovc $acc1, $acc5 1770 mov $acc4, 8*0($r_ptr) 1771 cmovc $acc2, $acc6 1772 mov $acc5, 8*1($r_ptr) 1773 cmovc $t0, $acc7 1774 mov $acc6, 8*2($r_ptr) 1775 mov $acc7, 8*3($r_ptr) 1776 1777 ret 1778.cfi_endproc 1779.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq 1780___ 1781 1782if ($addx) { 1783$code.=<<___; 1784.type __ecp_nistz256_mul_montx,\@abi-omnipotent 1785.align 32 1786__ecp_nistz256_mul_montx: 1787.cfi_startproc 1788 ######################################################################## 1789 # Multiply by b[0] 1790 mulx $acc1, $acc0, $acc1 1791 mulx $acc2, $t0, $acc2 1792 mov \$32, $poly1 1793 xor $acc5, $acc5 # cf=0 1794 mulx $acc3, $t1, $acc3 1795 mov .Lpoly+8*3(%rip), $poly3 1796 adc $t0, $acc1 1797 mulx $acc4, $t0, $acc4 1798 mov $acc0, %rdx 1799 adc $t1, $acc2 1800 shlx $poly1,$acc0,$t1 1801 adc $t0, $acc3 1802 shrx $poly1,$acc0,$t0 1803 adc \$0, $acc4 1804 1805 ######################################################################## 1806 # First reduction step 1807 add $t1, $acc1 1808 adc $t0, $acc2 1809 1810 mulx $poly3, $t0, $t1 1811 mov 8*1($b_ptr), %rdx 1812 adc $t0, $acc3 1813 adc $t1, $acc4 1814 adc \$0, $acc5 1815 xor $acc0, $acc0 # $acc0=0,cf=0,of=0 1816 1817 ######################################################################## 1818 # Multiply by b[1] 1819 mulx 8*0+128($a_ptr), $t0, $t1 1820 adcx $t0, $acc1 1821 adox $t1, $acc2 1822 1823 mulx 8*1+128($a_ptr), $t0, $t1 1824 adcx $t0, $acc2 1825 adox $t1, $acc3 1826 1827 mulx 8*2+128($a_ptr), $t0, $t1 1828 adcx $t0, $acc3 1829 adox $t1, $acc4 1830 1831 mulx 8*3+128($a_ptr), $t0, $t1 1832 mov $acc1, 
%rdx 1833 adcx $t0, $acc4 1834 shlx $poly1, $acc1, $t0 1835 adox $t1, $acc5 1836 shrx $poly1, $acc1, $t1 1837 1838 adcx $acc0, $acc5 1839 adox $acc0, $acc0 1840 adc \$0, $acc0 1841 1842 ######################################################################## 1843 # Second reduction step 1844 add $t0, $acc2 1845 adc $t1, $acc3 1846 1847 mulx $poly3, $t0, $t1 1848 mov 8*2($b_ptr), %rdx 1849 adc $t0, $acc4 1850 adc $t1, $acc5 1851 adc \$0, $acc0 1852 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 1853 1854 ######################################################################## 1855 # Multiply by b[2] 1856 mulx 8*0+128($a_ptr), $t0, $t1 1857 adcx $t0, $acc2 1858 adox $t1, $acc3 1859 1860 mulx 8*1+128($a_ptr), $t0, $t1 1861 adcx $t0, $acc3 1862 adox $t1, $acc4 1863 1864 mulx 8*2+128($a_ptr), $t0, $t1 1865 adcx $t0, $acc4 1866 adox $t1, $acc5 1867 1868 mulx 8*3+128($a_ptr), $t0, $t1 1869 mov $acc2, %rdx 1870 adcx $t0, $acc5 1871 shlx $poly1, $acc2, $t0 1872 adox $t1, $acc0 1873 shrx $poly1, $acc2, $t1 1874 1875 adcx $acc1, $acc0 1876 adox $acc1, $acc1 1877 adc \$0, $acc1 1878 1879 ######################################################################## 1880 # Third reduction step 1881 add $t0, $acc3 1882 adc $t1, $acc4 1883 1884 mulx $poly3, $t0, $t1 1885 mov 8*3($b_ptr), %rdx 1886 adc $t0, $acc5 1887 adc $t1, $acc0 1888 adc \$0, $acc1 1889 xor $acc2, $acc2 # $acc2=0,cf=0,of=0 1890 1891 ######################################################################## 1892 # Multiply by b[3] 1893 mulx 8*0+128($a_ptr), $t0, $t1 1894 adcx $t0, $acc3 1895 adox $t1, $acc4 1896 1897 mulx 8*1+128($a_ptr), $t0, $t1 1898 adcx $t0, $acc4 1899 adox $t1, $acc5 1900 1901 mulx 8*2+128($a_ptr), $t0, $t1 1902 adcx $t0, $acc5 1903 adox $t1, $acc0 1904 1905 mulx 8*3+128($a_ptr), $t0, $t1 1906 mov $acc3, %rdx 1907 adcx $t0, $acc0 1908 shlx $poly1, $acc3, $t0 1909 adox $t1, $acc1 1910 shrx $poly1, $acc3, $t1 1911 1912 adcx $acc2, $acc1 1913 adox $acc2, $acc2 1914 adc \$0, $acc2 1915 1916 ######################################################################## 1917 # Fourth reduction step 1918 add $t0, $acc4 1919 adc $t1, $acc5 1920 1921 mulx $poly3, $t0, $t1 1922 mov $acc4, $t2 1923 mov .Lpoly+8*1(%rip), $poly1 1924 adc $t0, $acc0 1925 mov $acc5, $t3 1926 adc $t1, $acc1 1927 adc \$0, $acc2 1928 1929 ######################################################################## 1930 # Branch-less conditional subtraction of P 1931 xor %eax, %eax 1932 mov $acc0, $t0 1933 sbb \$-1, $acc4 # .Lpoly[0] 1934 sbb $poly1, $acc5 # .Lpoly[1] 1935 sbb \$0, $acc0 # .Lpoly[2] 1936 mov $acc1, $t1 1937 sbb $poly3, $acc1 # .Lpoly[3] 1938 sbb \$0, $acc2 1939 1940 cmovc $t2, $acc4 1941 cmovc $t3, $acc5 1942 mov $acc4, 8*0($r_ptr) 1943 cmovc $t0, $acc0 1944 mov $acc5, 8*1($r_ptr) 1945 cmovc $t1, $acc1 1946 mov $acc0, 8*2($r_ptr) 1947 mov $acc1, 8*3($r_ptr) 1948 1949 ret 1950.cfi_endproc 1951.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx 1952 1953.type __ecp_nistz256_sqr_montx,\@abi-omnipotent 1954.align 32 1955__ecp_nistz256_sqr_montx: 1956.cfi_startproc 1957 mulx $acc6, $acc1, $acc2 # a[0]*a[1] 1958 mulx $acc7, $t0, $acc3 # a[0]*a[2] 1959 xor %eax, %eax 1960 adc $t0, $acc2 1961 mulx $acc0, $t1, $acc4 # a[0]*a[3] 1962 mov $acc6, %rdx 1963 adc $t1, $acc3 1964 adc \$0, $acc4 1965 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 1966 1967 ################################# 1968 mulx $acc7, $t0, $t1 # a[1]*a[2] 1969 adcx $t0, $acc3 1970 adox $t1, $acc4 1971 1972 mulx $acc0, $t0, $t1 # a[1]*a[3] 1973 mov $acc7, %rdx 1974 adcx $t0, $acc4 1975 adox $t1, $acc5 1976 adc 
\$0, $acc5 1977 1978 ################################# 1979 mulx $acc0, $t0, $acc6 # a[2]*a[3] 1980 mov 8*0+128($a_ptr), %rdx 1981 xor $acc7, $acc7 # $acc7=0,cf=0,of=0 1982 adcx $acc1, $acc1 # acc1:6<<1 1983 adox $t0, $acc5 1984 adcx $acc2, $acc2 1985 adox $acc7, $acc6 # of=0 1986 1987 mulx %rdx, $acc0, $t1 1988 mov 8*1+128($a_ptr), %rdx 1989 adcx $acc3, $acc3 1990 adox $t1, $acc1 1991 adcx $acc4, $acc4 1992 mulx %rdx, $t0, $t4 1993 mov 8*2+128($a_ptr), %rdx 1994 adcx $acc5, $acc5 1995 adox $t0, $acc2 1996 adcx $acc6, $acc6 1997 .byte 0x67 1998 mulx %rdx, $t0, $t1 1999 mov 8*3+128($a_ptr), %rdx 2000 adox $t4, $acc3 2001 adcx $acc7, $acc7 2002 adox $t0, $acc4 2003 mov \$32, $a_ptr 2004 adox $t1, $acc5 2005 .byte 0x67,0x67 2006 mulx %rdx, $t0, $t4 2007 mov .Lpoly+8*3(%rip), %rdx 2008 adox $t0, $acc6 2009 shlx $a_ptr, $acc0, $t0 2010 adox $t4, $acc7 2011 shrx $a_ptr, $acc0, $t4 2012 mov %rdx,$t1 2013 2014 # reduction step 1 2015 add $t0, $acc1 2016 adc $t4, $acc2 2017 2018 mulx $acc0, $t0, $acc0 2019 adc $t0, $acc3 2020 shlx $a_ptr, $acc1, $t0 2021 adc \$0, $acc0 2022 shrx $a_ptr, $acc1, $t4 2023 2024 # reduction step 2 2025 add $t0, $acc2 2026 adc $t4, $acc3 2027 2028 mulx $acc1, $t0, $acc1 2029 adc $t0, $acc0 2030 shlx $a_ptr, $acc2, $t0 2031 adc \$0, $acc1 2032 shrx $a_ptr, $acc2, $t4 2033 2034 # reduction step 3 2035 add $t0, $acc3 2036 adc $t4, $acc0 2037 2038 mulx $acc2, $t0, $acc2 2039 adc $t0, $acc1 2040 shlx $a_ptr, $acc3, $t0 2041 adc \$0, $acc2 2042 shrx $a_ptr, $acc3, $t4 2043 2044 # reduction step 4 2045 add $t0, $acc0 2046 adc $t4, $acc1 2047 2048 mulx $acc3, $t0, $acc3 2049 adc $t0, $acc2 2050 adc \$0, $acc3 2051 2052 xor $t3, $t3 2053 add $acc0, $acc4 # accumulate upper half 2054 mov .Lpoly+8*1(%rip), $a_ptr 2055 adc $acc1, $acc5 2056 mov $acc4, $acc0 2057 adc $acc2, $acc6 2058 adc $acc3, $acc7 2059 mov $acc5, $acc1 2060 adc \$0, $t3 2061 2062 sub \$-1, $acc4 # .Lpoly[0] 2063 mov $acc6, $acc2 2064 sbb $a_ptr, $acc5 # .Lpoly[1] 2065 sbb \$0, $acc6 # .Lpoly[2] 2066 mov $acc7, $acc3 2067 sbb $t1, $acc7 # .Lpoly[3] 2068 sbb \$0, $t3 2069 2070 cmovc $acc0, $acc4 2071 cmovc $acc1, $acc5 2072 mov $acc4, 8*0($r_ptr) 2073 cmovc $acc2, $acc6 2074 mov $acc5, 8*1($r_ptr) 2075 cmovc $acc3, $acc7 2076 mov $acc6, 8*2($r_ptr) 2077 mov $acc7, 8*3($r_ptr) 2078 2079 ret 2080.cfi_endproc 2081.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx 2082___ 2083} 2084} 2085{ 2086my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2087my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); 2088my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); 2089my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); 2090 2091$code.=<<___; 2092################################################################################ 2093# void nistz256_select_w5(uint64_t *val, uint64_t *in_t, crypto_word index); 2094.globl nistz256_select_w5 2095.type nistz256_select_w5,\@abi-omnipotent 2096.align 32 2097nistz256_select_w5: 2098.cfi_startproc 2099___ 2100$code.=<<___ if ($avx>1); 2101 leaq OPENSSL_ia32cap_P(%rip), %rax 2102 mov 8(%rax), %rax 2103 test \$`1<<5`, %eax 2104 jnz .Lavx2_select_w5 2105___ 2106$code.=<<___ if ($win64); 2107 lea -0x88(%rsp), %rax 2108.LSEH_begin_nistz256_select_w5: 2109 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 2110 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 2111 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 2112 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) 2113 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 
0x10(%rax) 2114 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 2115 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 2116 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 2117 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 2118 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 2119 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 2120___ 2121$code.=<<___; 2122 movdqa .LOne(%rip), $ONE 2123 movd $index, $INDEX 2124 2125 pxor $Ra, $Ra 2126 pxor $Rb, $Rb 2127 pxor $Rc, $Rc 2128 pxor $Rd, $Rd 2129 pxor $Re, $Re 2130 pxor $Rf, $Rf 2131 2132 movdqa $ONE, $M0 2133 pshufd \$0, $INDEX, $INDEX 2134 2135 mov \$16, %rax 2136.Lselect_loop_sse_w5: 2137 2138 movdqa $M0, $TMP0 2139 paddd $ONE, $M0 2140 pcmpeqd $INDEX, $TMP0 2141 2142 movdqa 16*0($in_t), $T0a 2143 movdqa 16*1($in_t), $T0b 2144 movdqa 16*2($in_t), $T0c 2145 movdqa 16*3($in_t), $T0d 2146 movdqa 16*4($in_t), $T0e 2147 movdqa 16*5($in_t), $T0f 2148 lea 16*6($in_t), $in_t 2149 2150 pand $TMP0, $T0a 2151 pand $TMP0, $T0b 2152 por $T0a, $Ra 2153 pand $TMP0, $T0c 2154 por $T0b, $Rb 2155 pand $TMP0, $T0d 2156 por $T0c, $Rc 2157 pand $TMP0, $T0e 2158 por $T0d, $Rd 2159 pand $TMP0, $T0f 2160 por $T0e, $Re 2161 por $T0f, $Rf 2162 2163 dec %rax 2164 jnz .Lselect_loop_sse_w5 2165 2166 movdqu $Ra, 16*0($val) 2167 movdqu $Rb, 16*1($val) 2168 movdqu $Rc, 16*2($val) 2169 movdqu $Rd, 16*3($val) 2170 movdqu $Re, 16*4($val) 2171 movdqu $Rf, 16*5($val) 2172___ 2173$code.=<<___ if ($win64); 2174 movaps (%rsp), %xmm6 2175 movaps 0x10(%rsp), %xmm7 2176 movaps 0x20(%rsp), %xmm8 2177 movaps 0x30(%rsp), %xmm9 2178 movaps 0x40(%rsp), %xmm10 2179 movaps 0x50(%rsp), %xmm11 2180 movaps 0x60(%rsp), %xmm12 2181 movaps 0x70(%rsp), %xmm13 2182 movaps 0x80(%rsp), %xmm14 2183 movaps 0x90(%rsp), %xmm15 2184 lea 0xa8(%rsp), %rsp 2185___ 2186$code.=<<___; 2187 ret 2188.cfi_endproc 2189.LSEH_end_nistz256_select_w5: 2190.size nistz256_select_w5,.-nistz256_select_w5 2191 2192################################################################################ 2193# void nistz256_select_w7(uint64_t *val, uint64_t *in_t, crypto_word index); 2194.globl nistz256_select_w7 2195.type nistz256_select_w7,\@abi-omnipotent 2196.align 32 2197nistz256_select_w7: 2198.cfi_startproc 2199___ 2200$code.=<<___ if ($avx>1); 2201 leaq OPENSSL_ia32cap_P(%rip), %rax 2202 mov 8(%rax), %rax 2203 test \$`1<<5`, %eax 2204 jnz .Lavx2_select_w7 2205___ 2206$code.=<<___ if ($win64); 2207 lea -0x88(%rsp), %rax 2208.LSEH_begin_nistz256_select_w7: 2209 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 2210 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 2211 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 2212 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) 2213 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) 2214 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 2215 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 2216 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 2217 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 2218 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 2219 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 2220___ 2221$code.=<<___; 2222 movdqa .LOne(%rip), $M0 2223 movd $index, $INDEX 2224 2225 pxor $Ra, $Ra 2226 pxor $Rb, $Rb 2227 pxor $Rc, $Rc 2228 pxor $Rd, $Rd 2229 2230 movdqa $M0, $ONE 2231 pshufd \$0, $INDEX, $INDEX 2232 mov \$64, %rax 2233 2234.Lselect_loop_sse_w7: 2235 movdqa $M0, $TMP0 2236 paddd $ONE, $M0 2237 movdqa 16*0($in_t), $T0a 2238 movdqa 16*1($in_t), 
$T0b 2239 pcmpeqd $INDEX, $TMP0 2240 movdqa 16*2($in_t), $T0c 2241 movdqa 16*3($in_t), $T0d 2242 lea 16*4($in_t), $in_t 2243 2244 pand $TMP0, $T0a 2245 pand $TMP0, $T0b 2246 por $T0a, $Ra 2247 pand $TMP0, $T0c 2248 por $T0b, $Rb 2249 pand $TMP0, $T0d 2250 por $T0c, $Rc 2251 prefetcht0 255($in_t) 2252 por $T0d, $Rd 2253 2254 dec %rax 2255 jnz .Lselect_loop_sse_w7 2256 2257 movdqu $Ra, 16*0($val) 2258 movdqu $Rb, 16*1($val) 2259 movdqu $Rc, 16*2($val) 2260 movdqu $Rd, 16*3($val) 2261___ 2262$code.=<<___ if ($win64); 2263 movaps (%rsp), %xmm6 2264 movaps 0x10(%rsp), %xmm7 2265 movaps 0x20(%rsp), %xmm8 2266 movaps 0x30(%rsp), %xmm9 2267 movaps 0x40(%rsp), %xmm10 2268 movaps 0x50(%rsp), %xmm11 2269 movaps 0x60(%rsp), %xmm12 2270 movaps 0x70(%rsp), %xmm13 2271 movaps 0x80(%rsp), %xmm14 2272 movaps 0x90(%rsp), %xmm15 2273 lea 0xa8(%rsp), %rsp 2274___ 2275$code.=<<___; 2276 ret 2277.cfi_endproc 2278.LSEH_end_nistz256_select_w7: 2279.size nistz256_select_w7,.-nistz256_select_w7 2280___ 2281} 2282if ($avx>1) { 2283my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2284my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); 2285my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); 2286my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); 2287 2288$code.=<<___; 2289################################################################################ 2290# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index); 2291.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent 2292.align 32 2293ecp_nistz256_avx2_select_w5: 2294.cfi_startproc 2295.Lavx2_select_w5: 2296 vzeroupper 2297___ 2298$code.=<<___ if ($win64); 2299 lea -0x88(%rsp), %rax 2300 mov %rsp,%r11 2301.LSEH_begin_ecp_nistz256_avx2_select_w5: 2302 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp 2303 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) 2304 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) 2305 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) 2306 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) 2307 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) 2308 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) 2309 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) 2310 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) 2311 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) 2312 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) 2313___ 2314$code.=<<___; 2315 vmovdqa .LTwo(%rip), $TWO 2316 2317 vpxor $Ra, $Ra, $Ra 2318 vpxor $Rb, $Rb, $Rb 2319 vpxor $Rc, $Rc, $Rc 2320 2321 vmovdqa .LOne(%rip), $M0 2322 vmovdqa .LTwo(%rip), $M1 2323 2324 vmovd $index, %xmm1 2325 vpermd $INDEX, $Ra, $INDEX 2326 2327 mov \$8, %rax 2328.Lselect_loop_avx2_w5: 2329 2330 vmovdqa 32*0($in_t), $T0a 2331 vmovdqa 32*1($in_t), $T0b 2332 vmovdqa 32*2($in_t), $T0c 2333 2334 vmovdqa 32*3($in_t), $T1a 2335 vmovdqa 32*4($in_t), $T1b 2336 vmovdqa 32*5($in_t), $T1c 2337 2338 vpcmpeqd $INDEX, $M0, $TMP0 2339 vpcmpeqd $INDEX, $M1, $TMP1 2340 2341 vpaddd $TWO, $M0, $M0 2342 vpaddd $TWO, $M1, $M1 2343 lea 32*6($in_t), $in_t 2344 2345 vpand $TMP0, $T0a, $T0a 2346 vpand $TMP0, $T0b, $T0b 2347 vpand $TMP0, $T0c, $T0c 2348 vpand $TMP1, $T1a, $T1a 2349 vpand $TMP1, $T1b, $T1b 2350 vpand $TMP1, $T1c, $T1c 2351 2352 vpxor $T0a, $Ra, $Ra 2353 vpxor $T0b, $Rb, $Rb 2354 vpxor $T0c, $Rc, $Rc 2355 vpxor $T1a, $Ra, $Ra 2356 vpxor $T1b, $Rb, $Rb 2357 vpxor $T1c, $Rc, $Rc 2358 2359 dec %rax 2360 jnz .Lselect_loop_avx2_w5 2361 2362 vmovdqu $Ra, 32*0($val) 2363 vmovdqu $Rb, 
32*1($val) 2364 vmovdqu $Rc, 32*2($val) 2365 vzeroupper 2366___ 2367$code.=<<___ if ($win64); 2368 movaps (%rsp), %xmm6 2369 movaps 0x10(%rsp), %xmm7 2370 movaps 0x20(%rsp), %xmm8 2371 movaps 0x30(%rsp), %xmm9 2372 movaps 0x40(%rsp), %xmm10 2373 movaps 0x50(%rsp), %xmm11 2374 movaps 0x60(%rsp), %xmm12 2375 movaps 0x70(%rsp), %xmm13 2376 movaps 0x80(%rsp), %xmm14 2377 movaps 0x90(%rsp), %xmm15 2378 lea (%r11), %rsp 2379___ 2380$code.=<<___; 2381 ret 2382.cfi_endproc 2383.LSEH_end_ecp_nistz256_avx2_select_w5: 2384.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 2385___ 2386} 2387if ($avx>1) { 2388my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2389my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); 2390my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); 2391my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); 2392my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); 2393 2394$code.=<<___; 2395 2396################################################################################ 2397# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index); 2398.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent 2399.align 32 2400ecp_nistz256_avx2_select_w7: 2401.cfi_startproc 2402.Lavx2_select_w7: 2403 vzeroupper 2404___ 2405$code.=<<___ if ($win64); 2406 mov %rsp,%r11 2407 lea -0x88(%rsp), %rax 2408.LSEH_begin_ecp_nistz256_avx2_select_w7: 2409 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp 2410 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) 2411 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) 2412 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) 2413 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) 2414 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) 2415 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) 2416 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) 2417 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) 2418 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) 2419 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) 2420___ 2421$code.=<<___; 2422 vmovdqa .LThree(%rip), $THREE 2423 2424 vpxor $Ra, $Ra, $Ra 2425 vpxor $Rb, $Rb, $Rb 2426 2427 vmovdqa .LOne(%rip), $M0 2428 vmovdqa .LTwo(%rip), $M1 2429 vmovdqa .LThree(%rip), $M2 2430 2431 vmovd $index, %xmm1 2432 vpermd $INDEX, $Ra, $INDEX 2433 # Skip index = 0, because it is implicitly the point at infinity 2434 2435 mov \$21, %rax 2436.Lselect_loop_avx2_w7: 2437 2438 vmovdqa 32*0($in_t), $T0a 2439 vmovdqa 32*1($in_t), $T0b 2440 2441 vmovdqa 32*2($in_t), $T1a 2442 vmovdqa 32*3($in_t), $T1b 2443 2444 vmovdqa 32*4($in_t), $T2a 2445 vmovdqa 32*5($in_t), $T2b 2446 2447 vpcmpeqd $INDEX, $M0, $TMP0 2448 vpcmpeqd $INDEX, $M1, $TMP1 2449 vpcmpeqd $INDEX, $M2, $TMP2 2450 2451 vpaddd $THREE, $M0, $M0 2452 vpaddd $THREE, $M1, $M1 2453 vpaddd $THREE, $M2, $M2 2454 lea 32*6($in_t), $in_t 2455 2456 vpand $TMP0, $T0a, $T0a 2457 vpand $TMP0, $T0b, $T0b 2458 vpand $TMP1, $T1a, $T1a 2459 vpand $TMP1, $T1b, $T1b 2460 vpand $TMP2, $T2a, $T2a 2461 vpand $TMP2, $T2b, $T2b 2462 2463 vpxor $T0a, $Ra, $Ra 2464 vpxor $T0b, $Rb, $Rb 2465 vpxor $T1a, $Ra, $Ra 2466 vpxor $T1b, $Rb, $Rb 2467 vpxor $T2a, $Ra, $Ra 2468 vpxor $T2b, $Rb, $Rb 2469 2470 dec %rax 2471 jnz .Lselect_loop_avx2_w7 2472 2473 2474 vmovdqa 32*0($in_t), $T0a 2475 vmovdqa 32*1($in_t), $T0b 2476 2477 vpcmpeqd $INDEX, $M0, $TMP0 2478 2479 vpand $TMP0, $T0a, $T0a 2480 vpand $TMP0, $T0b, $T0b 2481 2482 vpxor $T0a, $Ra, $Ra 2483 vpxor $T0b, $Rb, $Rb 2484 2485 vmovdqu $Ra, 
32*0($val) 2486 vmovdqu $Rb, 32*1($val) 2487 vzeroupper 2488___ 2489$code.=<<___ if ($win64); 2490 movaps (%rsp), %xmm6 2491 movaps 0x10(%rsp), %xmm7 2492 movaps 0x20(%rsp), %xmm8 2493 movaps 0x30(%rsp), %xmm9 2494 movaps 0x40(%rsp), %xmm10 2495 movaps 0x50(%rsp), %xmm11 2496 movaps 0x60(%rsp), %xmm12 2497 movaps 0x70(%rsp), %xmm13 2498 movaps 0x80(%rsp), %xmm14 2499 movaps 0x90(%rsp), %xmm15 2500 lea (%r11), %rsp 2501___ 2502$code.=<<___; 2503 ret 2504.cfi_endproc 2505.LSEH_end_ecp_nistz256_avx2_select_w7: 2506.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 2507___ 2508} 2509{{{ 2510######################################################################## 2511# This block implements higher level point_double, point_add and 2512# point_add_affine. The key to performance in this case is to allow 2513# out-of-order execution logic to overlap computations from next step 2514# with tail processing from current step. By using tailored calling 2515# sequence we minimize inter-step overhead to give processor better 2516# shot at overlapping operations... 2517# 2518# You will notice that input data is copied to stack. Trouble is that 2519# there are no registers to spare for holding original pointers and 2520# reloading them, pointers, would create undesired dependencies on 2521# effective addresses calculation paths. In other words it's too done 2522# to favour out-of-order execution logic. 2523# <appro@openssl.org> 2524 2525my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); 2526my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); 2527my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4); 2528my ($poly1,$poly3)=($acc6,$acc7); 2529 2530sub load_for_mul () { 2531my ($a,$b,$src0) = @_; 2532my $bias = $src0 eq "%rax" ? 0 : -128; 2533 2534" mov $b, $src0 2535 lea $b, $b_ptr 2536 mov 8*0+$a, $acc1 2537 mov 8*1+$a, $acc2 2538 lea $bias+$a, $a_ptr 2539 mov 8*2+$a, $acc3 2540 mov 8*3+$a, $acc4" 2541} 2542 2543sub load_for_sqr () { 2544my ($a,$src0) = @_; 2545my $bias = $src0 eq "%rax" ? 
0 : -128; 2546 2547" mov 8*0+$a, $src0 2548 mov 8*1+$a, $acc6 2549 lea $bias+$a, $a_ptr 2550 mov 8*2+$a, $acc7 2551 mov 8*3+$a, $acc0" 2552} 2553 2554 { 2555######################################################################## 2556# operate in 4-5-0-1 "name space" that matches multiplication output 2557# 2558my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 2559 2560$code.=<<___; 2561.type __ecp_nistz256_add_toq,\@abi-omnipotent 2562.align 32 2563__ecp_nistz256_add_toq: 2564.cfi_startproc 2565 xor $t4,$t4 2566 add 8*0($b_ptr), $a0 2567 adc 8*1($b_ptr), $a1 2568 mov $a0, $t0 2569 adc 8*2($b_ptr), $a2 2570 adc 8*3($b_ptr), $a3 2571 mov $a1, $t1 2572 adc \$0, $t4 2573 2574 sub \$-1, $a0 2575 mov $a2, $t2 2576 sbb $poly1, $a1 2577 sbb \$0, $a2 2578 mov $a3, $t3 2579 sbb $poly3, $a3 2580 sbb \$0, $t4 2581 2582 cmovc $t0, $a0 2583 cmovc $t1, $a1 2584 mov $a0, 8*0($r_ptr) 2585 cmovc $t2, $a2 2586 mov $a1, 8*1($r_ptr) 2587 cmovc $t3, $a3 2588 mov $a2, 8*2($r_ptr) 2589 mov $a3, 8*3($r_ptr) 2590 2591 ret 2592.cfi_endproc 2593.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq 2594 2595.type __ecp_nistz256_sub_fromq,\@abi-omnipotent 2596.align 32 2597__ecp_nistz256_sub_fromq: 2598.cfi_startproc 2599 sub 8*0($b_ptr), $a0 2600 sbb 8*1($b_ptr), $a1 2601 mov $a0, $t0 2602 sbb 8*2($b_ptr), $a2 2603 sbb 8*3($b_ptr), $a3 2604 mov $a1, $t1 2605 sbb $t4, $t4 2606 2607 add \$-1, $a0 2608 mov $a2, $t2 2609 adc $poly1, $a1 2610 adc \$0, $a2 2611 mov $a3, $t3 2612 adc $poly3, $a3 2613 test $t4, $t4 2614 2615 cmovz $t0, $a0 2616 cmovz $t1, $a1 2617 mov $a0, 8*0($r_ptr) 2618 cmovz $t2, $a2 2619 mov $a1, 8*1($r_ptr) 2620 cmovz $t3, $a3 2621 mov $a2, 8*2($r_ptr) 2622 mov $a3, 8*3($r_ptr) 2623 2624 ret 2625.cfi_endproc 2626.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq 2627 2628.type __ecp_nistz256_subq,\@abi-omnipotent 2629.align 32 2630__ecp_nistz256_subq: 2631.cfi_startproc 2632 sub $a0, $t0 2633 sbb $a1, $t1 2634 mov $t0, $a0 2635 sbb $a2, $t2 2636 sbb $a3, $t3 2637 mov $t1, $a1 2638 sbb $t4, $t4 2639 2640 add \$-1, $t0 2641 mov $t2, $a2 2642 adc $poly1, $t1 2643 adc \$0, $t2 2644 mov $t3, $a3 2645 adc $poly3, $t3 2646 test $t4, $t4 2647 2648 cmovnz $t0, $a0 2649 cmovnz $t1, $a1 2650 cmovnz $t2, $a2 2651 cmovnz $t3, $a3 2652 2653 ret 2654.cfi_endproc 2655.size __ecp_nistz256_subq,.-__ecp_nistz256_subq 2656 2657.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent 2658.align 32 2659__ecp_nistz256_mul_by_2q: 2660.cfi_startproc 2661 xor $t4, $t4 2662 add $a0, $a0 # a0:a3+a0:a3 2663 adc $a1, $a1 2664 mov $a0, $t0 2665 adc $a2, $a2 2666 adc $a3, $a3 2667 mov $a1, $t1 2668 adc \$0, $t4 2669 2670 sub \$-1, $a0 2671 mov $a2, $t2 2672 sbb $poly1, $a1 2673 sbb \$0, $a2 2674 mov $a3, $t3 2675 sbb $poly3, $a3 2676 sbb \$0, $t4 2677 2678 cmovc $t0, $a0 2679 cmovc $t1, $a1 2680 mov $a0, 8*0($r_ptr) 2681 cmovc $t2, $a2 2682 mov $a1, 8*1($r_ptr) 2683 cmovc $t3, $a3 2684 mov $a2, 8*2($r_ptr) 2685 mov $a3, 8*3($r_ptr) 2686 2687 ret 2688.cfi_endproc 2689.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q 2690___ 2691 } 2692sub gen_double () { 2693 my $x = shift; 2694 my ($src0,$sfx,$bias); 2695 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); 2696 2697 if ($x ne "x") { 2698 $src0 = "%rax"; 2699 $sfx = ""; 2700 $bias = 0; 2701 2702$code.=<<___; 2703.globl p256_point_double 2704.type p256_point_double,\@function,2 2705.align 32 2706p256_point_double: 2707.cfi_startproc 2708___ 2709$code.=<<___ if ($addx); 2710 leaq OPENSSL_ia32cap_P(%rip), %rcx 2711 mov 8(%rcx), %rcx 2712 and \$0x80100, %ecx 2713 cmp 
\$0x80100, %ecx 2714 je .Lpoint_doublex 2715___ 2716 } else { 2717 $src0 = "%rdx"; 2718 $sfx = "x"; 2719 $bias = 128; 2720 2721$code.=<<___; 2722.type p256_point_doublex,\@function,2 2723.align 32 2724p256_point_doublex: 2725.cfi_startproc 2726.Lpoint_doublex: 2727___ 2728 } 2729$code.=<<___; 2730 push %rbp 2731.cfi_push %rbp 2732 push %rbx 2733.cfi_push %rbx 2734 push %r12 2735.cfi_push %r12 2736 push %r13 2737.cfi_push %r13 2738 push %r14 2739.cfi_push %r14 2740 push %r15 2741.cfi_push %r15 2742 sub \$32*5+8, %rsp 2743.cfi_adjust_cfa_offset 32*5+8 2744.Lpoint_double${x}_body: 2745 2746.Lpoint_double_shortcut$x: 2747 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x 2748 mov $a_ptr, $b_ptr # backup copy 2749 movdqu 0x10($a_ptr), %xmm1 2750 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order 2751 mov 0x20+8*1($a_ptr), $acc5 2752 mov 0x20+8*2($a_ptr), $acc0 2753 mov 0x20+8*3($a_ptr), $acc1 2754 mov .Lpoly+8*1(%rip), $poly1 2755 mov .Lpoly+8*3(%rip), $poly3 2756 movdqa %xmm0, $in_x(%rsp) 2757 movdqa %xmm1, $in_x+0x10(%rsp) 2758 lea 0x20($r_ptr), $acc2 2759 lea 0x40($r_ptr), $acc3 2760 movq $r_ptr, %xmm0 2761 movq $acc2, %xmm1 2762 movq $acc3, %xmm2 2763 2764 lea $S(%rsp), $r_ptr 2765 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); 2766 2767 mov 0x40+8*0($a_ptr), $src0 2768 mov 0x40+8*1($a_ptr), $acc6 2769 mov 0x40+8*2($a_ptr), $acc7 2770 mov 0x40+8*3($a_ptr), $acc0 2771 lea 0x40-$bias($a_ptr), $a_ptr 2772 lea $Zsqr(%rsp), $r_ptr 2773 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); 2774 2775 `&load_for_sqr("$S(%rsp)", "$src0")` 2776 lea $S(%rsp), $r_ptr 2777 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); 2778 2779 mov 0x20($b_ptr), $src0 # $b_ptr is still valid 2780 mov 0x40+8*0($b_ptr), $acc1 2781 mov 0x40+8*1($b_ptr), $acc2 2782 mov 0x40+8*2($b_ptr), $acc3 2783 mov 0x40+8*3($b_ptr), $acc4 2784 lea 0x40-$bias($b_ptr), $a_ptr 2785 lea 0x20($b_ptr), $b_ptr 2786 movq %xmm2, $r_ptr 2787 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); 2788 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); 2789 2790 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 2791 mov $in_x+8*1(%rsp), $acc5 2792 lea $Zsqr(%rsp), $b_ptr 2793 mov $in_x+8*2(%rsp), $acc0 2794 mov $in_x+8*3(%rsp), $acc1 2795 lea $M(%rsp), $r_ptr 2796 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); 2797 2798 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 2799 mov $in_x+8*1(%rsp), $acc5 2800 lea $Zsqr(%rsp), $b_ptr 2801 mov $in_x+8*2(%rsp), $acc0 2802 mov $in_x+8*3(%rsp), $acc1 2803 lea $Zsqr(%rsp), $r_ptr 2804 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); 2805 2806 `&load_for_sqr("$S(%rsp)", "$src0")` 2807 movq %xmm1, $r_ptr 2808 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); 2809___ 2810{ 2811######## ecp_nistz256_div_by_2(res_y, res_y); ########################## 2812# operate in 4-5-6-7 "name space" that matches squaring output 2813# 2814my ($poly1,$poly3)=($a_ptr,$t1); 2815my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); 2816 2817$code.=<<___; 2818 xor $t4, $t4 2819 mov $a0, $t0 2820 add \$-1, $a0 2821 mov $a1, $t1 2822 adc $poly1, $a1 2823 mov $a2, $t2 2824 adc \$0, $a2 2825 mov $a3, $t3 2826 adc $poly3, $a3 2827 adc \$0, $t4 2828 xor $a_ptr, $a_ptr # borrow $a_ptr 2829 test \$1, $t0 2830 2831 cmovz $t0, $a0 2832 cmovz $t1, $a1 2833 cmovz $t2, $a2 2834 cmovz $t3, $a3 2835 cmovz $a_ptr, $t4 2836 2837 mov $a1, $t0 # a0:a3>>1 2838 shr \$1, $a0 2839 shl \$63, $t0 2840 mov $a2, $t1 2841 shr \$1, $a1 2842 or $t0, $a0 
2843 shl \$63, $t1 2844 mov $a3, $t2 2845 shr \$1, $a2 2846 or $t1, $a1 2847 shl \$63, $t2 2848 mov $a0, 8*0($r_ptr) 2849 shr \$1, $a3 2850 mov $a1, 8*1($r_ptr) 2851 shl \$63, $t4 2852 or $t2, $a2 2853 or $t4, $a3 2854 mov $a2, 8*2($r_ptr) 2855 mov $a3, 8*3($r_ptr) 2856___ 2857} 2858$code.=<<___; 2859 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` 2860 lea $M(%rsp), $r_ptr 2861 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); 2862 2863 lea $tmp0(%rsp), $r_ptr 2864 call __ecp_nistz256_mul_by_2$x 2865 2866 lea $M(%rsp), $b_ptr 2867 lea $M(%rsp), $r_ptr 2868 call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); 2869 2870 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` 2871 lea $S(%rsp), $r_ptr 2872 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); 2873 2874 lea $tmp0(%rsp), $r_ptr 2875 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); 2876 2877 `&load_for_sqr("$M(%rsp)", "$src0")` 2878 movq %xmm0, $r_ptr 2879 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); 2880 2881 lea $tmp0(%rsp), $b_ptr 2882 mov $acc6, $acc0 # harmonize sqr output and sub input 2883 mov $acc7, $acc1 2884 mov $a_ptr, $poly1 2885 mov $t1, $poly3 2886 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); 2887 2888 mov $S+8*0(%rsp), $t0 2889 mov $S+8*1(%rsp), $t1 2890 mov $S+8*2(%rsp), $t2 2891 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order 2892 lea $S(%rsp), $r_ptr 2893 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); 2894 2895 mov $M(%rsp), $src0 2896 lea $M(%rsp), $b_ptr 2897 mov $acc4, $acc6 # harmonize sub output and mul input 2898 xor %ecx, %ecx 2899 mov $acc4, $S+8*0(%rsp) # have to save:-( 2900 mov $acc5, $acc2 2901 mov $acc5, $S+8*1(%rsp) 2902 cmovz $acc0, $acc3 2903 mov $acc0, $S+8*2(%rsp) 2904 lea $S-$bias(%rsp), $a_ptr 2905 cmovz $acc1, $acc4 2906 mov $acc1, $S+8*3(%rsp) 2907 mov $acc6, $acc1 2908 lea $S(%rsp), $r_ptr 2909 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); 2910 2911 movq %xmm1, $b_ptr 2912 movq %xmm1, $r_ptr 2913 call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); 2914 2915 lea 32*5+56(%rsp), %rsi 2916.cfi_def_cfa %rsi,8 2917 mov -48(%rsi),%r15 2918.cfi_restore %r15 2919 mov -40(%rsi),%r14 2920.cfi_restore %r14 2921 mov -32(%rsi),%r13 2922.cfi_restore %r13 2923 mov -24(%rsi),%r12 2924.cfi_restore %r12 2925 mov -16(%rsi),%rbx 2926.cfi_restore %rbx 2927 mov -8(%rsi),%rbp 2928.cfi_restore %rbp 2929 lea (%rsi),%rsp 2930.cfi_def_cfa_register %rsp 2931.Lpoint_double${x}_epilogue: 2932 ret 2933.cfi_endproc 2934.size p256_point_double$sfx,.-p256_point_double$sfx 2935___ 2936} 2937&gen_double("q"); 2938 2939sub gen_add () { 2940 my $x = shift; 2941 my ($src0,$sfx,$bias); 2942 my ($H,$Hsqr,$R,$Rsqr,$Hcub, 2943 $U1,$U2,$S1,$S2, 2944 $res_x,$res_y,$res_z, 2945 $in1_x,$in1_y,$in1_z, 2946 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); 2947 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 2948 2949 if ($x ne "x") { 2950 $src0 = "%rax"; 2951 $sfx = ""; 2952 $bias = 0; 2953 2954$code.=<<___; 2955.globl p256_point_add 2956.type p256_point_add,\@function,3 2957.align 32 2958p256_point_add: 2959.cfi_startproc 2960___ 2961$code.=<<___ if ($addx); 2962 leaq OPENSSL_ia32cap_P(%rip), %rcx 2963 mov 8(%rcx), %rcx 2964 and \$0x80100, %ecx 2965 cmp \$0x80100, %ecx 2966 je .Lpoint_addx 2967___ 2968 } else { 2969 $src0 = "%rdx"; 2970 $sfx = "x"; 2971 $bias = 128; 2972 2973$code.=<<___; 2974.type p256_point_addx,\@function,3 2975.align 32 2976p256_point_addx: 2977.cfi_startproc 2978.Lpoint_addx: 2979___ 2980 } 2981$code.=<<___; 2982 push %rbp 2983.cfi_push %rbp 2984 push %rbx 
2985.cfi_push %rbx 2986 push %r12 2987.cfi_push %r12 2988 push %r13 2989.cfi_push %r13 2990 push %r14 2991.cfi_push %r14 2992 push %r15 2993.cfi_push %r15 2994 sub \$32*18+8, %rsp 2995.cfi_adjust_cfa_offset 32*18+8 2996.Lpoint_add${x}_body: 2997 2998 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 2999 movdqu 0x10($a_ptr), %xmm1 3000 movdqu 0x20($a_ptr), %xmm2 3001 movdqu 0x30($a_ptr), %xmm3 3002 movdqu 0x40($a_ptr), %xmm4 3003 movdqu 0x50($a_ptr), %xmm5 3004 mov $a_ptr, $b_ptr # reassign 3005 mov $b_org, $a_ptr # reassign 3006 movdqa %xmm0, $in1_x(%rsp) 3007 movdqa %xmm1, $in1_x+0x10(%rsp) 3008 movdqa %xmm2, $in1_y(%rsp) 3009 movdqa %xmm3, $in1_y+0x10(%rsp) 3010 movdqa %xmm4, $in1_z(%rsp) 3011 movdqa %xmm5, $in1_z+0x10(%rsp) 3012 por %xmm4, %xmm5 3013 3014 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr 3015 pshufd \$0xb1, %xmm5, %xmm3 3016 movdqu 0x10($a_ptr), %xmm1 3017 movdqu 0x20($a_ptr), %xmm2 3018 por %xmm3, %xmm5 3019 movdqu 0x30($a_ptr), %xmm3 3020 mov 0x40+8*0($a_ptr), $src0 # load original in2_z 3021 mov 0x40+8*1($a_ptr), $acc6 3022 mov 0x40+8*2($a_ptr), $acc7 3023 mov 0x40+8*3($a_ptr), $acc0 3024 movdqa %xmm0, $in2_x(%rsp) 3025 pshufd \$0x1e, %xmm5, %xmm4 3026 movdqa %xmm1, $in2_x+0x10(%rsp) 3027 movdqu 0x40($a_ptr),%xmm0 # in2_z again 3028 movdqu 0x50($a_ptr),%xmm1 3029 movdqa %xmm2, $in2_y(%rsp) 3030 movdqa %xmm3, $in2_y+0x10(%rsp) 3031 por %xmm4, %xmm5 3032 pxor %xmm4, %xmm4 3033 por %xmm0, %xmm1 3034 movq $r_ptr, %xmm0 # save $r_ptr 3035 3036 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 3037 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy 3038 mov $acc6, $in2_z+8*1(%rsp) 3039 mov $acc7, $in2_z+8*2(%rsp) 3040 mov $acc0, $in2_z+8*3(%rsp) 3041 lea $Z2sqr(%rsp), $r_ptr # Z2^2 3042 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); 3043 3044 pcmpeqd %xmm4, %xmm5 3045 pshufd \$0xb1, %xmm1, %xmm4 3046 por %xmm1, %xmm4 3047 pshufd \$0, %xmm5, %xmm5 # in1infty 3048 pshufd \$0x1e, %xmm4, %xmm3 3049 por %xmm3, %xmm4 3050 pxor %xmm3, %xmm3 3051 pcmpeqd %xmm3, %xmm4 3052 pshufd \$0, %xmm4, %xmm4 # in2infty 3053 mov 0x40+8*0($b_ptr), $src0 # load original in1_z 3054 mov 0x40+8*1($b_ptr), $acc6 3055 mov 0x40+8*2($b_ptr), $acc7 3056 mov 0x40+8*3($b_ptr), $acc0 3057 movq $b_ptr, %xmm1 3058 3059 lea 0x40-$bias($b_ptr), $a_ptr 3060 lea $Z1sqr(%rsp), $r_ptr # Z1^2 3061 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 3062 3063 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` 3064 lea $S1(%rsp), $r_ptr # S1 = Z2^3 3065 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); 3066 3067 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 3068 lea $S2(%rsp), $r_ptr # S2 = Z1^3 3069 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 3070 3071 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` 3072 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 3073 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); 3074 3075 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 3076 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 3077 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 3078 3079 lea $S1(%rsp), $b_ptr 3080 lea $R(%rsp), $r_ptr # R = S2 - S1 3081 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); 3082 3083 or $acc5, $acc4 # see if result is zero 3084 movdqa %xmm4, %xmm2 3085 or $acc0, $acc4 3086 or $acc1, $acc4 3087 por %xmm5, %xmm2 # in1infty || in2infty 3088 movq $acc4, %xmm3 3089 3090 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` 3091 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 3092 call 
__ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); 3093 3094 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` 3095 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 3096 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); 3097 3098 lea $U1(%rsp), $b_ptr 3099 lea $H(%rsp), $r_ptr # H = U2 - U1 3100 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); 3101 3102 or $acc5, $acc4 # see if result is zero 3103 or $acc0, $acc4 3104 or $acc1, $acc4 # !is_equal(U1, U2) 3105 3106 movq %xmm2, $acc0 3107 movq %xmm3, $acc1 3108 or $acc0, $acc4 3109 .byte 0x3e # predict taken 3110 jnz .Ladd_proceed$x # !is_equal(U1, U2) || in1infty || in2infty 3111 3112 # We now know A = B or A = -B and neither is infinity. Compare the 3113 # y-coordinates via S1 and S2. 3114 test $acc1, $acc1 3115 jz .Ladd_double$x # is_equal(S1, S2) 3116 3117 # A = -B, so the result is infinity. 3118 # 3119 # TODO(davidben): Does .Ladd_proceed handle this case? It seems to, in 3120 # which case we should eliminate this special-case and simplify the 3121 # timing analysis. 3122 movq %xmm0, $r_ptr # restore $r_ptr 3123 pxor %xmm0, %xmm0 3124 movdqu %xmm0, 0x00($r_ptr) 3125 movdqu %xmm0, 0x10($r_ptr) 3126 movdqu %xmm0, 0x20($r_ptr) 3127 movdqu %xmm0, 0x30($r_ptr) 3128 movdqu %xmm0, 0x40($r_ptr) 3129 movdqu %xmm0, 0x50($r_ptr) 3130 jmp .Ladd_done$x 3131 3132.align 32 3133.Ladd_double$x: 3134 movq %xmm1, $a_ptr # restore $a_ptr 3135 movq %xmm0, $r_ptr # restore $r_ptr 3136 add \$`32*(18-5)`, %rsp # difference in frame sizes 3137.cfi_adjust_cfa_offset `-32*(18-5)` 3138 jmp .Lpoint_double_shortcut$x 3139.cfi_adjust_cfa_offset `32*(18-5)` 3140 3141.align 32 3142.Ladd_proceed$x: 3143 `&load_for_sqr("$R(%rsp)", "$src0")` 3144 lea $Rsqr(%rsp), $r_ptr # R^2 3145 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 3146 3147 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 3148 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3149 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 3150 3151 `&load_for_sqr("$H(%rsp)", "$src0")` 3152 lea $Hsqr(%rsp), $r_ptr # H^2 3153 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 3154 3155 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` 3156 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3157 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); 3158 3159 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` 3160 lea $Hcub(%rsp), $r_ptr # H^3 3161 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 3162 3163 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` 3164 lea $U2(%rsp), $r_ptr # U1*H^2 3165 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); 3166___ 3167{ 3168####################################################################### 3169# operate in 4-5-0-1 "name space" that matches multiplication output 3170# 3171my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3172my ($poly1, $poly3)=($acc6,$acc7); 3173 3174$code.=<<___; 3175 #lea $U2(%rsp), $a_ptr 3176 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 3177 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 3178 3179 xor $t4, $t4 3180 add $acc0, $acc0 # a0:a3+a0:a3 3181 lea $Rsqr(%rsp), $a_ptr 3182 adc $acc1, $acc1 3183 mov $acc0, $t0 3184 adc $acc2, $acc2 3185 adc $acc3, $acc3 3186 mov $acc1, $t1 3187 adc \$0, $t4 3188 3189 sub \$-1, $acc0 3190 mov $acc2, $t2 3191 sbb $poly1, $acc1 3192 sbb \$0, $acc2 3193 mov $acc3, $t3 3194 sbb $poly3, $acc3 3195 sbb \$0, $t4 3196 3197 cmovc $t0, $acc0 3198 mov 8*0($a_ptr), $t0 3199 cmovc $t1, $acc1 3200 mov 
8*1($a_ptr), $t1 3201 cmovc $t2, $acc2 3202 mov 8*2($a_ptr), $t2 3203 cmovc $t3, $acc3 3204 mov 8*3($a_ptr), $t3 3205 3206 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 3207 3208 lea $Hcub(%rsp), $b_ptr 3209 lea $res_x(%rsp), $r_ptr 3210 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 3211 3212 mov $U2+8*0(%rsp), $t0 3213 mov $U2+8*1(%rsp), $t1 3214 mov $U2+8*2(%rsp), $t2 3215 mov $U2+8*3(%rsp), $t3 3216 lea $res_y(%rsp), $r_ptr 3217 3218 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); 3219 3220 mov $acc0, 8*0($r_ptr) # save the result, as 3221 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 3222 mov $acc2, 8*2($r_ptr) 3223 mov $acc3, 8*3($r_ptr) 3224___ 3225} 3226$code.=<<___; 3227 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` 3228 lea $S2(%rsp), $r_ptr 3229 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); 3230 3231 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` 3232 lea $res_y(%rsp), $r_ptr 3233 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); 3234 3235 lea $S2(%rsp), $b_ptr 3236 lea $res_y(%rsp), $r_ptr 3237 call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); 3238 3239 movq %xmm0, $r_ptr # restore $r_ptr 3240 3241 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); 3242 movdqa %xmm5, %xmm1 3243 pandn $res_z(%rsp), %xmm0 3244 movdqa %xmm5, %xmm2 3245 pandn $res_z+0x10(%rsp), %xmm1 3246 movdqa %xmm5, %xmm3 3247 pand $in2_z(%rsp), %xmm2 3248 pand $in2_z+0x10(%rsp), %xmm3 3249 por %xmm0, %xmm2 3250 por %xmm1, %xmm3 3251 3252 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 3253 movdqa %xmm4, %xmm1 3254 pandn %xmm2, %xmm0 3255 movdqa %xmm4, %xmm2 3256 pandn %xmm3, %xmm1 3257 movdqa %xmm4, %xmm3 3258 pand $in1_z(%rsp), %xmm2 3259 pand $in1_z+0x10(%rsp), %xmm3 3260 por %xmm0, %xmm2 3261 por %xmm1, %xmm3 3262 movdqu %xmm2, 0x40($r_ptr) 3263 movdqu %xmm3, 0x50($r_ptr) 3264 3265 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 3266 movdqa %xmm5, %xmm1 3267 pandn $res_x(%rsp), %xmm0 3268 movdqa %xmm5, %xmm2 3269 pandn $res_x+0x10(%rsp), %xmm1 3270 movdqa %xmm5, %xmm3 3271 pand $in2_x(%rsp), %xmm2 3272 pand $in2_x+0x10(%rsp), %xmm3 3273 por %xmm0, %xmm2 3274 por %xmm1, %xmm3 3275 3276 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 3277 movdqa %xmm4, %xmm1 3278 pandn %xmm2, %xmm0 3279 movdqa %xmm4, %xmm2 3280 pandn %xmm3, %xmm1 3281 movdqa %xmm4, %xmm3 3282 pand $in1_x(%rsp), %xmm2 3283 pand $in1_x+0x10(%rsp), %xmm3 3284 por %xmm0, %xmm2 3285 por %xmm1, %xmm3 3286 movdqu %xmm2, 0x00($r_ptr) 3287 movdqu %xmm3, 0x10($r_ptr) 3288 3289 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 3290 movdqa %xmm5, %xmm1 3291 pandn $res_y(%rsp), %xmm0 3292 movdqa %xmm5, %xmm2 3293 pandn $res_y+0x10(%rsp), %xmm1 3294 movdqa %xmm5, %xmm3 3295 pand $in2_y(%rsp), %xmm2 3296 pand $in2_y+0x10(%rsp), %xmm3 3297 por %xmm0, %xmm2 3298 por %xmm1, %xmm3 3299 3300 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); 3301 movdqa %xmm4, %xmm1 3302 pandn %xmm2, %xmm0 3303 movdqa %xmm4, %xmm2 3304 pandn %xmm3, %xmm1 3305 movdqa %xmm4, %xmm3 3306 pand $in1_y(%rsp), %xmm2 3307 pand $in1_y+0x10(%rsp), %xmm3 3308 por %xmm0, %xmm2 3309 por %xmm1, %xmm3 3310 movdqu %xmm2, 0x20($r_ptr) 3311 movdqu %xmm3, 0x30($r_ptr) 3312 3313.Ladd_done$x: 3314 lea 32*18+56(%rsp), %rsi 3315.cfi_def_cfa %rsi,8 3316 mov -48(%rsi),%r15 3317.cfi_restore %r15 3318 mov -40(%rsi),%r14 3319.cfi_restore %r14 3320 mov -32(%rsi),%r13 3321.cfi_restore %r13 3322 mov -24(%rsi),%r12 3323.cfi_restore 
%r12 3324 mov -16(%rsi),%rbx 3325.cfi_restore %rbx 3326 mov -8(%rsi),%rbp 3327.cfi_restore %rbp 3328 lea (%rsi),%rsp 3329.cfi_def_cfa_register %rsp 3330.Lpoint_add${x}_epilogue: 3331 ret 3332.cfi_endproc 3333.size p256_point_add$sfx,.-p256_point_add$sfx 3334___ 3335} 3336&gen_add("q"); 3337 3338sub gen_add_affine () { 3339 my $x = shift; 3340 my ($src0,$sfx,$bias); 3341 my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, 3342 $res_x,$res_y,$res_z, 3343 $in1_x,$in1_y,$in1_z, 3344 $in2_x,$in2_y)=map(32*$_,(0..14)); 3345 my $Z1sqr = $S2; 3346 3347 if ($x ne "x") { 3348 $src0 = "%rax"; 3349 $sfx = ""; 3350 $bias = 0; 3351 3352$code.=<<___; 3353.globl p256_point_add_affine 3354.type p256_point_add_affine,\@function,3 3355.align 32 3356p256_point_add_affine: 3357.cfi_startproc 3358___ 3359$code.=<<___ if ($addx); 3360 leaq OPENSSL_ia32cap_P(%rip), %rcx 3361 mov 8(%rcx), %rcx 3362 and \$0x80100, %ecx 3363 cmp \$0x80100, %ecx 3364 je .Lpoint_add_affinex 3365___ 3366 } else { 3367 $src0 = "%rdx"; 3368 $sfx = "x"; 3369 $bias = 128; 3370 3371$code.=<<___; 3372.type p256_point_add_affinex,\@function,3 3373.align 32 3374p256_point_add_affinex: 3375.cfi_startproc 3376.Lpoint_add_affinex: 3377___ 3378 } 3379$code.=<<___; 3380 push %rbp 3381.cfi_push %rbp 3382 push %rbx 3383.cfi_push %rbx 3384 push %r12 3385.cfi_push %r12 3386 push %r13 3387.cfi_push %r13 3388 push %r14 3389.cfi_push %r14 3390 push %r15 3391.cfi_push %r15 3392 sub \$32*15+8, %rsp 3393.cfi_adjust_cfa_offset 32*15+8 3394.Ladd_affine${x}_body: 3395 3396 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 3397 mov $b_org, $b_ptr # reassign 3398 movdqu 0x10($a_ptr), %xmm1 3399 movdqu 0x20($a_ptr), %xmm2 3400 movdqu 0x30($a_ptr), %xmm3 3401 movdqu 0x40($a_ptr), %xmm4 3402 movdqu 0x50($a_ptr), %xmm5 3403 mov 0x40+8*0($a_ptr), $src0 # load original in1_z 3404 mov 0x40+8*1($a_ptr), $acc6 3405 mov 0x40+8*2($a_ptr), $acc7 3406 mov 0x40+8*3($a_ptr), $acc0 3407 movdqa %xmm0, $in1_x(%rsp) 3408 movdqa %xmm1, $in1_x+0x10(%rsp) 3409 movdqa %xmm2, $in1_y(%rsp) 3410 movdqa %xmm3, $in1_y+0x10(%rsp) 3411 movdqa %xmm4, $in1_z(%rsp) 3412 movdqa %xmm5, $in1_z+0x10(%rsp) 3413 por %xmm4, %xmm5 3414 3415 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr 3416 pshufd \$0xb1, %xmm5, %xmm3 3417 movdqu 0x10($b_ptr), %xmm1 3418 movdqu 0x20($b_ptr), %xmm2 3419 por %xmm3, %xmm5 3420 movdqu 0x30($b_ptr), %xmm3 3421 movdqa %xmm0, $in2_x(%rsp) 3422 pshufd \$0x1e, %xmm5, %xmm4 3423 movdqa %xmm1, $in2_x+0x10(%rsp) 3424 por %xmm0, %xmm1 3425 movq $r_ptr, %xmm0 # save $r_ptr 3426 movdqa %xmm2, $in2_y(%rsp) 3427 movdqa %xmm3, $in2_y+0x10(%rsp) 3428 por %xmm2, %xmm3 3429 por %xmm4, %xmm5 3430 pxor %xmm4, %xmm4 3431 por %xmm1, %xmm3 3432 3433 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 3434 lea $Z1sqr(%rsp), $r_ptr # Z1^2 3435 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 3436 3437 pcmpeqd %xmm4, %xmm5 3438 pshufd \$0xb1, %xmm3, %xmm4 3439 mov 0x00($b_ptr), $src0 # $b_ptr is still valid 3440 #lea 0x00($b_ptr), $b_ptr 3441 mov $acc4, $acc1 # harmonize sqr output and mul input 3442 por %xmm3, %xmm4 3443 pshufd \$0, %xmm5, %xmm5 # in1infty 3444 pshufd \$0x1e, %xmm4, %xmm3 3445 mov $acc5, $acc2 3446 por %xmm3, %xmm4 3447 pxor %xmm3, %xmm3 3448 mov $acc6, $acc3 3449 pcmpeqd %xmm3, %xmm4 3450 pshufd \$0, %xmm4, %xmm4 # in2infty 3451 3452 lea $Z1sqr-$bias(%rsp), $a_ptr 3453 mov $acc7, $acc4 3454 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 3455 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); 3456 3457 lea $in1_x(%rsp), $b_ptr 3458 lea $H(%rsp), $r_ptr 
# H = U2 - U1 3459 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); 3460 3461 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 3462 lea $S2(%rsp), $r_ptr # S2 = Z1^3 3463 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 3464 3465 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 3466 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3467 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 3468 3469 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 3470 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 3471 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 3472 3473 lea $in1_y(%rsp), $b_ptr 3474 lea $R(%rsp), $r_ptr # R = S2 - S1 3475 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); 3476 3477 `&load_for_sqr("$H(%rsp)", "$src0")` 3478 lea $Hsqr(%rsp), $r_ptr # H^2 3479 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 3480 3481 `&load_for_sqr("$R(%rsp)", "$src0")` 3482 lea $Rsqr(%rsp), $r_ptr # R^2 3483 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 3484 3485 `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` 3486 lea $Hcub(%rsp), $r_ptr # H^3 3487 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 3488 3489 `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` 3490 lea $U2(%rsp), $r_ptr # U1*H^2 3491 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); 3492___ 3493{ 3494####################################################################### 3495# operate in 4-5-0-1 "name space" that matches multiplication output 3496# 3497my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3498my ($poly1, $poly3)=($acc6,$acc7); 3499 3500$code.=<<___; 3501 #lea $U2(%rsp), $a_ptr 3502 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 3503 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 3504 3505 xor $t4, $t4 3506 add $acc0, $acc0 # a0:a3+a0:a3 3507 lea $Rsqr(%rsp), $a_ptr 3508 adc $acc1, $acc1 3509 mov $acc0, $t0 3510 adc $acc2, $acc2 3511 adc $acc3, $acc3 3512 mov $acc1, $t1 3513 adc \$0, $t4 3514 3515 sub \$-1, $acc0 3516 mov $acc2, $t2 3517 sbb $poly1, $acc1 3518 sbb \$0, $acc2 3519 mov $acc3, $t3 3520 sbb $poly3, $acc3 3521 sbb \$0, $t4 3522 3523 cmovc $t0, $acc0 3524 mov 8*0($a_ptr), $t0 3525 cmovc $t1, $acc1 3526 mov 8*1($a_ptr), $t1 3527 cmovc $t2, $acc2 3528 mov 8*2($a_ptr), $t2 3529 cmovc $t3, $acc3 3530 mov 8*3($a_ptr), $t3 3531 3532 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 3533 3534 lea $Hcub(%rsp), $b_ptr 3535 lea $res_x(%rsp), $r_ptr 3536 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 3537 3538 mov $U2+8*0(%rsp), $t0 3539 mov $U2+8*1(%rsp), $t1 3540 mov $U2+8*2(%rsp), $t2 3541 mov $U2+8*3(%rsp), $t3 3542 lea $H(%rsp), $r_ptr 3543 3544 call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); 3545 3546 mov $acc0, 8*0($r_ptr) # save the result, as 3547 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 3548 mov $acc2, 8*2($r_ptr) 3549 mov $acc3, 8*3($r_ptr) 3550___ 3551} 3552$code.=<<___; 3553 `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` 3554 lea $S2(%rsp), $r_ptr 3555 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); 3556 3557 `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` 3558 lea $H(%rsp), $r_ptr 3559 call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); 3560 3561 lea $S2(%rsp), $b_ptr 3562 lea $res_y(%rsp), $r_ptr 3563 call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); 3564 3565 movq %xmm0, $r_ptr # restore $r_ptr 3566 3567 movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); 3568 movdqa %xmm5, %xmm1 
3569 pandn $res_z(%rsp), %xmm0 3570 movdqa %xmm5, %xmm2 3571 pandn $res_z+0x10(%rsp), %xmm1 3572 movdqa %xmm5, %xmm3 3573 pand .LONE_mont(%rip), %xmm2 3574 pand .LONE_mont+0x10(%rip), %xmm3 3575 por %xmm0, %xmm2 3576 por %xmm1, %xmm3 3577 3578 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 3579 movdqa %xmm4, %xmm1 3580 pandn %xmm2, %xmm0 3581 movdqa %xmm4, %xmm2 3582 pandn %xmm3, %xmm1 3583 movdqa %xmm4, %xmm3 3584 pand $in1_z(%rsp), %xmm2 3585 pand $in1_z+0x10(%rsp), %xmm3 3586 por %xmm0, %xmm2 3587 por %xmm1, %xmm3 3588 movdqu %xmm2, 0x40($r_ptr) 3589 movdqu %xmm3, 0x50($r_ptr) 3590 3591 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 3592 movdqa %xmm5, %xmm1 3593 pandn $res_x(%rsp), %xmm0 3594 movdqa %xmm5, %xmm2 3595 pandn $res_x+0x10(%rsp), %xmm1 3596 movdqa %xmm5, %xmm3 3597 pand $in2_x(%rsp), %xmm2 3598 pand $in2_x+0x10(%rsp), %xmm3 3599 por %xmm0, %xmm2 3600 por %xmm1, %xmm3 3601 3602 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 3603 movdqa %xmm4, %xmm1 3604 pandn %xmm2, %xmm0 3605 movdqa %xmm4, %xmm2 3606 pandn %xmm3, %xmm1 3607 movdqa %xmm4, %xmm3 3608 pand $in1_x(%rsp), %xmm2 3609 pand $in1_x+0x10(%rsp), %xmm3 3610 por %xmm0, %xmm2 3611 por %xmm1, %xmm3 3612 movdqu %xmm2, 0x00($r_ptr) 3613 movdqu %xmm3, 0x10($r_ptr) 3614 3615 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 3616 movdqa %xmm5, %xmm1 3617 pandn $res_y(%rsp), %xmm0 3618 movdqa %xmm5, %xmm2 3619 pandn $res_y+0x10(%rsp), %xmm1 3620 movdqa %xmm5, %xmm3 3621 pand $in2_y(%rsp), %xmm2 3622 pand $in2_y+0x10(%rsp), %xmm3 3623 por %xmm0, %xmm2 3624 por %xmm1, %xmm3 3625 3626 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); 3627 movdqa %xmm4, %xmm1 3628 pandn %xmm2, %xmm0 3629 movdqa %xmm4, %xmm2 3630 pandn %xmm3, %xmm1 3631 movdqa %xmm4, %xmm3 3632 pand $in1_y(%rsp), %xmm2 3633 pand $in1_y+0x10(%rsp), %xmm3 3634 por %xmm0, %xmm2 3635 por %xmm1, %xmm3 3636 movdqu %xmm2, 0x20($r_ptr) 3637 movdqu %xmm3, 0x30($r_ptr) 3638 3639 lea 32*15+56(%rsp), %rsi 3640.cfi_def_cfa %rsi,8 3641 mov -48(%rsi),%r15 3642.cfi_restore %r15 3643 mov -40(%rsi),%r14 3644.cfi_restore %r14 3645 mov -32(%rsi),%r13 3646.cfi_restore %r13 3647 mov -24(%rsi),%r12 3648.cfi_restore %r12 3649 mov -16(%rsi),%rbx 3650.cfi_restore %rbx 3651 mov -8(%rsi),%rbp 3652.cfi_restore %rbp 3653 lea (%rsi),%rsp 3654.cfi_def_cfa_register %rsp 3655.Ladd_affine${x}_epilogue: 3656 ret 3657.cfi_endproc 3658.size p256_point_add_affine$sfx,.-p256_point_add_affine$sfx 3659___ 3660} 3661&gen_add_affine("q"); 3662 3663######################################################################## 3664# AD*X magic 3665# 3666if ($addx) { { 3667######################################################################## 3668# operate in 4-5-0-1 "name space" that matches multiplication output 3669# 3670my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3671 3672$code.=<<___; 3673.type __ecp_nistz256_add_tox,\@abi-omnipotent 3674.align 32 3675__ecp_nistz256_add_tox: 3676.cfi_startproc 3677 xor $t4, $t4 3678 adc 8*0($b_ptr), $a0 3679 adc 8*1($b_ptr), $a1 3680 mov $a0, $t0 3681 adc 8*2($b_ptr), $a2 3682 adc 8*3($b_ptr), $a3 3683 mov $a1, $t1 3684 adc \$0, $t4 3685 3686 xor $t3, $t3 3687 sbb \$-1, $a0 3688 mov $a2, $t2 3689 sbb $poly1, $a1 3690 sbb \$0, $a2 3691 mov $a3, $t3 3692 sbb $poly3, $a3 3693 sbb \$0, $t4 3694 3695 cmovc $t0, $a0 3696 cmovc $t1, $a1 3697 mov $a0, 8*0($r_ptr) 3698 cmovc $t2, $a2 3699 mov $a1, 8*1($r_ptr) 3700 cmovc $t3, $a3 3701 mov $a2, 8*2($r_ptr) 3702 mov $a3, 8*3($r_ptr) 3703 
3704 ret 3705.cfi_endproc 3706.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox 3707 3708.type __ecp_nistz256_sub_fromx,\@abi-omnipotent 3709.align 32 3710__ecp_nistz256_sub_fromx: 3711.cfi_startproc 3712 xor $t4, $t4 3713 sbb 8*0($b_ptr), $a0 3714 sbb 8*1($b_ptr), $a1 3715 mov $a0, $t0 3716 sbb 8*2($b_ptr), $a2 3717 sbb 8*3($b_ptr), $a3 3718 mov $a1, $t1 3719 sbb \$0, $t4 3720 3721 xor $t3, $t3 3722 adc \$-1, $a0 3723 mov $a2, $t2 3724 adc $poly1, $a1 3725 adc \$0, $a2 3726 mov $a3, $t3 3727 adc $poly3, $a3 3728 3729 bt \$0, $t4 3730 cmovnc $t0, $a0 3731 cmovnc $t1, $a1 3732 mov $a0, 8*0($r_ptr) 3733 cmovnc $t2, $a2 3734 mov $a1, 8*1($r_ptr) 3735 cmovnc $t3, $a3 3736 mov $a2, 8*2($r_ptr) 3737 mov $a3, 8*3($r_ptr) 3738 3739 ret 3740.cfi_endproc 3741.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx 3742 3743.type __ecp_nistz256_subx,\@abi-omnipotent 3744.align 32 3745__ecp_nistz256_subx: 3746.cfi_startproc 3747 xor $t4, $t4 3748 sbb $a0, $t0 3749 sbb $a1, $t1 3750 mov $t0, $a0 3751 sbb $a2, $t2 3752 sbb $a3, $t3 3753 mov $t1, $a1 3754 sbb \$0, $t4 3755 3756 xor $a3 ,$a3 3757 adc \$-1, $t0 3758 mov $t2, $a2 3759 adc $poly1, $t1 3760 adc \$0, $t2 3761 mov $t3, $a3 3762 adc $poly3, $t3 3763 3764 bt \$0, $t4 3765 cmovc $t0, $a0 3766 cmovc $t1, $a1 3767 cmovc $t2, $a2 3768 cmovc $t3, $a3 3769 3770 ret 3771.cfi_endproc 3772.size __ecp_nistz256_subx,.-__ecp_nistz256_subx 3773 3774.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent 3775.align 32 3776__ecp_nistz256_mul_by_2x: 3777.cfi_startproc 3778 xor $t4, $t4 3779 adc $a0, $a0 # a0:a3+a0:a3 3780 adc $a1, $a1 3781 mov $a0, $t0 3782 adc $a2, $a2 3783 adc $a3, $a3 3784 mov $a1, $t1 3785 adc \$0, $t4 3786 3787 xor $t3, $t3 3788 sbb \$-1, $a0 3789 mov $a2, $t2 3790 sbb $poly1, $a1 3791 sbb \$0, $a2 3792 mov $a3, $t3 3793 sbb $poly3, $a3 3794 sbb \$0, $t4 3795 3796 cmovc $t0, $a0 3797 cmovc $t1, $a1 3798 mov $a0, 8*0($r_ptr) 3799 cmovc $t2, $a2 3800 mov $a1, 8*1($r_ptr) 3801 cmovc $t3, $a3 3802 mov $a2, 8*2($r_ptr) 3803 mov $a3, 8*3($r_ptr) 3804 3805 ret 3806.cfi_endproc 3807.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x 3808___ 3809 } 3810&gen_double("x"); 3811&gen_add("x"); 3812&gen_add_affine("x"); 3813} 3814}}} 3815 3816# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3817# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3818if ($win64) { 3819$rec="%rcx"; 3820$frame="%rdx"; 3821$context="%r8"; 3822$disp="%r9"; 3823 3824$code.=<<___; 3825.extern __imp_RtlVirtualUnwind 3826 3827.type short_handler,\@abi-omnipotent 3828.align 16 3829short_handler: 3830 push %rsi 3831 push %rdi 3832 push %rbx 3833 push %rbp 3834 push %r12 3835 push %r13 3836 push %r14 3837 push %r15 3838 pushfq 3839 sub \$64,%rsp 3840 3841 mov 120($context),%rax # pull context->Rax 3842 mov 248($context),%rbx # pull context->Rip 3843 3844 mov 8($disp),%rsi # disp->ImageBase 3845 mov 56($disp),%r11 # disp->HandlerData 3846 3847 mov 0(%r11),%r10d # HandlerData[0] 3848 lea (%rsi,%r10),%r10 # end of prologue label 3849 cmp %r10,%rbx # context->Rip<end of prologue label 3850 jb .Lcommon_seh_tail 3851 3852 mov 152($context),%rax # pull context->Rsp 3853 3854 mov 4(%r11),%r10d # HandlerData[1] 3855 lea (%rsi,%r10),%r10 # epilogue label 3856 cmp %r10,%rbx # context->Rip>=epilogue label 3857 jae .Lcommon_seh_tail 3858 3859 lea 16(%rax),%rax 3860 3861 mov -8(%rax),%r12 3862 mov -16(%rax),%r13 3863 mov %r12,216($context) # restore context->R12 3864 mov %r13,224($context) # restore context->R13 3865 3866 jmp .Lcommon_seh_tail 3867.size 
short_handler,.-short_handler 3868 3869.type full_handler,\@abi-omnipotent 3870.align 16 3871full_handler: 3872 push %rsi 3873 push %rdi 3874 push %rbx 3875 push %rbp 3876 push %r12 3877 push %r13 3878 push %r14 3879 push %r15 3880 pushfq 3881 sub \$64,%rsp 3882 3883 mov 120($context),%rax # pull context->Rax 3884 mov 248($context),%rbx # pull context->Rip 3885 3886 mov 8($disp),%rsi # disp->ImageBase 3887 mov 56($disp),%r11 # disp->HandlerData 3888 3889 mov 0(%r11),%r10d # HandlerData[0] 3890 lea (%rsi,%r10),%r10 # end of prologue label 3891 cmp %r10,%rbx # context->Rip<end of prologue label 3892 jb .Lcommon_seh_tail 3893 3894 mov 152($context),%rax # pull context->Rsp 3895 3896 mov 4(%r11),%r10d # HandlerData[1] 3897 lea (%rsi,%r10),%r10 # epilogue label 3898 cmp %r10,%rbx # context->Rip>=epilogue label 3899 jae .Lcommon_seh_tail 3900 3901 mov 8(%r11),%r10d # HandlerData[2] 3902 lea (%rax,%r10),%rax 3903 3904 mov -8(%rax),%rbp 3905 mov -16(%rax),%rbx 3906 mov -24(%rax),%r12 3907 mov -32(%rax),%r13 3908 mov -40(%rax),%r14 3909 mov -48(%rax),%r15 3910 mov %rbx,144($context) # restore context->Rbx 3911 mov %rbp,160($context) # restore context->Rbp 3912 mov %r12,216($context) # restore context->R12 3913 mov %r13,224($context) # restore context->R13 3914 mov %r14,232($context) # restore context->R14 3915 mov %r15,240($context) # restore context->R15 3916 3917.Lcommon_seh_tail: 3918 mov 8(%rax),%rdi 3919 mov 16(%rax),%rsi 3920 mov %rax,152($context) # restore context->Rsp 3921 mov %rsi,168($context) # restore context->Rsi 3922 mov %rdi,176($context) # restore context->Rdi 3923 3924 mov 40($disp),%rdi # disp->ContextRecord 3925 mov $context,%rsi # context 3926 mov \$154,%ecx # sizeof(CONTEXT) 3927 .long 0xa548f3fc # cld; rep movsq 3928 3929 mov $disp,%rsi 3930 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3931 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3932 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3933 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3934 mov 40(%rsi),%r10 # disp->ContextRecord 3935 lea 56(%rsi),%r11 # &disp->HandlerData 3936 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3937 mov %r10,32(%rsp) # arg5 3938 mov %r11,40(%rsp) # arg6 3939 mov %r12,48(%rsp) # arg7 3940 mov %rcx,56(%rsp) # arg8, (NULL) 3941 call *__imp_RtlVirtualUnwind(%rip) 3942 3943 mov \$1,%eax # ExceptionContinueSearch 3944 add \$64,%rsp 3945 popfq 3946 pop %r15 3947 pop %r14 3948 pop %r13 3949 pop %r12 3950 pop %rbp 3951 pop %rbx 3952 pop %rdi 3953 pop %rsi 3954 ret 3955.size full_handler,.-full_handler 3956 3957.section .pdata 3958.align 4 3959 .rva .LSEH_begin_nistz256_neg 3960 .rva .LSEH_end_nistz256_neg 3961 .rva .LSEH_info_nistz256_neg 3962 3963 .rva .LSEH_begin_p256_scalar_mul_mont 3964 .rva .LSEH_end_p256_scalar_mul_mont 3965 .rva .LSEH_info_p256_scalar_mul_mont 3966 3967 .rva .LSEH_begin_p256_scalar_sqr_rep_mont 3968 .rva .LSEH_end_p256_scalar_sqr_rep_mont 3969 .rva .LSEH_info_p256_scalar_sqr_rep_mont 3970___ 3971$code.=<<___ if ($addx); 3972 .rva .LSEH_begin_ecp_nistz256_ord_mul_montx 3973 .rva .LSEH_end_ecp_nistz256_ord_mul_montx 3974 .rva .LSEH_info_ecp_nistz256_ord_mul_montx 3975 3976 .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx 3977 .rva .LSEH_end_ecp_nistz256_ord_sqr_montx 3978 .rva .LSEH_info_ecp_nistz256_ord_sqr_montx 3979___ 3980$code.=<<___; 3981 .rva .LSEH_begin_p256_mul_mont 3982 .rva .LSEH_end_p256_mul_mont 3983 .rva .LSEH_info_p256_mul_mont 3984 3985 .rva .LSEH_begin_p256_sqr_mont 3986 .rva .LSEH_end_p256_sqr_mont 3987 .rva .LSEH_info_p256_sqr_mont 3988 3989 .rva .LSEH_begin_nistz256_select_w5 3990 .rva 
.LSEH_end_nistz256_select_w5 3991 .rva .LSEH_info_ecp_nistz256_select_wX 3992 3993 .rva .LSEH_begin_nistz256_select_w7 3994 .rva .LSEH_end_nistz256_select_w7 3995 .rva .LSEH_info_ecp_nistz256_select_wX 3996___ 3997$code.=<<___ if ($avx>1); 3998 .rva .LSEH_begin_ecp_nistz256_avx2_select_w5 3999 .rva .LSEH_end_ecp_nistz256_avx2_select_w5 4000 .rva .LSEH_info_ecp_nistz256_avx2_select_wX 4001 4002 .rva .LSEH_begin_ecp_nistz256_avx2_select_w7 4003 .rva .LSEH_end_ecp_nistz256_avx2_select_w7 4004 .rva .LSEH_info_ecp_nistz256_avx2_select_wX 4005___ 4006$code.=<<___; 4007 .rva .LSEH_begin_p256_point_double 4008 .rva .LSEH_end_p256_point_double 4009 .rva .LSEH_info_p256_point_double 4010 4011 .rva .LSEH_begin_p256_point_add 4012 .rva .LSEH_end_p256_point_add 4013 .rva .LSEH_info_p256_point_add 4014 4015 .rva .LSEH_begin_p256_point_add_affine 4016 .rva .LSEH_end_p256_point_add_affine 4017 .rva .LSEH_info_p256_point_add_affine 4018___ 4019$code.=<<___ if ($addx); 4020 .rva .LSEH_begin_p256_point_doublex 4021 .rva .LSEH_end_p256_point_doublex 4022 .rva .LSEH_info_p256_point_doublex 4023 4024 .rva .LSEH_begin_p256_point_addx 4025 .rva .LSEH_end_p256_point_addx 4026 .rva .LSEH_info_p256_point_addx 4027 4028 .rva .LSEH_begin_p256_point_add_affinex 4029 .rva .LSEH_end_p256_point_add_affinex 4030 .rva .LSEH_info_p256_point_add_affinex 4031___ 4032$code.=<<___; 4033 4034.section .xdata 4035.align 8 4036.LSEH_info_nistz256_neg: 4037 .byte 9,0,0,0 4038 .rva short_handler 4039 .rva .Lneg_body,.Lneg_epilogue # HandlerData[] 4040.LSEH_info_p256_scalar_mul_mont: 4041 .byte 9,0,0,0 4042 .rva full_handler 4043 .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[] 4044 .long 48,0 4045.LSEH_info_p256_scalar_sqr_rep_mont: 4046 .byte 9,0,0,0 4047 .rva full_handler 4048 .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[] 4049 .long 48,0 4050___ 4051$code.=<<___ if ($addx); 4052.LSEH_info_ecp_nistz256_ord_mul_montx: 4053 .byte 9,0,0,0 4054 .rva full_handler 4055 .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[] 4056 .long 48,0 4057.LSEH_info_ecp_nistz256_ord_sqr_montx: 4058 .byte 9,0,0,0 4059 .rva full_handler 4060 .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[] 4061 .long 48,0 4062___ 4063$code.=<<___; 4064.LSEH_info_p256_mul_mont: 4065 .byte 9,0,0,0 4066 .rva full_handler 4067 .rva .Lmul_body,.Lmul_epilogue # HandlerData[] 4068 .long 48,0 4069.LSEH_info_p256_sqr_mont: 4070 .byte 9,0,0,0 4071 .rva full_handler 4072 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] 4073 .long 48,0 4074.LSEH_info_ecp_nistz256_select_wX: 4075 .byte 0x01,0x33,0x16,0x00 4076 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 4077 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 4078 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 4079 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 4080 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 4081 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 4082 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 4083 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 4084 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 4085 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 4086 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 4087 .align 8 4088___ 4089$code.=<<___ if ($avx>1); 4090.LSEH_info_ecp_nistz256_avx2_select_wX: 4091 .byte 0x01,0x36,0x17,0x0b 4092 .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 4093 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 4094 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 4095 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 4096 .byte 
0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 4097 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 4098 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 4099 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 4100 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 4101 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 4102 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 4103 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 4104 .align 8 4105___ 4106$code.=<<___; 4107.LSEH_info_p256_point_double: 4108 .byte 9,0,0,0 4109 .rva full_handler 4110 .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[] 4111 .long 32*5+56,0 4112.LSEH_info_p256_point_add: 4113 .byte 9,0,0,0 4114 .rva full_handler 4115 .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[] 4116 .long 32*18+56,0 4117.LSEH_info_p256_point_add_affine: 4118 .byte 9,0,0,0 4119 .rva full_handler 4120 .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[] 4121 .long 32*15+56,0 4122___ 4123$code.=<<___ if ($addx); 4124.align 8 4125.LSEH_info_p256_point_doublex: 4126 .byte 9,0,0,0 4127 .rva full_handler 4128 .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[] 4129 .long 32*5+56,0 4130.LSEH_info_p256_point_addx: 4131 .byte 9,0,0,0 4132 .rva full_handler 4133 .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[] 4134 .long 32*18+56,0 4135.LSEH_info_p256_point_add_affinex: 4136 .byte 9,0,0,0 4137 .rva full_handler 4138 .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[] 4139 .long 32*15+56,0 4140___ 4141} 4142 4143$code =~ s/\`([^\`]*)\`/eval $1/gem; 4144print $code; 4145close STDOUT or die "error closing STDOUT"; 4146