1#! /usr/bin/env perl 2# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. 3# Copyright (c) 2014, Intel Corporation. All Rights Reserved. 4# Copyright (c) 2015 CloudFlare, Inc. 5# 6# Licensed under the OpenSSL license (the "License"). You may not use 7# this file except in compliance with the License. You can obtain a copy 8# in the file LICENSE in the source distribution or at 9# https://www.openssl.org/source/license.html 10# 11# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) 12# (1) Intel Corporation, Israel Development Center, Haifa, Israel 13# (2) University of Haifa, Israel 14# (3) CloudFlare, Inc. 15# 16# Reference: 17# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with 18# 256 Bit Primes" 19 20# Further optimization by <appro@openssl.org>: 21# 22# this/original with/without -DECP_NISTZ256_ASM(*) 23# Opteron +15-49% +150-195% 24# Bulldozer +18-45% +175-240% 25# P4 +24-46% +100-150% 26# Westmere +18-34% +87-160% 27# Sandy Bridge +14-35% +120-185% 28# Ivy Bridge +11-35% +125-180% 29# Haswell +10-37% +160-200% 30# Broadwell +24-58% +210-270% 31# Atom +20-50% +180-240% 32# VIA Nano +50-160% +480-480% 33# 34# (*) "without -DECP_NISTZ256_ASM" refers to build with 35# "enable-ec_nistp_64_gcc_128"; 36# 37# Ranges denote minimum and maximum improvement coefficients depending 38# on benchmark. In "this/original" column lower coefficient is for 39# ECDSA sign, while in "with/without" - for ECDH key agreement, and 40# higher - for ECDSA sign, relatively fastest server-side operation. 41# Keep in mind that +100% means 2x improvement. 42 43$flavour = shift; 44$output = shift; 45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 46 47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 48 49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 51( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or 52die "can't locate x86_64-xlate.pl"; 53 54open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 55*STDOUT=*OUT; 56 57$avx = 2; 58$addx = 1; 59 60$code.=<<___; 61.text 62.extern OPENSSL_ia32cap_P 63 64# The polynomial 65.align 64 66.Lpoly: 67.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 68 69.LOne: 70.long 1,1,1,1,1,1,1,1 71.LTwo: 72.long 2,2,2,2,2,2,2,2 73.LThree: 74.long 3,3,3,3,3,3,3,3 75.LONE_mont: 76.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe 77 78# Constants for computations modulo ord(p256) 79.Lord: 80.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 81.LordK: 82.quad 0xccd1c8aaee00bc4f 83___ 84 85{ 86my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11)); 87my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13"); 88my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx"); 89 90$code.=<<___; 91 92################################################################################ 93# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]); 94.globl ecp_nistz256_neg 95.type ecp_nistz256_neg,\@function,2 96.align 32 97ecp_nistz256_neg: 98.cfi_startproc 99 push %r12 100.cfi_push %r12 101 push %r13 102.cfi_push %r13 103.Lneg_body: 104 105 xor $a0, $a0 106 xor $a1, $a1 107 xor $a2, $a2 108 xor $a3, $a3 109 xor $t4, $t4 110 111 sub 8*0($a_ptr), $a0 112 sbb 8*1($a_ptr), $a1 113 sbb 8*2($a_ptr), $a2 114 mov $a0, $t0 115 sbb 8*3($a_ptr), $a3 116 lea .Lpoly(%rip), $a_ptr 117 mov $a1, $t1 118 sbb \$0, $t4 119 120 add 8*0($a_ptr), $a0 121 mov $a2, $t2 122 adc 
8*1($a_ptr), $a1 123 adc 8*2($a_ptr), $a2 124 mov $a3, $t3 125 adc 8*3($a_ptr), $a3 126 test $t4, $t4 127 128 cmovz $t0, $a0 129 cmovz $t1, $a1 130 mov $a0, 8*0($r_ptr) 131 cmovz $t2, $a2 132 mov $a1, 8*1($r_ptr) 133 cmovz $t3, $a3 134 mov $a2, 8*2($r_ptr) 135 mov $a3, 8*3($r_ptr) 136 137 mov 0(%rsp),%r13 138.cfi_restore %r13 139 mov 8(%rsp),%r12 140.cfi_restore %r12 141 lea 16(%rsp),%rsp 142.cfi_adjust_cfa_offset -16 143.Lneg_epilogue: 144 ret 145.cfi_endproc 146.size ecp_nistz256_neg,.-ecp_nistz256_neg 147___ 148} 149{ 150my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); 151my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); 152my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); 153my ($poly1,$poly3)=($acc6,$acc7); 154 155$code.=<<___; 156################################################################################ 157# void ecp_nistz256_ord_mul_mont( 158# uint64_t res[4], 159# uint64_t a[4], 160# uint64_t b[4]); 161 162.globl ecp_nistz256_ord_mul_mont 163.type ecp_nistz256_ord_mul_mont,\@function,3 164.align 32 165ecp_nistz256_ord_mul_mont: 166.cfi_startproc 167___ 168$code.=<<___ if ($addx); 169 leaq OPENSSL_ia32cap_P(%rip), %rcx 170 mov 8(%rcx), %rcx 171 and \$0x80100, %ecx 172 cmp \$0x80100, %ecx 173 je .Lecp_nistz256_ord_mul_montx 174___ 175$code.=<<___; 176 push %rbp 177.cfi_push %rbp 178 push %rbx 179.cfi_push %rbx 180 push %r12 181.cfi_push %r12 182 push %r13 183.cfi_push %r13 184 push %r14 185.cfi_push %r14 186 push %r15 187.cfi_push %r15 188.Lord_mul_body: 189 190 mov 8*0($b_org), %rax 191 mov $b_org, $b_ptr 192 lea .Lord(%rip), %r14 193 mov .LordK(%rip), %r15 194 195 ################################# * b[0] 196 mov %rax, $t0 197 mulq 8*0($a_ptr) 198 mov %rax, $acc0 199 mov $t0, %rax 200 mov %rdx, $acc1 201 202 mulq 8*1($a_ptr) 203 add %rax, $acc1 204 mov $t0, %rax 205 adc \$0, %rdx 206 mov %rdx, $acc2 207 208 mulq 8*2($a_ptr) 209 add %rax, $acc2 210 mov $t0, %rax 211 adc \$0, %rdx 212 213 mov $acc0, $acc5 214 imulq %r15,$acc0 215 216 mov %rdx, $acc3 217 mulq 8*3($a_ptr) 218 add %rax, $acc3 219 mov $acc0, %rax 220 adc \$0, %rdx 221 mov %rdx, $acc4 222 223 ################################# First reduction step 224 mulq 8*0(%r14) 225 mov $acc0, $t1 226 add %rax, $acc5 # guaranteed to be zero 227 mov $acc0, %rax 228 adc \$0, %rdx 229 mov %rdx, $t0 230 231 sub $acc0, $acc2 232 sbb \$0, $acc0 # can't borrow 233 234 mulq 8*1(%r14) 235 add $t0, $acc1 236 adc \$0, %rdx 237 add %rax, $acc1 238 mov $t1, %rax 239 adc %rdx, $acc2 240 mov $t1, %rdx 241 adc \$0, $acc0 # can't overflow 242 243 shl \$32, %rax 244 shr \$32, %rdx 245 sub %rax, $acc3 246 mov 8*1($b_ptr), %rax 247 sbb %rdx, $t1 # can't borrow 248 249 add $acc0, $acc3 250 adc $t1, $acc4 251 adc \$0, $acc5 252 253 ################################# * b[1] 254 mov %rax, $t0 255 mulq 8*0($a_ptr) 256 add %rax, $acc1 257 mov $t0, %rax 258 adc \$0, %rdx 259 mov %rdx, $t1 260 261 mulq 8*1($a_ptr) 262 add $t1, $acc2 263 adc \$0, %rdx 264 add %rax, $acc2 265 mov $t0, %rax 266 adc \$0, %rdx 267 mov %rdx, $t1 268 269 mulq 8*2($a_ptr) 270 add $t1, $acc3 271 adc \$0, %rdx 272 add %rax, $acc3 273 mov $t0, %rax 274 adc \$0, %rdx 275 276 mov $acc1, $t0 277 imulq %r15, $acc1 278 279 mov %rdx, $t1 280 mulq 8*3($a_ptr) 281 add $t1, $acc4 282 adc \$0, %rdx 283 xor $acc0, $acc0 284 add %rax, $acc4 285 mov $acc1, %rax 286 adc %rdx, $acc5 287 adc \$0, $acc0 288 289 ################################# Second reduction step 290 mulq 8*0(%r14) 291 mov $acc1, $t1 292 add %rax, $t0 # guaranteed to be zero 293 mov 
$acc1, %rax 294 adc %rdx, $t0 295 296 sub $acc1, $acc3 297 sbb \$0, $acc1 # can't borrow 298 299 mulq 8*1(%r14) 300 add $t0, $acc2 301 adc \$0, %rdx 302 add %rax, $acc2 303 mov $t1, %rax 304 adc %rdx, $acc3 305 mov $t1, %rdx 306 adc \$0, $acc1 # can't overflow 307 308 shl \$32, %rax 309 shr \$32, %rdx 310 sub %rax, $acc4 311 mov 8*2($b_ptr), %rax 312 sbb %rdx, $t1 # can't borrow 313 314 add $acc1, $acc4 315 adc $t1, $acc5 316 adc \$0, $acc0 317 318 ################################## * b[2] 319 mov %rax, $t0 320 mulq 8*0($a_ptr) 321 add %rax, $acc2 322 mov $t0, %rax 323 adc \$0, %rdx 324 mov %rdx, $t1 325 326 mulq 8*1($a_ptr) 327 add $t1, $acc3 328 adc \$0, %rdx 329 add %rax, $acc3 330 mov $t0, %rax 331 adc \$0, %rdx 332 mov %rdx, $t1 333 334 mulq 8*2($a_ptr) 335 add $t1, $acc4 336 adc \$0, %rdx 337 add %rax, $acc4 338 mov $t0, %rax 339 adc \$0, %rdx 340 341 mov $acc2, $t0 342 imulq %r15, $acc2 343 344 mov %rdx, $t1 345 mulq 8*3($a_ptr) 346 add $t1, $acc5 347 adc \$0, %rdx 348 xor $acc1, $acc1 349 add %rax, $acc5 350 mov $acc2, %rax 351 adc %rdx, $acc0 352 adc \$0, $acc1 353 354 ################################# Third reduction step 355 mulq 8*0(%r14) 356 mov $acc2, $t1 357 add %rax, $t0 # guaranteed to be zero 358 mov $acc2, %rax 359 adc %rdx, $t0 360 361 sub $acc2, $acc4 362 sbb \$0, $acc2 # can't borrow 363 364 mulq 8*1(%r14) 365 add $t0, $acc3 366 adc \$0, %rdx 367 add %rax, $acc3 368 mov $t1, %rax 369 adc %rdx, $acc4 370 mov $t1, %rdx 371 adc \$0, $acc2 # can't overflow 372 373 shl \$32, %rax 374 shr \$32, %rdx 375 sub %rax, $acc5 376 mov 8*3($b_ptr), %rax 377 sbb %rdx, $t1 # can't borrow 378 379 add $acc2, $acc5 380 adc $t1, $acc0 381 adc \$0, $acc1 382 383 ################################# * b[3] 384 mov %rax, $t0 385 mulq 8*0($a_ptr) 386 add %rax, $acc3 387 mov $t0, %rax 388 adc \$0, %rdx 389 mov %rdx, $t1 390 391 mulq 8*1($a_ptr) 392 add $t1, $acc4 393 adc \$0, %rdx 394 add %rax, $acc4 395 mov $t0, %rax 396 adc \$0, %rdx 397 mov %rdx, $t1 398 399 mulq 8*2($a_ptr) 400 add $t1, $acc5 401 adc \$0, %rdx 402 add %rax, $acc5 403 mov $t0, %rax 404 adc \$0, %rdx 405 406 mov $acc3, $t0 407 imulq %r15, $acc3 408 409 mov %rdx, $t1 410 mulq 8*3($a_ptr) 411 add $t1, $acc0 412 adc \$0, %rdx 413 xor $acc2, $acc2 414 add %rax, $acc0 415 mov $acc3, %rax 416 adc %rdx, $acc1 417 adc \$0, $acc2 418 419 ################################# Last reduction step 420 mulq 8*0(%r14) 421 mov $acc3, $t1 422 add %rax, $t0 # guaranteed to be zero 423 mov $acc3, %rax 424 adc %rdx, $t0 425 426 sub $acc3, $acc5 427 sbb \$0, $acc3 # can't borrow 428 429 mulq 8*1(%r14) 430 add $t0, $acc4 431 adc \$0, %rdx 432 add %rax, $acc4 433 mov $t1, %rax 434 adc %rdx, $acc5 435 mov $t1, %rdx 436 adc \$0, $acc3 # can't overflow 437 438 shl \$32, %rax 439 shr \$32, %rdx 440 sub %rax, $acc0 441 sbb %rdx, $t1 # can't borrow 442 443 add $acc3, $acc0 444 adc $t1, $acc1 445 adc \$0, $acc2 446 447 ################################# Subtract ord 448 mov $acc4, $a_ptr 449 sub 8*0(%r14), $acc4 450 mov $acc5, $acc3 451 sbb 8*1(%r14), $acc5 452 mov $acc0, $t0 453 sbb 8*2(%r14), $acc0 454 mov $acc1, $t1 455 sbb 8*3(%r14), $acc1 456 sbb \$0, $acc2 457 458 cmovc $a_ptr, $acc4 459 cmovc $acc3, $acc5 460 cmovc $t0, $acc0 461 cmovc $t1, $acc1 462 463 mov $acc4, 8*0($r_ptr) 464 mov $acc5, 8*1($r_ptr) 465 mov $acc0, 8*2($r_ptr) 466 mov $acc1, 8*3($r_ptr) 467 468 mov 0(%rsp),%r15 469.cfi_restore %r15 470 mov 8(%rsp),%r14 471.cfi_restore %r14 472 mov 16(%rsp),%r13 473.cfi_restore %r13 474 mov 24(%rsp),%r12 475.cfi_restore %r12 476 mov 32(%rsp),%rbx 
477.cfi_restore %rbx 478 mov 40(%rsp),%rbp 479.cfi_restore %rbp 480 lea 48(%rsp),%rsp 481.cfi_adjust_cfa_offset -48 482.Lord_mul_epilogue: 483 ret 484.cfi_endproc 485.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont 486 487################################################################################ 488# void ecp_nistz256_ord_sqr_mont( 489# uint64_t res[4], 490# uint64_t a[4], 491# uint64_t rep); 492 493.globl ecp_nistz256_ord_sqr_mont 494.type ecp_nistz256_ord_sqr_mont,\@function,3 495.align 32 496ecp_nistz256_ord_sqr_mont: 497.cfi_startproc 498___ 499$code.=<<___ if ($addx); 500 leaq OPENSSL_ia32cap_P(%rip), %rcx 501 mov 8(%rcx), %rcx 502 and \$0x80100, %ecx 503 cmp \$0x80100, %ecx 504 je .Lecp_nistz256_ord_sqr_montx 505___ 506$code.=<<___; 507 push %rbp 508.cfi_push %rbp 509 push %rbx 510.cfi_push %rbx 511 push %r12 512.cfi_push %r12 513 push %r13 514.cfi_push %r13 515 push %r14 516.cfi_push %r14 517 push %r15 518.cfi_push %r15 519.Lord_sqr_body: 520 521 mov 8*0($a_ptr), $acc0 522 mov 8*1($a_ptr), %rax 523 mov 8*2($a_ptr), $acc6 524 mov 8*3($a_ptr), $acc7 525 lea .Lord(%rip), $a_ptr # pointer to modulus 526 mov $b_org, $b_ptr 527 jmp .Loop_ord_sqr 528 529.align 32 530.Loop_ord_sqr: 531 ################################# a[1:] * a[0] 532 mov %rax, $t1 # put aside a[1] 533 mul $acc0 # a[1] * a[0] 534 mov %rax, $acc1 535 movq $t1, %xmm1 # offload a[1] 536 mov $acc6, %rax 537 mov %rdx, $acc2 538 539 mul $acc0 # a[2] * a[0] 540 add %rax, $acc2 541 mov $acc7, %rax 542 movq $acc6, %xmm2 # offload a[2] 543 adc \$0, %rdx 544 mov %rdx, $acc3 545 546 mul $acc0 # a[3] * a[0] 547 add %rax, $acc3 548 mov $acc7, %rax 549 movq $acc7, %xmm3 # offload a[3] 550 adc \$0, %rdx 551 mov %rdx, $acc4 552 553 ################################# a[3] * a[2] 554 mul $acc6 # a[3] * a[2] 555 mov %rax, $acc5 556 mov $acc6, %rax 557 mov %rdx, $acc6 558 559 ################################# a[2:] * a[1] 560 mul $t1 # a[2] * a[1] 561 add %rax, $acc3 562 mov $acc7, %rax 563 adc \$0, %rdx 564 mov %rdx, $acc7 565 566 mul $t1 # a[3] * a[1] 567 add %rax, $acc4 568 adc \$0, %rdx 569 570 add $acc7, $acc4 571 adc %rdx, $acc5 572 adc \$0, $acc6 # can't overflow 573 574 ################################# *2 575 xor $acc7, $acc7 576 mov $acc0, %rax 577 add $acc1, $acc1 578 adc $acc2, $acc2 579 adc $acc3, $acc3 580 adc $acc4, $acc4 581 adc $acc5, $acc5 582 adc $acc6, $acc6 583 adc \$0, $acc7 584 585 ################################# Missing products 586 mul %rax # a[0] * a[0] 587 mov %rax, $acc0 588 movq %xmm1, %rax 589 mov %rdx, $t1 590 591 mul %rax # a[1] * a[1] 592 add $t1, $acc1 593 adc %rax, $acc2 594 movq %xmm2, %rax 595 adc \$0, %rdx 596 mov %rdx, $t1 597 598 mul %rax # a[2] * a[2] 599 add $t1, $acc3 600 adc %rax, $acc4 601 movq %xmm3, %rax 602 adc \$0, %rdx 603 mov %rdx, $t1 604 605 mov $acc0, $t0 606 imulq 8*4($a_ptr), $acc0 # *= .LordK 607 608 mul %rax # a[3] * a[3] 609 add $t1, $acc5 610 adc %rax, $acc6 611 mov 8*0($a_ptr), %rax # modulus[0] 612 adc %rdx, $acc7 # can't overflow 613 614 ################################# First reduction step 615 mul $acc0 616 mov $acc0, $t1 617 add %rax, $t0 # guaranteed to be zero 618 mov 8*1($a_ptr), %rax # modulus[1] 619 adc %rdx, $t0 620 621 sub $acc0, $acc2 622 sbb \$0, $t1 # can't borrow 623 624 mul $acc0 625 add $t0, $acc1 626 adc \$0, %rdx 627 add %rax, $acc1 628 mov $acc0, %rax 629 adc %rdx, $acc2 630 mov $acc0, %rdx 631 adc \$0, $t1 # can't overflow 632 633 mov $acc1, $t0 634 imulq 8*4($a_ptr), $acc1 # *= .LordK 635 636 shl \$32, %rax 637 shr \$32, %rdx 638 sub %rax, 
$acc3 639 mov 8*0($a_ptr), %rax 640 sbb %rdx, $acc0 # can't borrow 641 642 add $t1, $acc3 643 adc \$0, $acc0 # can't overflow 644 645 ################################# Second reduction step 646 mul $acc1 647 mov $acc1, $t1 648 add %rax, $t0 # guaranteed to be zero 649 mov 8*1($a_ptr), %rax 650 adc %rdx, $t0 651 652 sub $acc1, $acc3 653 sbb \$0, $t1 # can't borrow 654 655 mul $acc1 656 add $t0, $acc2 657 adc \$0, %rdx 658 add %rax, $acc2 659 mov $acc1, %rax 660 adc %rdx, $acc3 661 mov $acc1, %rdx 662 adc \$0, $t1 # can't overflow 663 664 mov $acc2, $t0 665 imulq 8*4($a_ptr), $acc2 # *= .LordK 666 667 shl \$32, %rax 668 shr \$32, %rdx 669 sub %rax, $acc0 670 mov 8*0($a_ptr), %rax 671 sbb %rdx, $acc1 # can't borrow 672 673 add $t1, $acc0 674 adc \$0, $acc1 # can't overflow 675 676 ################################# Third reduction step 677 mul $acc2 678 mov $acc2, $t1 679 add %rax, $t0 # guaranteed to be zero 680 mov 8*1($a_ptr), %rax 681 adc %rdx, $t0 682 683 sub $acc2, $acc0 684 sbb \$0, $t1 # can't borrow 685 686 mul $acc2 687 add $t0, $acc3 688 adc \$0, %rdx 689 add %rax, $acc3 690 mov $acc2, %rax 691 adc %rdx, $acc0 692 mov $acc2, %rdx 693 adc \$0, $t1 # can't overflow 694 695 mov $acc3, $t0 696 imulq 8*4($a_ptr), $acc3 # *= .LordK 697 698 shl \$32, %rax 699 shr \$32, %rdx 700 sub %rax, $acc1 701 mov 8*0($a_ptr), %rax 702 sbb %rdx, $acc2 # can't borrow 703 704 add $t1, $acc1 705 adc \$0, $acc2 # can't overflow 706 707 ################################# Last reduction step 708 mul $acc3 709 mov $acc3, $t1 710 add %rax, $t0 # guaranteed to be zero 711 mov 8*1($a_ptr), %rax 712 adc %rdx, $t0 713 714 sub $acc3, $acc1 715 sbb \$0, $t1 # can't borrow 716 717 mul $acc3 718 add $t0, $acc0 719 adc \$0, %rdx 720 add %rax, $acc0 721 mov $acc3, %rax 722 adc %rdx, $acc1 723 mov $acc3, %rdx 724 adc \$0, $t1 # can't overflow 725 726 shl \$32, %rax 727 shr \$32, %rdx 728 sub %rax, $acc2 729 sbb %rdx, $acc3 # can't borrow 730 731 add $t1, $acc2 732 adc \$0, $acc3 # can't overflow 733 734 ################################# Add bits [511:256] of the sqr result 735 xor %rdx, %rdx 736 add $acc4, $acc0 737 adc $acc5, $acc1 738 mov $acc0, $acc4 739 adc $acc6, $acc2 740 adc $acc7, $acc3 741 mov $acc1, %rax 742 adc \$0, %rdx 743 744 ################################# Compare to modulus 745 sub 8*0($a_ptr), $acc0 746 mov $acc2, $acc6 747 sbb 8*1($a_ptr), $acc1 748 sbb 8*2($a_ptr), $acc2 749 mov $acc3, $acc7 750 sbb 8*3($a_ptr), $acc3 751 sbb \$0, %rdx 752 753 cmovc $acc4, $acc0 754 cmovnc $acc1, %rax 755 cmovnc $acc2, $acc6 756 cmovnc $acc3, $acc7 757 758 dec $b_ptr 759 jnz .Loop_ord_sqr 760 761 mov $acc0, 8*0($r_ptr) 762 mov %rax, 8*1($r_ptr) 763 pxor %xmm1, %xmm1 764 mov $acc6, 8*2($r_ptr) 765 pxor %xmm2, %xmm2 766 mov $acc7, 8*3($r_ptr) 767 pxor %xmm3, %xmm3 768 769 mov 0(%rsp),%r15 770.cfi_restore %r15 771 mov 8(%rsp),%r14 772.cfi_restore %r14 773 mov 16(%rsp),%r13 774.cfi_restore %r13 775 mov 24(%rsp),%r12 776.cfi_restore %r12 777 mov 32(%rsp),%rbx 778.cfi_restore %rbx 779 mov 40(%rsp),%rbp 780.cfi_restore %rbp 781 lea 48(%rsp),%rsp 782.cfi_adjust_cfa_offset -48 783.Lord_sqr_epilogue: 784 ret 785.cfi_endproc 786.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont 787___ 788 789$code.=<<___ if ($addx); 790################################################################################ 791.type ecp_nistz256_ord_mul_montx,\@function,3 792.align 32 793ecp_nistz256_ord_mul_montx: 794.cfi_startproc 795.Lecp_nistz256_ord_mul_montx: 796 push %rbp 797.cfi_push %rbp 798 push %rbx 799.cfi_push %rbx 800 push %r12 
801.cfi_push %r12 802 push %r13 803.cfi_push %r13 804 push %r14 805.cfi_push %r14 806 push %r15 807.cfi_push %r15 808.Lord_mulx_body: 809 810 mov $b_org, $b_ptr 811 mov 8*0($b_org), %rdx 812 mov 8*0($a_ptr), $acc1 813 mov 8*1($a_ptr), $acc2 814 mov 8*2($a_ptr), $acc3 815 mov 8*3($a_ptr), $acc4 816 lea -128($a_ptr), $a_ptr # control u-op density 817 lea .Lord-128(%rip), %r14 818 mov .LordK(%rip), %r15 819 820 ################################# Multiply by b[0] 821 mulx $acc1, $acc0, $acc1 822 mulx $acc2, $t0, $acc2 823 mulx $acc3, $t1, $acc3 824 add $t0, $acc1 825 mulx $acc4, $t0, $acc4 826 mov $acc0, %rdx 827 mulx %r15, %rdx, %rax 828 adc $t1, $acc2 829 adc $t0, $acc3 830 adc \$0, $acc4 831 832 ################################# reduction 833 xor $acc5, $acc5 # $acc5=0, cf=0, of=0 834 mulx 8*0+128(%r14), $t0, $t1 835 adcx $t0, $acc0 # guaranteed to be zero 836 adox $t1, $acc1 837 838 mulx 8*1+128(%r14), $t0, $t1 839 adcx $t0, $acc1 840 adox $t1, $acc2 841 842 mulx 8*2+128(%r14), $t0, $t1 843 adcx $t0, $acc2 844 adox $t1, $acc3 845 846 mulx 8*3+128(%r14), $t0, $t1 847 mov 8*1($b_ptr), %rdx 848 adcx $t0, $acc3 849 adox $t1, $acc4 850 adcx $acc0, $acc4 851 adox $acc0, $acc5 852 adc \$0, $acc5 # cf=0, of=0 853 854 ################################# Multiply by b[1] 855 mulx 8*0+128($a_ptr), $t0, $t1 856 adcx $t0, $acc1 857 adox $t1, $acc2 858 859 mulx 8*1+128($a_ptr), $t0, $t1 860 adcx $t0, $acc2 861 adox $t1, $acc3 862 863 mulx 8*2+128($a_ptr), $t0, $t1 864 adcx $t0, $acc3 865 adox $t1, $acc4 866 867 mulx 8*3+128($a_ptr), $t0, $t1 868 mov $acc1, %rdx 869 mulx %r15, %rdx, %rax 870 adcx $t0, $acc4 871 adox $t1, $acc5 872 873 adcx $acc0, $acc5 874 adox $acc0, $acc0 875 adc \$0, $acc0 # cf=0, of=0 876 877 ################################# reduction 878 mulx 8*0+128(%r14), $t0, $t1 879 adcx $t0, $acc1 # guaranteed to be zero 880 adox $t1, $acc2 881 882 mulx 8*1+128(%r14), $t0, $t1 883 adcx $t0, $acc2 884 adox $t1, $acc3 885 886 mulx 8*2+128(%r14), $t0, $t1 887 adcx $t0, $acc3 888 adox $t1, $acc4 889 890 mulx 8*3+128(%r14), $t0, $t1 891 mov 8*2($b_ptr), %rdx 892 adcx $t0, $acc4 893 adox $t1, $acc5 894 adcx $acc1, $acc5 895 adox $acc1, $acc0 896 adc \$0, $acc0 # cf=0, of=0 897 898 ################################# Multiply by b[2] 899 mulx 8*0+128($a_ptr), $t0, $t1 900 adcx $t0, $acc2 901 adox $t1, $acc3 902 903 mulx 8*1+128($a_ptr), $t0, $t1 904 adcx $t0, $acc3 905 adox $t1, $acc4 906 907 mulx 8*2+128($a_ptr), $t0, $t1 908 adcx $t0, $acc4 909 adox $t1, $acc5 910 911 mulx 8*3+128($a_ptr), $t0, $t1 912 mov $acc2, %rdx 913 mulx %r15, %rdx, %rax 914 adcx $t0, $acc5 915 adox $t1, $acc0 916 917 adcx $acc1, $acc0 918 adox $acc1, $acc1 919 adc \$0, $acc1 # cf=0, of=0 920 921 ################################# reduction 922 mulx 8*0+128(%r14), $t0, $t1 923 adcx $t0, $acc2 # guaranteed to be zero 924 adox $t1, $acc3 925 926 mulx 8*1+128(%r14), $t0, $t1 927 adcx $t0, $acc3 928 adox $t1, $acc4 929 930 mulx 8*2+128(%r14), $t0, $t1 931 adcx $t0, $acc4 932 adox $t1, $acc5 933 934 mulx 8*3+128(%r14), $t0, $t1 935 mov 8*3($b_ptr), %rdx 936 adcx $t0, $acc5 937 adox $t1, $acc0 938 adcx $acc2, $acc0 939 adox $acc2, $acc1 940 adc \$0, $acc1 # cf=0, of=0 941 942 ################################# Multiply by b[3] 943 mulx 8*0+128($a_ptr), $t0, $t1 944 adcx $t0, $acc3 945 adox $t1, $acc4 946 947 mulx 8*1+128($a_ptr), $t0, $t1 948 adcx $t0, $acc4 949 adox $t1, $acc5 950 951 mulx 8*2+128($a_ptr), $t0, $t1 952 adcx $t0, $acc5 953 adox $t1, $acc0 954 955 mulx 8*3+128($a_ptr), $t0, $t1 956 mov $acc3, %rdx 957 mulx %r15, %rdx, %rax 
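	#################################
	# Expository note (added commentary; the symbolic names q, k0,
	# ord and acc[] are ours, not the code's): %rdx now holds the
	# Montgomery quotient for the final reduction step,
	#	q = acc3 * k0 mod 2^64,  k0 = .LordK = -ord^{-1} mod 2^64,
	# chosen so that acc3 + q*ord == 0 (mod 2^64).  Each "reduction"
	# block in this routine is one word of word-serial Montgomery
	# reduction; roughly, in C-style pseudocode:
	#	q = acc[i] * k0;          /* truncated to 64 bits */
	#	acc[i .. i+4] += q * ord; /* lowest word cancels  */
	# After four such steps, plus the final conditional subtraction,
	# the result is a*b*2^-256 mod ord.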
958 adcx $t0, $acc0 959 adox $t1, $acc1 960 961 adcx $acc2, $acc1 962 adox $acc2, $acc2 963 adc \$0, $acc2 # cf=0, of=0 964 965 ################################# reduction 966 mulx 8*0+128(%r14), $t0, $t1 967 adcx $t0, $acc3 # guranteed to be zero 968 adox $t1, $acc4 969 970 mulx 8*1+128(%r14), $t0, $t1 971 adcx $t0, $acc4 972 adox $t1, $acc5 973 974 mulx 8*2+128(%r14), $t0, $t1 975 adcx $t0, $acc5 976 adox $t1, $acc0 977 978 mulx 8*3+128(%r14), $t0, $t1 979 lea 128(%r14),%r14 980 mov $acc4, $t2 981 adcx $t0, $acc0 982 adox $t1, $acc1 983 mov $acc5, $t3 984 adcx $acc3, $acc1 985 adox $acc3, $acc2 986 adc \$0, $acc2 987 988 ################################# 989 # Branch-less conditional subtraction of P 990 mov $acc0, $t0 991 sub 8*0(%r14), $acc4 992 sbb 8*1(%r14), $acc5 993 sbb 8*2(%r14), $acc0 994 mov $acc1, $t1 995 sbb 8*3(%r14), $acc1 996 sbb \$0, $acc2 997 998 cmovc $t2, $acc4 999 cmovc $t3, $acc5 1000 cmovc $t0, $acc0 1001 cmovc $t1, $acc1 1002 1003 mov $acc4, 8*0($r_ptr) 1004 mov $acc5, 8*1($r_ptr) 1005 mov $acc0, 8*2($r_ptr) 1006 mov $acc1, 8*3($r_ptr) 1007 1008 mov 0(%rsp),%r15 1009.cfi_restore %r15 1010 mov 8(%rsp),%r14 1011.cfi_restore %r14 1012 mov 16(%rsp),%r13 1013.cfi_restore %r13 1014 mov 24(%rsp),%r12 1015.cfi_restore %r12 1016 mov 32(%rsp),%rbx 1017.cfi_restore %rbx 1018 mov 40(%rsp),%rbp 1019.cfi_restore %rbp 1020 lea 48(%rsp),%rsp 1021.cfi_adjust_cfa_offset -48 1022.Lord_mulx_epilogue: 1023 ret 1024.cfi_endproc 1025.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx 1026 1027.type ecp_nistz256_ord_sqr_montx,\@function,3 1028.align 32 1029ecp_nistz256_ord_sqr_montx: 1030.cfi_startproc 1031.Lecp_nistz256_ord_sqr_montx: 1032 push %rbp 1033.cfi_push %rbp 1034 push %rbx 1035.cfi_push %rbx 1036 push %r12 1037.cfi_push %r12 1038 push %r13 1039.cfi_push %r13 1040 push %r14 1041.cfi_push %r14 1042 push %r15 1043.cfi_push %r15 1044.Lord_sqrx_body: 1045 1046 mov $b_org, $b_ptr 1047 mov 8*0($a_ptr), %rdx 1048 mov 8*1($a_ptr), $acc6 1049 mov 8*2($a_ptr), $acc7 1050 mov 8*3($a_ptr), $acc0 1051 lea .Lord(%rip), $a_ptr 1052 jmp .Loop_ord_sqrx 1053 1054.align 32 1055.Loop_ord_sqrx: 1056 mulx $acc6, $acc1, $acc2 # a[0]*a[1] 1057 mulx $acc7, $t0, $acc3 # a[0]*a[2] 1058 mov %rdx, %rax # offload a[0] 1059 movq $acc6, %xmm1 # offload a[1] 1060 mulx $acc0, $t1, $acc4 # a[0]*a[3] 1061 mov $acc6, %rdx 1062 add $t0, $acc2 1063 movq $acc7, %xmm2 # offload a[2] 1064 adc $t1, $acc3 1065 adc \$0, $acc4 1066 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 1067 ################################# 1068 mulx $acc7, $t0, $t1 # a[1]*a[2] 1069 adcx $t0, $acc3 1070 adox $t1, $acc4 1071 1072 mulx $acc0, $t0, $t1 # a[1]*a[3] 1073 mov $acc7, %rdx 1074 adcx $t0, $acc4 1075 adox $t1, $acc5 1076 adc \$0, $acc5 1077 ################################# 1078 mulx $acc0, $t0, $acc6 # a[2]*a[3] 1079 mov %rax, %rdx 1080 movq $acc0, %xmm3 # offload a[3] 1081 xor $acc7, $acc7 # $acc7=0,cf=0,of=0 1082 adcx $acc1, $acc1 # acc1:6<<1 1083 adox $t0, $acc5 1084 adcx $acc2, $acc2 1085 adox $acc7, $acc6 # of=0 1086 1087 ################################# a[i]*a[i] 1088 mulx %rdx, $acc0, $t1 1089 movq %xmm1, %rdx 1090 adcx $acc3, $acc3 1091 adox $t1, $acc1 1092 adcx $acc4, $acc4 1093 mulx %rdx, $t0, $t4 1094 movq %xmm2, %rdx 1095 adcx $acc5, $acc5 1096 adox $t0, $acc2 1097 adcx $acc6, $acc6 1098 mulx %rdx, $t0, $t1 1099 .byte 0x67 1100 movq %xmm3, %rdx 1101 adox $t4, $acc3 1102 adcx $acc7, $acc7 1103 adox $t0, $acc4 1104 adox $t1, $acc5 1105 mulx %rdx, $t0, $t4 1106 adox $t0, $acc6 1107 adox $t4, $acc7 1108 1109 
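	#################################
	# Expository note (added; symbol names are for exposition only):
	# acc0..acc7 now hold the full 512-bit square of the input.  The
	# off-diagonal products a[i]*a[j], i<j, were computed once and
	# doubled, then the diagonal squares were added, following
	#	a^2 = sum_i a[i]^2 * 2^(128*i)
	#	    + 2 * sum_{i<j} a[i]*a[j] * 2^(64*(i+j)),
	# i.e. 10 mulx instead of the 16 a general 4x4 product would
	# need.  The four reduction steps below then fold the low 256
	# bits away modulo ord (Montgomery reduction via k0 = .LordK).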
################################# reduction 1110 mov $acc0, %rdx 1111 mulx 8*4($a_ptr), %rdx, $t0 1112 1113 xor %rax, %rax # cf=0, of=0 1114 mulx 8*0($a_ptr), $t0, $t1 1115 adcx $t0, $acc0 # guaranteed to be zero 1116 adox $t1, $acc1 1117 mulx 8*1($a_ptr), $t0, $t1 1118 adcx $t0, $acc1 1119 adox $t1, $acc2 1120 mulx 8*2($a_ptr), $t0, $t1 1121 adcx $t0, $acc2 1122 adox $t1, $acc3 1123 mulx 8*3($a_ptr), $t0, $t1 1124 adcx $t0, $acc3 1125 adox $t1, $acc0 # of=0 1126 adcx %rax, $acc0 # cf=0 1127 1128 ################################# 1129 mov $acc1, %rdx 1130 mulx 8*4($a_ptr), %rdx, $t0 1131 1132 mulx 8*0($a_ptr), $t0, $t1 1133 adox $t0, $acc1 # guaranteed to be zero 1134 adcx $t1, $acc2 1135 mulx 8*1($a_ptr), $t0, $t1 1136 adox $t0, $acc2 1137 adcx $t1, $acc3 1138 mulx 8*2($a_ptr), $t0, $t1 1139 adox $t0, $acc3 1140 adcx $t1, $acc0 1141 mulx 8*3($a_ptr), $t0, $t1 1142 adox $t0, $acc0 1143 adcx $t1, $acc1 # cf=0 1144 adox %rax, $acc1 # of=0 1145 1146 ################################# 1147 mov $acc2, %rdx 1148 mulx 8*4($a_ptr), %rdx, $t0 1149 1150 mulx 8*0($a_ptr), $t0, $t1 1151 adcx $t0, $acc2 # guaranteed to be zero 1152 adox $t1, $acc3 1153 mulx 8*1($a_ptr), $t0, $t1 1154 adcx $t0, $acc3 1155 adox $t1, $acc0 1156 mulx 8*2($a_ptr), $t0, $t1 1157 adcx $t0, $acc0 1158 adox $t1, $acc1 1159 mulx 8*3($a_ptr), $t0, $t1 1160 adcx $t0, $acc1 1161 adox $t1, $acc2 # of=0 1162 adcx %rax, $acc2 # cf=0 1163 1164 ################################# 1165 mov $acc3, %rdx 1166 mulx 8*4($a_ptr), %rdx, $t0 1167 1168 mulx 8*0($a_ptr), $t0, $t1 1169 adox $t0, $acc3 # guaranteed to be zero 1170 adcx $t1, $acc0 1171 mulx 8*1($a_ptr), $t0, $t1 1172 adox $t0, $acc0 1173 adcx $t1, $acc1 1174 mulx 8*2($a_ptr), $t0, $t1 1175 adox $t0, $acc1 1176 adcx $t1, $acc2 1177 mulx 8*3($a_ptr), $t0, $t1 1178 adox $t0, $acc2 1179 adcx $t1, $acc3 1180 adox %rax, $acc3 1181 1182 ################################# accumulate upper half 1183 add $acc0, $acc4 # add $acc4, $acc0 1184 adc $acc5, $acc1 1185 mov $acc4, %rdx 1186 adc $acc6, $acc2 1187 adc $acc7, $acc3 1188 mov $acc1, $acc6 1189 adc \$0, %rax 1190 1191 ################################# compare to modulus 1192 sub 8*0($a_ptr), $acc4 1193 mov $acc2, $acc7 1194 sbb 8*1($a_ptr), $acc1 1195 sbb 8*2($a_ptr), $acc2 1196 mov $acc3, $acc0 1197 sbb 8*3($a_ptr), $acc3 1198 sbb \$0, %rax 1199 1200 cmovnc $acc4, %rdx 1201 cmovnc $acc1, $acc6 1202 cmovnc $acc2, $acc7 1203 cmovnc $acc3, $acc0 1204 1205 dec $b_ptr 1206 jnz .Loop_ord_sqrx 1207 1208 mov %rdx, 8*0($r_ptr) 1209 mov $acc6, 8*1($r_ptr) 1210 pxor %xmm1, %xmm1 1211 mov $acc7, 8*2($r_ptr) 1212 pxor %xmm2, %xmm2 1213 mov $acc0, 8*3($r_ptr) 1214 pxor %xmm3, %xmm3 1215 1216 mov 0(%rsp),%r15 1217.cfi_restore %r15 1218 mov 8(%rsp),%r14 1219.cfi_restore %r14 1220 mov 16(%rsp),%r13 1221.cfi_restore %r13 1222 mov 24(%rsp),%r12 1223.cfi_restore %r12 1224 mov 32(%rsp),%rbx 1225.cfi_restore %rbx 1226 mov 40(%rsp),%rbp 1227.cfi_restore %rbp 1228 lea 48(%rsp),%rsp 1229.cfi_adjust_cfa_offset -48 1230.Lord_sqrx_epilogue: 1231 ret 1232.cfi_endproc 1233.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx 1234___ 1235 1236$code.=<<___; 1237################################################################################ 1238# void ecp_nistz256_mul_mont( 1239# uint64_t res[4], 1240# uint64_t a[4], 1241# uint64_t b[4]); 1242 1243.globl ecp_nistz256_mul_mont 1244.type ecp_nistz256_mul_mont,\@function,3 1245.align 32 1246ecp_nistz256_mul_mont: 1247.cfi_startproc 1248___ 1249$code.=<<___ if ($addx); 1250 leaq OPENSSL_ia32cap_P(%rip), %rcx 1251 mov 
8(%rcx), %rcx 1252 and \$0x80100, %ecx 1253___ 1254$code.=<<___; 1255.Lmul_mont: 1256 push %rbp 1257.cfi_push %rbp 1258 push %rbx 1259.cfi_push %rbx 1260 push %r12 1261.cfi_push %r12 1262 push %r13 1263.cfi_push %r13 1264 push %r14 1265.cfi_push %r14 1266 push %r15 1267.cfi_push %r15 1268.Lmul_body: 1269___ 1270$code.=<<___ if ($addx); 1271 cmp \$0x80100, %ecx 1272 je .Lmul_montx 1273___ 1274$code.=<<___; 1275 mov $b_org, $b_ptr 1276 mov 8*0($b_org), %rax 1277 mov 8*0($a_ptr), $acc1 1278 mov 8*1($a_ptr), $acc2 1279 mov 8*2($a_ptr), $acc3 1280 mov 8*3($a_ptr), $acc4 1281 1282 call __ecp_nistz256_mul_montq 1283___ 1284$code.=<<___ if ($addx); 1285 jmp .Lmul_mont_done 1286 1287.align 32 1288.Lmul_montx: 1289 mov $b_org, $b_ptr 1290 mov 8*0($b_org), %rdx 1291 mov 8*0($a_ptr), $acc1 1292 mov 8*1($a_ptr), $acc2 1293 mov 8*2($a_ptr), $acc3 1294 mov 8*3($a_ptr), $acc4 1295 lea -128($a_ptr), $a_ptr # control u-op density 1296 1297 call __ecp_nistz256_mul_montx 1298___ 1299$code.=<<___; 1300.Lmul_mont_done: 1301 mov 0(%rsp),%r15 1302.cfi_restore %r15 1303 mov 8(%rsp),%r14 1304.cfi_restore %r14 1305 mov 16(%rsp),%r13 1306.cfi_restore %r13 1307 mov 24(%rsp),%r12 1308.cfi_restore %r12 1309 mov 32(%rsp),%rbx 1310.cfi_restore %rbx 1311 mov 40(%rsp),%rbp 1312.cfi_restore %rbp 1313 lea 48(%rsp),%rsp 1314.cfi_adjust_cfa_offset -48 1315.Lmul_epilogue: 1316 ret 1317.cfi_endproc 1318.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont 1319 1320.type __ecp_nistz256_mul_montq,\@abi-omnipotent 1321.align 32 1322__ecp_nistz256_mul_montq: 1323.cfi_startproc 1324 ######################################################################## 1325 # Multiply a by b[0] 1326 mov %rax, $t1 1327 mulq $acc1 1328 mov .Lpoly+8*1(%rip),$poly1 1329 mov %rax, $acc0 1330 mov $t1, %rax 1331 mov %rdx, $acc1 1332 1333 mulq $acc2 1334 mov .Lpoly+8*3(%rip),$poly3 1335 add %rax, $acc1 1336 mov $t1, %rax 1337 adc \$0, %rdx 1338 mov %rdx, $acc2 1339 1340 mulq $acc3 1341 add %rax, $acc2 1342 mov $t1, %rax 1343 adc \$0, %rdx 1344 mov %rdx, $acc3 1345 1346 mulq $acc4 1347 add %rax, $acc3 1348 mov $acc0, %rax 1349 adc \$0, %rdx 1350 xor $acc5, $acc5 1351 mov %rdx, $acc4 1352 1353 ######################################################################## 1354 # First reduction step 1355 # Basically now we want to multiply acc[0] by p256, 1356 # and add the result to the acc. 
1357 # Due to the special form of p256 we do some optimizations 1358 # 1359 # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] 1360 # then we add acc[0] and get acc[0] x 2^96 1361 1362 mov $acc0, $t1 1363 shl \$32, $acc0 1364 mulq $poly3 1365 shr \$32, $t1 1366 add $acc0, $acc1 # +=acc[0]<<96 1367 adc $t1, $acc2 1368 adc %rax, $acc3 1369 mov 8*1($b_ptr), %rax 1370 adc %rdx, $acc4 1371 adc \$0, $acc5 1372 xor $acc0, $acc0 1373 1374 ######################################################################## 1375 # Multiply by b[1] 1376 mov %rax, $t1 1377 mulq 8*0($a_ptr) 1378 add %rax, $acc1 1379 mov $t1, %rax 1380 adc \$0, %rdx 1381 mov %rdx, $t0 1382 1383 mulq 8*1($a_ptr) 1384 add $t0, $acc2 1385 adc \$0, %rdx 1386 add %rax, $acc2 1387 mov $t1, %rax 1388 adc \$0, %rdx 1389 mov %rdx, $t0 1390 1391 mulq 8*2($a_ptr) 1392 add $t0, $acc3 1393 adc \$0, %rdx 1394 add %rax, $acc3 1395 mov $t1, %rax 1396 adc \$0, %rdx 1397 mov %rdx, $t0 1398 1399 mulq 8*3($a_ptr) 1400 add $t0, $acc4 1401 adc \$0, %rdx 1402 add %rax, $acc4 1403 mov $acc1, %rax 1404 adc %rdx, $acc5 1405 adc \$0, $acc0 1406 1407 ######################################################################## 1408 # Second reduction step 1409 mov $acc1, $t1 1410 shl \$32, $acc1 1411 mulq $poly3 1412 shr \$32, $t1 1413 add $acc1, $acc2 1414 adc $t1, $acc3 1415 adc %rax, $acc4 1416 mov 8*2($b_ptr), %rax 1417 adc %rdx, $acc5 1418 adc \$0, $acc0 1419 xor $acc1, $acc1 1420 1421 ######################################################################## 1422 # Multiply by b[2] 1423 mov %rax, $t1 1424 mulq 8*0($a_ptr) 1425 add %rax, $acc2 1426 mov $t1, %rax 1427 adc \$0, %rdx 1428 mov %rdx, $t0 1429 1430 mulq 8*1($a_ptr) 1431 add $t0, $acc3 1432 adc \$0, %rdx 1433 add %rax, $acc3 1434 mov $t1, %rax 1435 adc \$0, %rdx 1436 mov %rdx, $t0 1437 1438 mulq 8*2($a_ptr) 1439 add $t0, $acc4 1440 adc \$0, %rdx 1441 add %rax, $acc4 1442 mov $t1, %rax 1443 adc \$0, %rdx 1444 mov %rdx, $t0 1445 1446 mulq 8*3($a_ptr) 1447 add $t0, $acc5 1448 adc \$0, %rdx 1449 add %rax, $acc5 1450 mov $acc2, %rax 1451 adc %rdx, $acc0 1452 adc \$0, $acc1 1453 1454 ######################################################################## 1455 # Third reduction step 1456 mov $acc2, $t1 1457 shl \$32, $acc2 1458 mulq $poly3 1459 shr \$32, $t1 1460 add $acc2, $acc3 1461 adc $t1, $acc4 1462 adc %rax, $acc5 1463 mov 8*3($b_ptr), %rax 1464 adc %rdx, $acc0 1465 adc \$0, $acc1 1466 xor $acc2, $acc2 1467 1468 ######################################################################## 1469 # Multiply by b[3] 1470 mov %rax, $t1 1471 mulq 8*0($a_ptr) 1472 add %rax, $acc3 1473 mov $t1, %rax 1474 adc \$0, %rdx 1475 mov %rdx, $t0 1476 1477 mulq 8*1($a_ptr) 1478 add $t0, $acc4 1479 adc \$0, %rdx 1480 add %rax, $acc4 1481 mov $t1, %rax 1482 adc \$0, %rdx 1483 mov %rdx, $t0 1484 1485 mulq 8*2($a_ptr) 1486 add $t0, $acc5 1487 adc \$0, %rdx 1488 add %rax, $acc5 1489 mov $t1, %rax 1490 adc \$0, %rdx 1491 mov %rdx, $t0 1492 1493 mulq 8*3($a_ptr) 1494 add $t0, $acc0 1495 adc \$0, %rdx 1496 add %rax, $acc0 1497 mov $acc3, %rax 1498 adc %rdx, $acc1 1499 adc \$0, $acc2 1500 1501 ######################################################################## 1502 # Final reduction step 1503 mov $acc3, $t1 1504 shl \$32, $acc3 1505 mulq $poly3 1506 shr \$32, $t1 1507 add $acc3, $acc4 1508 adc $t1, $acc5 1509 mov $acc4, $t0 1510 adc %rax, $acc0 1511 adc %rdx, $acc1 1512 mov $acc5, $t1 1513 adc \$0, $acc2 1514 1515 ######################################################################## 1516 # Branch-less conditional subtraction of 
P 1517 sub \$-1, $acc4 # .Lpoly[0] 1518 mov $acc0, $t2 1519 sbb $poly1, $acc5 # .Lpoly[1] 1520 sbb \$0, $acc0 # .Lpoly[2] 1521 mov $acc1, $t3 1522 sbb $poly3, $acc1 # .Lpoly[3] 1523 sbb \$0, $acc2 1524 1525 cmovc $t0, $acc4 1526 cmovc $t1, $acc5 1527 mov $acc4, 8*0($r_ptr) 1528 cmovc $t2, $acc0 1529 mov $acc5, 8*1($r_ptr) 1530 cmovc $t3, $acc1 1531 mov $acc0, 8*2($r_ptr) 1532 mov $acc1, 8*3($r_ptr) 1533 1534 ret 1535.cfi_endproc 1536.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq 1537 1538################################################################################ 1539# void ecp_nistz256_sqr_mont( 1540# uint64_t res[4], 1541# uint64_t a[4]); 1542 1543# we optimize the square according to S.Gueron and V.Krasnov, 1544# "Speeding up Big-Number Squaring" 1545.globl ecp_nistz256_sqr_mont 1546.type ecp_nistz256_sqr_mont,\@function,2 1547.align 32 1548ecp_nistz256_sqr_mont: 1549.cfi_startproc 1550___ 1551$code.=<<___ if ($addx); 1552 leaq OPENSSL_ia32cap_P(%rip), %rcx 1553 mov 8(%rcx), %rcx 1554 and \$0x80100, %ecx 1555___ 1556$code.=<<___; 1557 push %rbp 1558.cfi_push %rbp 1559 push %rbx 1560.cfi_push %rbx 1561 push %r12 1562.cfi_push %r12 1563 push %r13 1564.cfi_push %r13 1565 push %r14 1566.cfi_push %r14 1567 push %r15 1568.cfi_push %r15 1569.Lsqr_body: 1570___ 1571$code.=<<___ if ($addx); 1572 cmp \$0x80100, %ecx 1573 je .Lsqr_montx 1574___ 1575$code.=<<___; 1576 mov 8*0($a_ptr), %rax 1577 mov 8*1($a_ptr), $acc6 1578 mov 8*2($a_ptr), $acc7 1579 mov 8*3($a_ptr), $acc0 1580 1581 call __ecp_nistz256_sqr_montq 1582___ 1583$code.=<<___ if ($addx); 1584 jmp .Lsqr_mont_done 1585 1586.align 32 1587.Lsqr_montx: 1588 mov 8*0($a_ptr), %rdx 1589 mov 8*1($a_ptr), $acc6 1590 mov 8*2($a_ptr), $acc7 1591 mov 8*3($a_ptr), $acc0 1592 lea -128($a_ptr), $a_ptr # control u-op density 1593 1594 call __ecp_nistz256_sqr_montx 1595___ 1596$code.=<<___; 1597.Lsqr_mont_done: 1598 mov 0(%rsp),%r15 1599.cfi_restore %r15 1600 mov 8(%rsp),%r14 1601.cfi_restore %r14 1602 mov 16(%rsp),%r13 1603.cfi_restore %r13 1604 mov 24(%rsp),%r12 1605.cfi_restore %r12 1606 mov 32(%rsp),%rbx 1607.cfi_restore %rbx 1608 mov 40(%rsp),%rbp 1609.cfi_restore %rbp 1610 lea 48(%rsp),%rsp 1611.cfi_adjust_cfa_offset -48 1612.Lsqr_epilogue: 1613 ret 1614.cfi_endproc 1615.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont 1616 1617.type __ecp_nistz256_sqr_montq,\@abi-omnipotent 1618.align 32 1619__ecp_nistz256_sqr_montq: 1620.cfi_startproc 1621 mov %rax, $acc5 1622 mulq $acc6 # a[1]*a[0] 1623 mov %rax, $acc1 1624 mov $acc7, %rax 1625 mov %rdx, $acc2 1626 1627 mulq $acc5 # a[0]*a[2] 1628 add %rax, $acc2 1629 mov $acc0, %rax 1630 adc \$0, %rdx 1631 mov %rdx, $acc3 1632 1633 mulq $acc5 # a[0]*a[3] 1634 add %rax, $acc3 1635 mov $acc7, %rax 1636 adc \$0, %rdx 1637 mov %rdx, $acc4 1638 1639 ################################# 1640 mulq $acc6 # a[1]*a[2] 1641 add %rax, $acc3 1642 mov $acc0, %rax 1643 adc \$0, %rdx 1644 mov %rdx, $t1 1645 1646 mulq $acc6 # a[1]*a[3] 1647 add %rax, $acc4 1648 mov $acc0, %rax 1649 adc \$0, %rdx 1650 add $t1, $acc4 1651 mov %rdx, $acc5 1652 adc \$0, $acc5 1653 1654 ################################# 1655 mulq $acc7 # a[2]*a[3] 1656 xor $acc7, $acc7 1657 add %rax, $acc5 1658 mov 8*0($a_ptr), %rax 1659 mov %rdx, $acc6 1660 adc \$0, $acc6 1661 1662 add $acc1, $acc1 # acc1:6<<1 1663 adc $acc2, $acc2 1664 adc $acc3, $acc3 1665 adc $acc4, $acc4 1666 adc $acc5, $acc5 1667 adc $acc6, $acc6 1668 adc \$0, $acc7 1669 1670 mulq %rax 1671 mov %rax, $acc0 1672 mov 8*1($a_ptr), %rax 1673 mov %rdx, $t0 1674 1675 mulq %rax 1676 add 
$t0, $acc1 1677 adc %rax, $acc2 1678 mov 8*2($a_ptr), %rax 1679 adc \$0, %rdx 1680 mov %rdx, $t0 1681 1682 mulq %rax 1683 add $t0, $acc3 1684 adc %rax, $acc4 1685 mov 8*3($a_ptr), %rax 1686 adc \$0, %rdx 1687 mov %rdx, $t0 1688 1689 mulq %rax 1690 add $t0, $acc5 1691 adc %rax, $acc6 1692 mov $acc0, %rax 1693 adc %rdx, $acc7 1694 1695 mov .Lpoly+8*1(%rip), $a_ptr 1696 mov .Lpoly+8*3(%rip), $t1 1697 1698 ########################################## 1699 # Now the reduction 1700 # First iteration 1701 mov $acc0, $t0 1702 shl \$32, $acc0 1703 mulq $t1 1704 shr \$32, $t0 1705 add $acc0, $acc1 # +=acc[0]<<96 1706 adc $t0, $acc2 1707 adc %rax, $acc3 1708 mov $acc1, %rax 1709 adc \$0, %rdx 1710 1711 ########################################## 1712 # Second iteration 1713 mov $acc1, $t0 1714 shl \$32, $acc1 1715 mov %rdx, $acc0 1716 mulq $t1 1717 shr \$32, $t0 1718 add $acc1, $acc2 1719 adc $t0, $acc3 1720 adc %rax, $acc0 1721 mov $acc2, %rax 1722 adc \$0, %rdx 1723 1724 ########################################## 1725 # Third iteration 1726 mov $acc2, $t0 1727 shl \$32, $acc2 1728 mov %rdx, $acc1 1729 mulq $t1 1730 shr \$32, $t0 1731 add $acc2, $acc3 1732 adc $t0, $acc0 1733 adc %rax, $acc1 1734 mov $acc3, %rax 1735 adc \$0, %rdx 1736 1737 ########################################### 1738 # Last iteration 1739 mov $acc3, $t0 1740 shl \$32, $acc3 1741 mov %rdx, $acc2 1742 mulq $t1 1743 shr \$32, $t0 1744 add $acc3, $acc0 1745 adc $t0, $acc1 1746 adc %rax, $acc2 1747 adc \$0, %rdx 1748 xor $acc3, $acc3 1749 1750 ############################################ 1751 # Add the rest of the acc 1752 add $acc0, $acc4 1753 adc $acc1, $acc5 1754 mov $acc4, $acc0 1755 adc $acc2, $acc6 1756 adc %rdx, $acc7 1757 mov $acc5, $acc1 1758 adc \$0, $acc3 1759 1760 sub \$-1, $acc4 # .Lpoly[0] 1761 mov $acc6, $acc2 1762 sbb $a_ptr, $acc5 # .Lpoly[1] 1763 sbb \$0, $acc6 # .Lpoly[2] 1764 mov $acc7, $t0 1765 sbb $t1, $acc7 # .Lpoly[3] 1766 sbb \$0, $acc3 1767 1768 cmovc $acc0, $acc4 1769 cmovc $acc1, $acc5 1770 mov $acc4, 8*0($r_ptr) 1771 cmovc $acc2, $acc6 1772 mov $acc5, 8*1($r_ptr) 1773 cmovc $t0, $acc7 1774 mov $acc6, 8*2($r_ptr) 1775 mov $acc7, 8*3($r_ptr) 1776 1777 ret 1778.cfi_endproc 1779.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq 1780___ 1781 1782if ($addx) { 1783$code.=<<___; 1784.type __ecp_nistz256_mul_montx,\@abi-omnipotent 1785.align 32 1786__ecp_nistz256_mul_montx: 1787.cfi_startproc 1788 ######################################################################## 1789 # Multiply by b[0] 1790 mulx $acc1, $acc0, $acc1 1791 mulx $acc2, $t0, $acc2 1792 mov \$32, $poly1 1793 xor $acc5, $acc5 # cf=0 1794 mulx $acc3, $t1, $acc3 1795 mov .Lpoly+8*3(%rip), $poly3 1796 adc $t0, $acc1 1797 mulx $acc4, $t0, $acc4 1798 mov $acc0, %rdx 1799 adc $t1, $acc2 1800 shlx $poly1,$acc0,$t1 1801 adc $t0, $acc3 1802 shrx $poly1,$acc0,$t0 1803 adc \$0, $acc4 1804 1805 ######################################################################## 1806 # First reduction step 1807 add $t1, $acc1 1808 adc $t0, $acc2 1809 1810 mulx $poly3, $t0, $t1 1811 mov 8*1($b_ptr), %rdx 1812 adc $t0, $acc3 1813 adc $t1, $acc4 1814 adc \$0, $acc5 1815 xor $acc0, $acc0 # $acc0=0,cf=0,of=0 1816 1817 ######################################################################## 1818 # Multiply by b[1] 1819 mulx 8*0+128($a_ptr), $t0, $t1 1820 adcx $t0, $acc1 1821 adox $t1, $acc2 1822 1823 mulx 8*1+128($a_ptr), $t0, $t1 1824 adcx $t0, $acc2 1825 adox $t1, $acc3 1826 1827 mulx 8*2+128($a_ptr), $t0, $t1 1828 adcx $t0, $acc3 1829 adox $t1, $acc4 1830 1831 mulx 
8*3+128($a_ptr), $t0, $t1 1832 mov $acc1, %rdx 1833 adcx $t0, $acc4 1834 shlx $poly1, $acc1, $t0 1835 adox $t1, $acc5 1836 shrx $poly1, $acc1, $t1 1837 1838 adcx $acc0, $acc5 1839 adox $acc0, $acc0 1840 adc \$0, $acc0 1841 1842 ######################################################################## 1843 # Second reduction step 1844 add $t0, $acc2 1845 adc $t1, $acc3 1846 1847 mulx $poly3, $t0, $t1 1848 mov 8*2($b_ptr), %rdx 1849 adc $t0, $acc4 1850 adc $t1, $acc5 1851 adc \$0, $acc0 1852 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 1853 1854 ######################################################################## 1855 # Multiply by b[2] 1856 mulx 8*0+128($a_ptr), $t0, $t1 1857 adcx $t0, $acc2 1858 adox $t1, $acc3 1859 1860 mulx 8*1+128($a_ptr), $t0, $t1 1861 adcx $t0, $acc3 1862 adox $t1, $acc4 1863 1864 mulx 8*2+128($a_ptr), $t0, $t1 1865 adcx $t0, $acc4 1866 adox $t1, $acc5 1867 1868 mulx 8*3+128($a_ptr), $t0, $t1 1869 mov $acc2, %rdx 1870 adcx $t0, $acc5 1871 shlx $poly1, $acc2, $t0 1872 adox $t1, $acc0 1873 shrx $poly1, $acc2, $t1 1874 1875 adcx $acc1, $acc0 1876 adox $acc1, $acc1 1877 adc \$0, $acc1 1878 1879 ######################################################################## 1880 # Third reduction step 1881 add $t0, $acc3 1882 adc $t1, $acc4 1883 1884 mulx $poly3, $t0, $t1 1885 mov 8*3($b_ptr), %rdx 1886 adc $t0, $acc5 1887 adc $t1, $acc0 1888 adc \$0, $acc1 1889 xor $acc2, $acc2 # $acc2=0,cf=0,of=0 1890 1891 ######################################################################## 1892 # Multiply by b[3] 1893 mulx 8*0+128($a_ptr), $t0, $t1 1894 adcx $t0, $acc3 1895 adox $t1, $acc4 1896 1897 mulx 8*1+128($a_ptr), $t0, $t1 1898 adcx $t0, $acc4 1899 adox $t1, $acc5 1900 1901 mulx 8*2+128($a_ptr), $t0, $t1 1902 adcx $t0, $acc5 1903 adox $t1, $acc0 1904 1905 mulx 8*3+128($a_ptr), $t0, $t1 1906 mov $acc3, %rdx 1907 adcx $t0, $acc0 1908 shlx $poly1, $acc3, $t0 1909 adox $t1, $acc1 1910 shrx $poly1, $acc3, $t1 1911 1912 adcx $acc2, $acc1 1913 adox $acc2, $acc2 1914 adc \$0, $acc2 1915 1916 ######################################################################## 1917 # Fourth reduction step 1918 add $t0, $acc4 1919 adc $t1, $acc5 1920 1921 mulx $poly3, $t0, $t1 1922 mov $acc4, $t2 1923 mov .Lpoly+8*1(%rip), $poly1 1924 adc $t0, $acc0 1925 mov $acc5, $t3 1926 adc $t1, $acc1 1927 adc \$0, $acc2 1928 1929 ######################################################################## 1930 # Branch-less conditional subtraction of P 1931 xor %eax, %eax 1932 mov $acc0, $t0 1933 sbb \$-1, $acc4 # .Lpoly[0] 1934 sbb $poly1, $acc5 # .Lpoly[1] 1935 sbb \$0, $acc0 # .Lpoly[2] 1936 mov $acc1, $t1 1937 sbb $poly3, $acc1 # .Lpoly[3] 1938 sbb \$0, $acc2 1939 1940 cmovc $t2, $acc4 1941 cmovc $t3, $acc5 1942 mov $acc4, 8*0($r_ptr) 1943 cmovc $t0, $acc0 1944 mov $acc5, 8*1($r_ptr) 1945 cmovc $t1, $acc1 1946 mov $acc0, 8*2($r_ptr) 1947 mov $acc1, 8*3($r_ptr) 1948 1949 ret 1950.cfi_endproc 1951.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx 1952 1953.type __ecp_nistz256_sqr_montx,\@abi-omnipotent 1954.align 32 1955__ecp_nistz256_sqr_montx: 1956.cfi_startproc 1957 mulx $acc6, $acc1, $acc2 # a[0]*a[1] 1958 mulx $acc7, $t0, $acc3 # a[0]*a[2] 1959 xor %eax, %eax 1960 adc $t0, $acc2 1961 mulx $acc0, $t1, $acc4 # a[0]*a[3] 1962 mov $acc6, %rdx 1963 adc $t1, $acc3 1964 adc \$0, $acc4 1965 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 1966 1967 ################################# 1968 mulx $acc7, $t0, $t1 # a[1]*a[2] 1969 adcx $t0, $acc3 1970 adox $t1, $acc4 1971 1972 mulx $acc0, $t0, $t1 # a[1]*a[3] 1973 mov $acc7, %rdx 1974 
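	# Expository note (added): %rdx has just been reloaded with a[2]
	# for the upcoming a[2]*a[3] product, while the a[1]*a[3] result
	# still sitting in $t0:$t1 is folded into the carry chains below;
	# interleaving the operand reload with the adcx/adox chains helps
	# hide the mulx latency.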
adcx $t0, $acc4 1975 adox $t1, $acc5 1976 adc \$0, $acc5 1977 1978 ################################# 1979 mulx $acc0, $t0, $acc6 # a[2]*a[3] 1980 mov 8*0+128($a_ptr), %rdx 1981 xor $acc7, $acc7 # $acc7=0,cf=0,of=0 1982 adcx $acc1, $acc1 # acc1:6<<1 1983 adox $t0, $acc5 1984 adcx $acc2, $acc2 1985 adox $acc7, $acc6 # of=0 1986 1987 mulx %rdx, $acc0, $t1 1988 mov 8*1+128($a_ptr), %rdx 1989 adcx $acc3, $acc3 1990 adox $t1, $acc1 1991 adcx $acc4, $acc4 1992 mulx %rdx, $t0, $t4 1993 mov 8*2+128($a_ptr), %rdx 1994 adcx $acc5, $acc5 1995 adox $t0, $acc2 1996 adcx $acc6, $acc6 1997 .byte 0x67 1998 mulx %rdx, $t0, $t1 1999 mov 8*3+128($a_ptr), %rdx 2000 adox $t4, $acc3 2001 adcx $acc7, $acc7 2002 adox $t0, $acc4 2003 mov \$32, $a_ptr 2004 adox $t1, $acc5 2005 .byte 0x67,0x67 2006 mulx %rdx, $t0, $t4 2007 mov .Lpoly+8*3(%rip), %rdx 2008 adox $t0, $acc6 2009 shlx $a_ptr, $acc0, $t0 2010 adox $t4, $acc7 2011 shrx $a_ptr, $acc0, $t4 2012 mov %rdx,$t1 2013 2014 # reduction step 1 2015 add $t0, $acc1 2016 adc $t4, $acc2 2017 2018 mulx $acc0, $t0, $acc0 2019 adc $t0, $acc3 2020 shlx $a_ptr, $acc1, $t0 2021 adc \$0, $acc0 2022 shrx $a_ptr, $acc1, $t4 2023 2024 # reduction step 2 2025 add $t0, $acc2 2026 adc $t4, $acc3 2027 2028 mulx $acc1, $t0, $acc1 2029 adc $t0, $acc0 2030 shlx $a_ptr, $acc2, $t0 2031 adc \$0, $acc1 2032 shrx $a_ptr, $acc2, $t4 2033 2034 # reduction step 3 2035 add $t0, $acc3 2036 adc $t4, $acc0 2037 2038 mulx $acc2, $t0, $acc2 2039 adc $t0, $acc1 2040 shlx $a_ptr, $acc3, $t0 2041 adc \$0, $acc2 2042 shrx $a_ptr, $acc3, $t4 2043 2044 # reduction step 4 2045 add $t0, $acc0 2046 adc $t4, $acc1 2047 2048 mulx $acc3, $t0, $acc3 2049 adc $t0, $acc2 2050 adc \$0, $acc3 2051 2052 xor $t3, $t3 2053 add $acc0, $acc4 # accumulate upper half 2054 mov .Lpoly+8*1(%rip), $a_ptr 2055 adc $acc1, $acc5 2056 mov $acc4, $acc0 2057 adc $acc2, $acc6 2058 adc $acc3, $acc7 2059 mov $acc5, $acc1 2060 adc \$0, $t3 2061 2062 sub \$-1, $acc4 # .Lpoly[0] 2063 mov $acc6, $acc2 2064 sbb $a_ptr, $acc5 # .Lpoly[1] 2065 sbb \$0, $acc6 # .Lpoly[2] 2066 mov $acc7, $acc3 2067 sbb $t1, $acc7 # .Lpoly[3] 2068 sbb \$0, $t3 2069 2070 cmovc $acc0, $acc4 2071 cmovc $acc1, $acc5 2072 mov $acc4, 8*0($r_ptr) 2073 cmovc $acc2, $acc6 2074 mov $acc5, 8*1($r_ptr) 2075 cmovc $acc3, $acc7 2076 mov $acc6, 8*2($r_ptr) 2077 mov $acc7, 8*3($r_ptr) 2078 2079 ret 2080.cfi_endproc 2081.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx 2082___ 2083} 2084} 2085{ 2086my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2087my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); 2088my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); 2089my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); 2090 2091$code.=<<___; 2092################################################################################ 2093# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); 2094.globl ecp_nistz256_select_w5 2095.type ecp_nistz256_select_w5,\@abi-omnipotent 2096.align 32 2097ecp_nistz256_select_w5: 2098.cfi_startproc 2099___ 2100$code.=<<___ if ($avx>1); 2101 leaq OPENSSL_ia32cap_P(%rip), %rax 2102 mov 8(%rax), %rax 2103 test \$`1<<5`, %eax 2104 jnz .Lavx2_select_w5 2105___ 2106$code.=<<___ if ($win64); 2107 lea -0x88(%rsp), %rax 2108.LSEH_begin_ecp_nistz256_select_w5: 2109 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 2110 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 2111 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 2112 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 
0(%rax) 2113 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) 2114 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 2115 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 2116 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 2117 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 2118 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 2119 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 2120___ 2121$code.=<<___; 2122 movdqa .LOne(%rip), $ONE 2123 movd $index, $INDEX 2124 2125 pxor $Ra, $Ra 2126 pxor $Rb, $Rb 2127 pxor $Rc, $Rc 2128 pxor $Rd, $Rd 2129 pxor $Re, $Re 2130 pxor $Rf, $Rf 2131 2132 movdqa $ONE, $M0 2133 pshufd \$0, $INDEX, $INDEX 2134 2135 mov \$16, %rax 2136.Lselect_loop_sse_w5: 2137 2138 movdqa $M0, $TMP0 2139 paddd $ONE, $M0 2140 pcmpeqd $INDEX, $TMP0 2141 2142 movdqa 16*0($in_t), $T0a 2143 movdqa 16*1($in_t), $T0b 2144 movdqa 16*2($in_t), $T0c 2145 movdqa 16*3($in_t), $T0d 2146 movdqa 16*4($in_t), $T0e 2147 movdqa 16*5($in_t), $T0f 2148 lea 16*6($in_t), $in_t 2149 2150 pand $TMP0, $T0a 2151 pand $TMP0, $T0b 2152 por $T0a, $Ra 2153 pand $TMP0, $T0c 2154 por $T0b, $Rb 2155 pand $TMP0, $T0d 2156 por $T0c, $Rc 2157 pand $TMP0, $T0e 2158 por $T0d, $Rd 2159 pand $TMP0, $T0f 2160 por $T0e, $Re 2161 por $T0f, $Rf 2162 2163 dec %rax 2164 jnz .Lselect_loop_sse_w5 2165 2166 movdqu $Ra, 16*0($val) 2167 movdqu $Rb, 16*1($val) 2168 movdqu $Rc, 16*2($val) 2169 movdqu $Rd, 16*3($val) 2170 movdqu $Re, 16*4($val) 2171 movdqu $Rf, 16*5($val) 2172___ 2173$code.=<<___ if ($win64); 2174 movaps (%rsp), %xmm6 2175 movaps 0x10(%rsp), %xmm7 2176 movaps 0x20(%rsp), %xmm8 2177 movaps 0x30(%rsp), %xmm9 2178 movaps 0x40(%rsp), %xmm10 2179 movaps 0x50(%rsp), %xmm11 2180 movaps 0x60(%rsp), %xmm12 2181 movaps 0x70(%rsp), %xmm13 2182 movaps 0x80(%rsp), %xmm14 2183 movaps 0x90(%rsp), %xmm15 2184 lea 0xa8(%rsp), %rsp 2185___ 2186$code.=<<___; 2187 ret 2188.cfi_endproc 2189.LSEH_end_ecp_nistz256_select_w5: 2190.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 2191 2192################################################################################ 2193# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); 2194.globl ecp_nistz256_select_w7 2195.type ecp_nistz256_select_w7,\@abi-omnipotent 2196.align 32 2197ecp_nistz256_select_w7: 2198.cfi_startproc 2199___ 2200$code.=<<___ if ($avx>1); 2201 leaq OPENSSL_ia32cap_P(%rip), %rax 2202 mov 8(%rax), %rax 2203 test \$`1<<5`, %eax 2204 jnz .Lavx2_select_w7 2205___ 2206$code.=<<___ if ($win64); 2207 lea -0x88(%rsp), %rax 2208.LSEH_begin_ecp_nistz256_select_w7: 2209 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 2210 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 2211 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 2212 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) 2213 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) 2214 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 2215 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 2216 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 2217 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 2218 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 2219 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 2220___ 2221$code.=<<___; 2222 movdqa .LOne(%rip), $M0 2223 movd $index, $INDEX 2224 2225 pxor $Ra, $Ra 2226 pxor $Rb, $Rb 2227 pxor $Rc, $Rc 2228 pxor $Rd, $Rd 2229 2230 movdqa $M0, $ONE 2231 pshufd \$0, $INDEX, $INDEX 2232 mov \$64, %rax 2233 2234.Lselect_loop_sse_w7: 2235 movdqa $M0, 
$TMP0 2236 paddd $ONE, $M0 2237 movdqa 16*0($in_t), $T0a 2238 movdqa 16*1($in_t), $T0b 2239 pcmpeqd $INDEX, $TMP0 2240 movdqa 16*2($in_t), $T0c 2241 movdqa 16*3($in_t), $T0d 2242 lea 16*4($in_t), $in_t 2243 2244 pand $TMP0, $T0a 2245 pand $TMP0, $T0b 2246 por $T0a, $Ra 2247 pand $TMP0, $T0c 2248 por $T0b, $Rb 2249 pand $TMP0, $T0d 2250 por $T0c, $Rc 2251 prefetcht0 255($in_t) 2252 por $T0d, $Rd 2253 2254 dec %rax 2255 jnz .Lselect_loop_sse_w7 2256 2257 movdqu $Ra, 16*0($val) 2258 movdqu $Rb, 16*1($val) 2259 movdqu $Rc, 16*2($val) 2260 movdqu $Rd, 16*3($val) 2261___ 2262$code.=<<___ if ($win64); 2263 movaps (%rsp), %xmm6 2264 movaps 0x10(%rsp), %xmm7 2265 movaps 0x20(%rsp), %xmm8 2266 movaps 0x30(%rsp), %xmm9 2267 movaps 0x40(%rsp), %xmm10 2268 movaps 0x50(%rsp), %xmm11 2269 movaps 0x60(%rsp), %xmm12 2270 movaps 0x70(%rsp), %xmm13 2271 movaps 0x80(%rsp), %xmm14 2272 movaps 0x90(%rsp), %xmm15 2273 lea 0xa8(%rsp), %rsp 2274___ 2275$code.=<<___; 2276 ret 2277.cfi_endproc 2278.LSEH_end_ecp_nistz256_select_w7: 2279.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 2280___ 2281} 2282if ($avx>1) { 2283my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2284my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); 2285my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); 2286my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); 2287 2288$code.=<<___; 2289################################################################################ 2290# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index); 2291.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent 2292.align 32 2293ecp_nistz256_avx2_select_w5: 2294.cfi_startproc 2295.Lavx2_select_w5: 2296 vzeroupper 2297___ 2298$code.=<<___ if ($win64); 2299 lea -0x88(%rsp), %rax 2300 mov %rsp,%r11 2301.LSEH_begin_ecp_nistz256_avx2_select_w5: 2302 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp 2303 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) 2304 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) 2305 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) 2306 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) 2307 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) 2308 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) 2309 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) 2310 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) 2311 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) 2312 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) 2313___ 2314$code.=<<___; 2315 vmovdqa .LTwo(%rip), $TWO 2316 2317 vpxor $Ra, $Ra, $Ra 2318 vpxor $Rb, $Rb, $Rb 2319 vpxor $Rc, $Rc, $Rc 2320 2321 vmovdqa .LOne(%rip), $M0 2322 vmovdqa .LTwo(%rip), $M1 2323 2324 vmovd $index, %xmm1 2325 vpermd $INDEX, $Ra, $INDEX 2326 2327 mov \$8, %rax 2328.Lselect_loop_avx2_w5: 2329 2330 vmovdqa 32*0($in_t), $T0a 2331 vmovdqa 32*1($in_t), $T0b 2332 vmovdqa 32*2($in_t), $T0c 2333 2334 vmovdqa 32*3($in_t), $T1a 2335 vmovdqa 32*4($in_t), $T1b 2336 vmovdqa 32*5($in_t), $T1c 2337 2338 vpcmpeqd $INDEX, $M0, $TMP0 2339 vpcmpeqd $INDEX, $M1, $TMP1 2340 2341 vpaddd $TWO, $M0, $M0 2342 vpaddd $TWO, $M1, $M1 2343 lea 32*6($in_t), $in_t 2344 2345 vpand $TMP0, $T0a, $T0a 2346 vpand $TMP0, $T0b, $T0b 2347 vpand $TMP0, $T0c, $T0c 2348 vpand $TMP1, $T1a, $T1a 2349 vpand $TMP1, $T1b, $T1b 2350 vpand $TMP1, $T1c, $T1c 2351 2352 vpxor $T0a, $Ra, $Ra 2353 vpxor $T0b, $Rb, $Rb 2354 vpxor $T0c, $Rc, $Rc 2355 vpxor $T1a, $Ra, $Ra 2356 vpxor $T1b, $Rb, $Rb 2357 vpxor $T1c, $Rc, $Rc 2358 
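	# Expository note (added): the selection is constant-time with
	# respect to the secret index: every one of the 16 table entries
	# is loaded and masked on every call, and the masks produced by
	# vpcmpeqd are all-ones or all-zero.  Roughly, in C (names are
	# illustrative):
	#	for (i = 1; i <= 16; i++)
	#		acc ^= table[i-1] & (i == idx ? ~0 : 0);
	# (idx == 0 matches no entry, leaving the accumulators zero.)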
2359 dec %rax 2360 jnz .Lselect_loop_avx2_w5 2361 2362 vmovdqu $Ra, 32*0($val) 2363 vmovdqu $Rb, 32*1($val) 2364 vmovdqu $Rc, 32*2($val) 2365 vzeroupper 2366___ 2367$code.=<<___ if ($win64); 2368 movaps (%rsp), %xmm6 2369 movaps 0x10(%rsp), %xmm7 2370 movaps 0x20(%rsp), %xmm8 2371 movaps 0x30(%rsp), %xmm9 2372 movaps 0x40(%rsp), %xmm10 2373 movaps 0x50(%rsp), %xmm11 2374 movaps 0x60(%rsp), %xmm12 2375 movaps 0x70(%rsp), %xmm13 2376 movaps 0x80(%rsp), %xmm14 2377 movaps 0x90(%rsp), %xmm15 2378 lea (%r11), %rsp 2379___ 2380$code.=<<___; 2381 ret 2382.cfi_endproc 2383.LSEH_end_ecp_nistz256_avx2_select_w5: 2384.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 2385___ 2386} 2387if ($avx>1) { 2388my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2389my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); 2390my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); 2391my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); 2392my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); 2393 2394$code.=<<___; 2395 2396################################################################################ 2397# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index); 2398.globl ecp_nistz256_avx2_select_w7 2399.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent 2400.align 32 2401ecp_nistz256_avx2_select_w7: 2402.cfi_startproc 2403.Lavx2_select_w7: 2404 vzeroupper 2405___ 2406$code.=<<___ if ($win64); 2407 mov %rsp,%r11 2408 lea -0x88(%rsp), %rax 2409.LSEH_begin_ecp_nistz256_avx2_select_w7: 2410 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp 2411 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) 2412 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) 2413 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) 2414 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) 2415 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) 2416 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) 2417 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) 2418 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) 2419 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) 2420 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) 2421___ 2422$code.=<<___; 2423 vmovdqa .LThree(%rip), $THREE 2424 2425 vpxor $Ra, $Ra, $Ra 2426 vpxor $Rb, $Rb, $Rb 2427 2428 vmovdqa .LOne(%rip), $M0 2429 vmovdqa .LTwo(%rip), $M1 2430 vmovdqa .LThree(%rip), $M2 2431 2432 vmovd $index, %xmm1 2433 vpermd $INDEX, $Ra, $INDEX 2434 # Skip index = 0, because it is implicitly the point at infinity 2435 2436 mov \$21, %rax 2437.Lselect_loop_avx2_w7: 2438 2439 vmovdqa 32*0($in_t), $T0a 2440 vmovdqa 32*1($in_t), $T0b 2441 2442 vmovdqa 32*2($in_t), $T1a 2443 vmovdqa 32*3($in_t), $T1b 2444 2445 vmovdqa 32*4($in_t), $T2a 2446 vmovdqa 32*5($in_t), $T2b 2447 2448 vpcmpeqd $INDEX, $M0, $TMP0 2449 vpcmpeqd $INDEX, $M1, $TMP1 2450 vpcmpeqd $INDEX, $M2, $TMP2 2451 2452 vpaddd $THREE, $M0, $M0 2453 vpaddd $THREE, $M1, $M1 2454 vpaddd $THREE, $M2, $M2 2455 lea 32*6($in_t), $in_t 2456 2457 vpand $TMP0, $T0a, $T0a 2458 vpand $TMP0, $T0b, $T0b 2459 vpand $TMP1, $T1a, $T1a 2460 vpand $TMP1, $T1b, $T1b 2461 vpand $TMP2, $T2a, $T2a 2462 vpand $TMP2, $T2b, $T2b 2463 2464 vpxor $T0a, $Ra, $Ra 2465 vpxor $T0b, $Rb, $Rb 2466 vpxor $T1a, $Ra, $Ra 2467 vpxor $T1b, $Rb, $Rb 2468 vpxor $T2a, $Ra, $Ra 2469 vpxor $T2b, $Rb, $Rb 2470 2471 dec %rax 2472 jnz .Lselect_loop_avx2_w7 2473 2474 2475 vmovdqa 32*0($in_t), $T0a 2476 vmovdqa 32*1($in_t), $T0b 2477 2478 vpcmpeqd $INDEX, $M0, $TMP0 2479 2480 
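	# Expository note (added): $TMP0 is the mask for the 64th and
	# last table entry; the loop above covered entries 1..63 in 21
	# iterations of three comparisons each, so this tail applies the
	# same mask-and-accumulate step once more outside the loop.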
	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb

	vmovdqu	$Ra, 32*0($val)
	vmovdqu	$Rb, 32*1($val)
	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	(%r11), %rsp
___
$code.=<<___;
	ret
.cfi_endproc
.LSEH_end_ecp_nistz256_avx2_select_w7:
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
} else {
$code.=<<___;
.globl	ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,\@function,3
.align	32
ecp_nistz256_avx2_select_w7:
	.byte	0x0f,0x0b	# ud2
	ret
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
}
{{{
########################################################################
# This block implements the higher-level point_double, point_add and
# point_add_affine routines. The key to performance here is to let the
# out-of-order execution logic overlap computations from the next step
# with tail processing from the current step. By using a tailored
# calling sequence we minimize inter-step overhead and give the
# processor a better shot at overlapping operations...
#
# You will notice that input data is copied to the stack. The trouble is
# that there are no registers to spare for holding the original pointers,
# and reloading them would create undesired dependencies on the
# effective-address calculation paths. In other words, it is all done
# to favour the out-of-order execution logic.
#						<appro@openssl.org>

my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
my ($poly1,$poly3)=($acc6,$acc7);

sub load_for_mul () {
my ($a,$b,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;

"	mov	$b, $src0
	lea	$b, $b_ptr
	mov	8*0+$a, $acc1
	mov	8*1+$a, $acc2
	lea	$bias+$a, $a_ptr
	mov	8*2+$a, $acc3
	mov	8*3+$a, $acc4"
}

sub load_for_sqr () {
my ($a,$src0) = @_;
my $bias = $src0 eq "%rax" ?
0 : -128; 2557 2558" mov 8*0+$a, $src0 2559 mov 8*1+$a, $acc6 2560 lea $bias+$a, $a_ptr 2561 mov 8*2+$a, $acc7 2562 mov 8*3+$a, $acc0" 2563} 2564 2565 { 2566######################################################################## 2567# operate in 4-5-0-1 "name space" that matches multiplication output 2568# 2569my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 2570 2571$code.=<<___; 2572.type __ecp_nistz256_add_toq,\@abi-omnipotent 2573.align 32 2574__ecp_nistz256_add_toq: 2575.cfi_startproc 2576 xor $t4,$t4 2577 add 8*0($b_ptr), $a0 2578 adc 8*1($b_ptr), $a1 2579 mov $a0, $t0 2580 adc 8*2($b_ptr), $a2 2581 adc 8*3($b_ptr), $a3 2582 mov $a1, $t1 2583 adc \$0, $t4 2584 2585 sub \$-1, $a0 2586 mov $a2, $t2 2587 sbb $poly1, $a1 2588 sbb \$0, $a2 2589 mov $a3, $t3 2590 sbb $poly3, $a3 2591 sbb \$0, $t4 2592 2593 cmovc $t0, $a0 2594 cmovc $t1, $a1 2595 mov $a0, 8*0($r_ptr) 2596 cmovc $t2, $a2 2597 mov $a1, 8*1($r_ptr) 2598 cmovc $t3, $a3 2599 mov $a2, 8*2($r_ptr) 2600 mov $a3, 8*3($r_ptr) 2601 2602 ret 2603.cfi_endproc 2604.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq 2605 2606.type __ecp_nistz256_sub_fromq,\@abi-omnipotent 2607.align 32 2608__ecp_nistz256_sub_fromq: 2609.cfi_startproc 2610 sub 8*0($b_ptr), $a0 2611 sbb 8*1($b_ptr), $a1 2612 mov $a0, $t0 2613 sbb 8*2($b_ptr), $a2 2614 sbb 8*3($b_ptr), $a3 2615 mov $a1, $t1 2616 sbb $t4, $t4 2617 2618 add \$-1, $a0 2619 mov $a2, $t2 2620 adc $poly1, $a1 2621 adc \$0, $a2 2622 mov $a3, $t3 2623 adc $poly3, $a3 2624 test $t4, $t4 2625 2626 cmovz $t0, $a0 2627 cmovz $t1, $a1 2628 mov $a0, 8*0($r_ptr) 2629 cmovz $t2, $a2 2630 mov $a1, 8*1($r_ptr) 2631 cmovz $t3, $a3 2632 mov $a2, 8*2($r_ptr) 2633 mov $a3, 8*3($r_ptr) 2634 2635 ret 2636.cfi_endproc 2637.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq 2638 2639.type __ecp_nistz256_subq,\@abi-omnipotent 2640.align 32 2641__ecp_nistz256_subq: 2642.cfi_startproc 2643 sub $a0, $t0 2644 sbb $a1, $t1 2645 mov $t0, $a0 2646 sbb $a2, $t2 2647 sbb $a3, $t3 2648 mov $t1, $a1 2649 sbb $t4, $t4 2650 2651 add \$-1, $t0 2652 mov $t2, $a2 2653 adc $poly1, $t1 2654 adc \$0, $t2 2655 mov $t3, $a3 2656 adc $poly3, $t3 2657 test $t4, $t4 2658 2659 cmovnz $t0, $a0 2660 cmovnz $t1, $a1 2661 cmovnz $t2, $a2 2662 cmovnz $t3, $a3 2663 2664 ret 2665.cfi_endproc 2666.size __ecp_nistz256_subq,.-__ecp_nistz256_subq 2667 2668.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent 2669.align 32 2670__ecp_nistz256_mul_by_2q: 2671.cfi_startproc 2672 xor $t4, $t4 2673 add $a0, $a0 # a0:a3+a0:a3 2674 adc $a1, $a1 2675 mov $a0, $t0 2676 adc $a2, $a2 2677 adc $a3, $a3 2678 mov $a1, $t1 2679 adc \$0, $t4 2680 2681 sub \$-1, $a0 2682 mov $a2, $t2 2683 sbb $poly1, $a1 2684 sbb \$0, $a2 2685 mov $a3, $t3 2686 sbb $poly3, $a3 2687 sbb \$0, $t4 2688 2689 cmovc $t0, $a0 2690 cmovc $t1, $a1 2691 mov $a0, 8*0($r_ptr) 2692 cmovc $t2, $a2 2693 mov $a1, 8*1($r_ptr) 2694 cmovc $t3, $a3 2695 mov $a2, 8*2($r_ptr) 2696 mov $a3, 8*3($r_ptr) 2697 2698 ret 2699.cfi_endproc 2700.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q 2701___ 2702 } 2703sub gen_double () { 2704 my $x = shift; 2705 my ($src0,$sfx,$bias); 2706 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); 2707 2708 if ($x ne "x") { 2709 $src0 = "%rax"; 2710 $sfx = ""; 2711 $bias = 0; 2712 2713$code.=<<___; 2714.globl ecp_nistz256_point_double 2715.type ecp_nistz256_point_double,\@function,2 2716.align 32 2717ecp_nistz256_point_double: 2718.cfi_startproc 2719___ 2720$code.=<<___ if ($addx); 2721 leaq OPENSSL_ia32cap_P(%rip), %rcx 2722 mov 8(%rcx), %rcx 2723 and 
\$0x80100, %ecx 2724 cmp \$0x80100, %ecx 2725 je .Lpoint_doublex 2726___ 2727 } else { 2728 $src0 = "%rdx"; 2729 $sfx = "x"; 2730 $bias = 128; 2731 2732$code.=<<___; 2733.type ecp_nistz256_point_doublex,\@function,2 2734.align 32 2735ecp_nistz256_point_doublex: 2736.cfi_startproc 2737.Lpoint_doublex: 2738___ 2739 } 2740$code.=<<___; 2741 push %rbp 2742.cfi_push %rbp 2743 push %rbx 2744.cfi_push %rbx 2745 push %r12 2746.cfi_push %r12 2747 push %r13 2748.cfi_push %r13 2749 push %r14 2750.cfi_push %r14 2751 push %r15 2752.cfi_push %r15 2753 sub \$32*5+8, %rsp 2754.cfi_adjust_cfa_offset 32*5+8 2755.Lpoint_double${x}_body: 2756 2757.Lpoint_double_shortcut$x: 2758 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x 2759 mov $a_ptr, $b_ptr # backup copy 2760 movdqu 0x10($a_ptr), %xmm1 2761 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order 2762 mov 0x20+8*1($a_ptr), $acc5 2763 mov 0x20+8*2($a_ptr), $acc0 2764 mov 0x20+8*3($a_ptr), $acc1 2765 mov .Lpoly+8*1(%rip), $poly1 2766 mov .Lpoly+8*3(%rip), $poly3 2767 movdqa %xmm0, $in_x(%rsp) 2768 movdqa %xmm1, $in_x+0x10(%rsp) 2769 lea 0x20($r_ptr), $acc2 2770 lea 0x40($r_ptr), $acc3 2771 movq $r_ptr, %xmm0 2772 movq $acc2, %xmm1 2773 movq $acc3, %xmm2 2774 2775 lea $S(%rsp), $r_ptr 2776 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); 2777 2778 mov 0x40+8*0($a_ptr), $src0 2779 mov 0x40+8*1($a_ptr), $acc6 2780 mov 0x40+8*2($a_ptr), $acc7 2781 mov 0x40+8*3($a_ptr), $acc0 2782 lea 0x40-$bias($a_ptr), $a_ptr 2783 lea $Zsqr(%rsp), $r_ptr 2784 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); 2785 2786 `&load_for_sqr("$S(%rsp)", "$src0")` 2787 lea $S(%rsp), $r_ptr 2788 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); 2789 2790 mov 0x20($b_ptr), $src0 # $b_ptr is still valid 2791 mov 0x40+8*0($b_ptr), $acc1 2792 mov 0x40+8*1($b_ptr), $acc2 2793 mov 0x40+8*2($b_ptr), $acc3 2794 mov 0x40+8*3($b_ptr), $acc4 2795 lea 0x40-$bias($b_ptr), $a_ptr 2796 lea 0x20($b_ptr), $b_ptr 2797 movq %xmm2, $r_ptr 2798 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); 2799 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); 2800 2801 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 2802 mov $in_x+8*1(%rsp), $acc5 2803 lea $Zsqr(%rsp), $b_ptr 2804 mov $in_x+8*2(%rsp), $acc0 2805 mov $in_x+8*3(%rsp), $acc1 2806 lea $M(%rsp), $r_ptr 2807 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); 2808 2809 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 2810 mov $in_x+8*1(%rsp), $acc5 2811 lea $Zsqr(%rsp), $b_ptr 2812 mov $in_x+8*2(%rsp), $acc0 2813 mov $in_x+8*3(%rsp), $acc1 2814 lea $Zsqr(%rsp), $r_ptr 2815 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); 2816 2817 `&load_for_sqr("$S(%rsp)", "$src0")` 2818 movq %xmm1, $r_ptr 2819 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); 2820___ 2821{ 2822######## ecp_nistz256_div_by_2(res_y, res_y); ########################## 2823# operate in 4-5-6-7 "name space" that matches squaring output 2824# 2825my ($poly1,$poly3)=($a_ptr,$t1); 2826my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); 2827 2828$code.=<<___; 2829 xor $t4, $t4 2830 mov $a0, $t0 2831 add \$-1, $a0 2832 mov $a1, $t1 2833 adc $poly1, $a1 2834 mov $a2, $t2 2835 adc \$0, $a2 2836 mov $a3, $t3 2837 adc $poly3, $a3 2838 adc \$0, $t4 2839 xor $a_ptr, $a_ptr # borrow $a_ptr 2840 test \$1, $t0 2841 2842 cmovz $t0, $a0 2843 cmovz $t1, $a1 2844 cmovz $t2, $a2 2845 cmovz $t3, $a3 2846 cmovz $a_ptr, $t4 2847 2848 mov $a1, $t0 # a0:a3>>1 2849 shr \$1, $a0 2850 shl \$63, $t0 2851 mov $a2, 
$t1 2852 shr \$1, $a1 2853 or $t0, $a0 2854 shl \$63, $t1 2855 mov $a3, $t2 2856 shr \$1, $a2 2857 or $t1, $a1 2858 shl \$63, $t2 2859 mov $a0, 8*0($r_ptr) 2860 shr \$1, $a3 2861 mov $a1, 8*1($r_ptr) 2862 shl \$63, $t4 2863 or $t2, $a2 2864 or $t4, $a3 2865 mov $a2, 8*2($r_ptr) 2866 mov $a3, 8*3($r_ptr) 2867___ 2868} 2869$code.=<<___; 2870 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` 2871 lea $M(%rsp), $r_ptr 2872 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); 2873 2874 lea $tmp0(%rsp), $r_ptr 2875 call __ecp_nistz256_mul_by_2$x 2876 2877 lea $M(%rsp), $b_ptr 2878 lea $M(%rsp), $r_ptr 2879 call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); 2880 2881 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` 2882 lea $S(%rsp), $r_ptr 2883 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); 2884 2885 lea $tmp0(%rsp), $r_ptr 2886 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); 2887 2888 `&load_for_sqr("$M(%rsp)", "$src0")` 2889 movq %xmm0, $r_ptr 2890 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); 2891 2892 lea $tmp0(%rsp), $b_ptr 2893 mov $acc6, $acc0 # harmonize sqr output and sub input 2894 mov $acc7, $acc1 2895 mov $a_ptr, $poly1 2896 mov $t1, $poly3 2897 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); 2898 2899 mov $S+8*0(%rsp), $t0 2900 mov $S+8*1(%rsp), $t1 2901 mov $S+8*2(%rsp), $t2 2902 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order 2903 lea $S(%rsp), $r_ptr 2904 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); 2905 2906 mov $M(%rsp), $src0 2907 lea $M(%rsp), $b_ptr 2908 mov $acc4, $acc6 # harmonize sub output and mul input 2909 xor %ecx, %ecx 2910 mov $acc4, $S+8*0(%rsp) # have to save:-( 2911 mov $acc5, $acc2 2912 mov $acc5, $S+8*1(%rsp) 2913 cmovz $acc0, $acc3 2914 mov $acc0, $S+8*2(%rsp) 2915 lea $S-$bias(%rsp), $a_ptr 2916 cmovz $acc1, $acc4 2917 mov $acc1, $S+8*3(%rsp) 2918 mov $acc6, $acc1 2919 lea $S(%rsp), $r_ptr 2920 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); 2921 2922 movq %xmm1, $b_ptr 2923 movq %xmm1, $r_ptr 2924 call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); 2925 2926 lea 32*5+56(%rsp), %rsi 2927.cfi_def_cfa %rsi,8 2928 mov -48(%rsi),%r15 2929.cfi_restore %r15 2930 mov -40(%rsi),%r14 2931.cfi_restore %r14 2932 mov -32(%rsi),%r13 2933.cfi_restore %r13 2934 mov -24(%rsi),%r12 2935.cfi_restore %r12 2936 mov -16(%rsi),%rbx 2937.cfi_restore %rbx 2938 mov -8(%rsi),%rbp 2939.cfi_restore %rbp 2940 lea (%rsi),%rsp 2941.cfi_def_cfa_register %rsp 2942.Lpoint_double${x}_epilogue: 2943 ret 2944.cfi_endproc 2945.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx 2946___ 2947} 2948&gen_double("q"); 2949 2950sub gen_add () { 2951 my $x = shift; 2952 my ($src0,$sfx,$bias); 2953 my ($H,$Hsqr,$R,$Rsqr,$Hcub, 2954 $U1,$U2,$S1,$S2, 2955 $res_x,$res_y,$res_z, 2956 $in1_x,$in1_y,$in1_z, 2957 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); 2958 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 2959 2960 if ($x ne "x") { 2961 $src0 = "%rax"; 2962 $sfx = ""; 2963 $bias = 0; 2964 2965$code.=<<___; 2966.globl ecp_nistz256_point_add 2967.type ecp_nistz256_point_add,\@function,3 2968.align 32 2969ecp_nistz256_point_add: 2970.cfi_startproc 2971___ 2972$code.=<<___ if ($addx); 2973 leaq OPENSSL_ia32cap_P(%rip), %rcx 2974 mov 8(%rcx), %rcx 2975 and \$0x80100, %ecx 2976 cmp \$0x80100, %ecx 2977 je .Lpoint_addx 2978___ 2979 } else { 2980 $src0 = "%rdx"; 2981 $sfx = "x"; 2982 $bias = 128; 2983 2984$code.=<<___; 2985.type ecp_nistz256_point_addx,\@function,3 2986.align 32 2987ecp_nistz256_point_addx: 2988.cfi_startproc 
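	# ADX code path: same point-addition algorithm as ecp_nistz256_point_add
	# above, but built on the mulx/adcx/adox field subroutines; control
	# arrives here via the OPENSSL_ia32cap_P capability check in the
	# generic entry point.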
2989.Lpoint_addx: 2990___ 2991 } 2992$code.=<<___; 2993 push %rbp 2994.cfi_push %rbp 2995 push %rbx 2996.cfi_push %rbx 2997 push %r12 2998.cfi_push %r12 2999 push %r13 3000.cfi_push %r13 3001 push %r14 3002.cfi_push %r14 3003 push %r15 3004.cfi_push %r15 3005 sub \$32*18+8, %rsp 3006.cfi_adjust_cfa_offset 32*18+8 3007.Lpoint_add${x}_body: 3008 3009 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 3010 movdqu 0x10($a_ptr), %xmm1 3011 movdqu 0x20($a_ptr), %xmm2 3012 movdqu 0x30($a_ptr), %xmm3 3013 movdqu 0x40($a_ptr), %xmm4 3014 movdqu 0x50($a_ptr), %xmm5 3015 mov $a_ptr, $b_ptr # reassign 3016 mov $b_org, $a_ptr # reassign 3017 movdqa %xmm0, $in1_x(%rsp) 3018 movdqa %xmm1, $in1_x+0x10(%rsp) 3019 movdqa %xmm2, $in1_y(%rsp) 3020 movdqa %xmm3, $in1_y+0x10(%rsp) 3021 movdqa %xmm4, $in1_z(%rsp) 3022 movdqa %xmm5, $in1_z+0x10(%rsp) 3023 por %xmm4, %xmm5 3024 3025 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr 3026 pshufd \$0xb1, %xmm5, %xmm3 3027 movdqu 0x10($a_ptr), %xmm1 3028 movdqu 0x20($a_ptr), %xmm2 3029 por %xmm3, %xmm5 3030 movdqu 0x30($a_ptr), %xmm3 3031 mov 0x40+8*0($a_ptr), $src0 # load original in2_z 3032 mov 0x40+8*1($a_ptr), $acc6 3033 mov 0x40+8*2($a_ptr), $acc7 3034 mov 0x40+8*3($a_ptr), $acc0 3035 movdqa %xmm0, $in2_x(%rsp) 3036 pshufd \$0x1e, %xmm5, %xmm4 3037 movdqa %xmm1, $in2_x+0x10(%rsp) 3038 movdqu 0x40($a_ptr),%xmm0 # in2_z again 3039 movdqu 0x50($a_ptr),%xmm1 3040 movdqa %xmm2, $in2_y(%rsp) 3041 movdqa %xmm3, $in2_y+0x10(%rsp) 3042 por %xmm4, %xmm5 3043 pxor %xmm4, %xmm4 3044 por %xmm0, %xmm1 3045 movq $r_ptr, %xmm0 # save $r_ptr 3046 3047 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 3048 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy 3049 mov $acc6, $in2_z+8*1(%rsp) 3050 mov $acc7, $in2_z+8*2(%rsp) 3051 mov $acc0, $in2_z+8*3(%rsp) 3052 lea $Z2sqr(%rsp), $r_ptr # Z2^2 3053 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); 3054 3055 pcmpeqd %xmm4, %xmm5 3056 pshufd \$0xb1, %xmm1, %xmm4 3057 por %xmm1, %xmm4 3058 pshufd \$0, %xmm5, %xmm5 # in1infty 3059 pshufd \$0x1e, %xmm4, %xmm3 3060 por %xmm3, %xmm4 3061 pxor %xmm3, %xmm3 3062 pcmpeqd %xmm3, %xmm4 3063 pshufd \$0, %xmm4, %xmm4 # in2infty 3064 mov 0x40+8*0($b_ptr), $src0 # load original in1_z 3065 mov 0x40+8*1($b_ptr), $acc6 3066 mov 0x40+8*2($b_ptr), $acc7 3067 mov 0x40+8*3($b_ptr), $acc0 3068 movq $b_ptr, %xmm1 3069 3070 lea 0x40-$bias($b_ptr), $a_ptr 3071 lea $Z1sqr(%rsp), $r_ptr # Z1^2 3072 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 3073 3074 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` 3075 lea $S1(%rsp), $r_ptr # S1 = Z2^3 3076 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); 3077 3078 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 3079 lea $S2(%rsp), $r_ptr # S2 = Z1^3 3080 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 3081 3082 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` 3083 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 3084 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); 3085 3086 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 3087 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 3088 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 3089 3090 lea $S1(%rsp), $b_ptr 3091 lea $R(%rsp), $r_ptr # R = S2 - S1 3092 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); 3093 3094 or $acc5, $acc4 # see if result is zero 3095 movdqa %xmm4, %xmm2 3096 or $acc0, $acc4 3097 or $acc1, $acc4 3098 por %xmm5, %xmm2 # in1infty || in2infty 3099 movq $acc4, %xmm3 3100 3101 
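	# The limbs of R were just OR-ed together and stashed in %xmm3
	# (nonzero iff S1 != S2), while %xmm2 holds in1infty || in2infty.
	# Both feed the exceptional-case branch taken after H = U2 - U1 is
	# computed below: H == 0 with neither input at infinity means either
	# the doubling case (S1 == S2) or a result of the point at infinity.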
`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` 3102 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 3103 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); 3104 3105 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` 3106 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 3107 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); 3108 3109 lea $U1(%rsp), $b_ptr 3110 lea $H(%rsp), $r_ptr # H = U2 - U1 3111 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); 3112 3113 or $acc5, $acc4 # see if result is zero 3114 or $acc0, $acc4 3115 or $acc1, $acc4 3116 3117 .byte 0x3e # predict taken 3118 jnz .Ladd_proceed$x # is_equal(U1,U2)? 3119 movq %xmm2, $acc0 3120 movq %xmm3, $acc1 3121 test $acc0, $acc0 3122 jnz .Ladd_proceed$x # (in1infty || in2infty)? 3123 test $acc1, $acc1 3124 jz .Ladd_double$x # is_equal(S1,S2)? 3125 3126 movq %xmm0, $r_ptr # restore $r_ptr 3127 pxor %xmm0, %xmm0 3128 movdqu %xmm0, 0x00($r_ptr) 3129 movdqu %xmm0, 0x10($r_ptr) 3130 movdqu %xmm0, 0x20($r_ptr) 3131 movdqu %xmm0, 0x30($r_ptr) 3132 movdqu %xmm0, 0x40($r_ptr) 3133 movdqu %xmm0, 0x50($r_ptr) 3134 jmp .Ladd_done$x 3135 3136.align 32 3137.Ladd_double$x: 3138 movq %xmm1, $a_ptr # restore $a_ptr 3139 movq %xmm0, $r_ptr # restore $r_ptr 3140 add \$`32*(18-5)`, %rsp # difference in frame sizes 3141.cfi_adjust_cfa_offset `-32*(18-5)` 3142 jmp .Lpoint_double_shortcut$x 3143.cfi_adjust_cfa_offset `32*(18-5)` 3144 3145.align 32 3146.Ladd_proceed$x: 3147 `&load_for_sqr("$R(%rsp)", "$src0")` 3148 lea $Rsqr(%rsp), $r_ptr # R^2 3149 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 3150 3151 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 3152 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3153 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 3154 3155 `&load_for_sqr("$H(%rsp)", "$src0")` 3156 lea $Hsqr(%rsp), $r_ptr # H^2 3157 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 3158 3159 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` 3160 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3161 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); 3162 3163 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` 3164 lea $Hcub(%rsp), $r_ptr # H^3 3165 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 3166 3167 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` 3168 lea $U2(%rsp), $r_ptr # U1*H^2 3169 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); 3170___ 3171{ 3172####################################################################### 3173# operate in 4-5-0-1 "name space" that matches multiplication output 3174# 3175my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3176my ($poly1, $poly3)=($acc6,$acc7); 3177 3178$code.=<<___; 3179 #lea $U2(%rsp), $a_ptr 3180 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 3181 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 3182 3183 xor $t4, $t4 3184 add $acc0, $acc0 # a0:a3+a0:a3 3185 lea $Rsqr(%rsp), $a_ptr 3186 adc $acc1, $acc1 3187 mov $acc0, $t0 3188 adc $acc2, $acc2 3189 adc $acc3, $acc3 3190 mov $acc1, $t1 3191 adc \$0, $t4 3192 3193 sub \$-1, $acc0 3194 mov $acc2, $t2 3195 sbb $poly1, $acc1 3196 sbb \$0, $acc2 3197 mov $acc3, $t3 3198 sbb $poly3, $acc3 3199 sbb \$0, $t4 3200 3201 cmovc $t0, $acc0 3202 mov 8*0($a_ptr), $t0 3203 cmovc $t1, $acc1 3204 mov 8*1($a_ptr), $t1 3205 cmovc $t2, $acc2 3206 mov 8*2($a_ptr), $t2 3207 cmovc $t3, $acc3 3208 mov 8*3($a_ptr), $t3 3209 3210 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 3211 3212 lea $Hcub(%rsp), $b_ptr 3213 lea 
$res_x(%rsp), $r_ptr 3214 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 3215 3216 mov $U2+8*0(%rsp), $t0 3217 mov $U2+8*1(%rsp), $t1 3218 mov $U2+8*2(%rsp), $t2 3219 mov $U2+8*3(%rsp), $t3 3220 lea $res_y(%rsp), $r_ptr 3221 3222 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); 3223 3224 mov $acc0, 8*0($r_ptr) # save the result, as 3225 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 3226 mov $acc2, 8*2($r_ptr) 3227 mov $acc3, 8*3($r_ptr) 3228___ 3229} 3230$code.=<<___; 3231 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` 3232 lea $S2(%rsp), $r_ptr 3233 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); 3234 3235 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` 3236 lea $res_y(%rsp), $r_ptr 3237 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); 3238 3239 lea $S2(%rsp), $b_ptr 3240 lea $res_y(%rsp), $r_ptr 3241 call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); 3242 3243 movq %xmm0, $r_ptr # restore $r_ptr 3244 3245 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); 3246 movdqa %xmm5, %xmm1 3247 pandn $res_z(%rsp), %xmm0 3248 movdqa %xmm5, %xmm2 3249 pandn $res_z+0x10(%rsp), %xmm1 3250 movdqa %xmm5, %xmm3 3251 pand $in2_z(%rsp), %xmm2 3252 pand $in2_z+0x10(%rsp), %xmm3 3253 por %xmm0, %xmm2 3254 por %xmm1, %xmm3 3255 3256 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 3257 movdqa %xmm4, %xmm1 3258 pandn %xmm2, %xmm0 3259 movdqa %xmm4, %xmm2 3260 pandn %xmm3, %xmm1 3261 movdqa %xmm4, %xmm3 3262 pand $in1_z(%rsp), %xmm2 3263 pand $in1_z+0x10(%rsp), %xmm3 3264 por %xmm0, %xmm2 3265 por %xmm1, %xmm3 3266 movdqu %xmm2, 0x40($r_ptr) 3267 movdqu %xmm3, 0x50($r_ptr) 3268 3269 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 3270 movdqa %xmm5, %xmm1 3271 pandn $res_x(%rsp), %xmm0 3272 movdqa %xmm5, %xmm2 3273 pandn $res_x+0x10(%rsp), %xmm1 3274 movdqa %xmm5, %xmm3 3275 pand $in2_x(%rsp), %xmm2 3276 pand $in2_x+0x10(%rsp), %xmm3 3277 por %xmm0, %xmm2 3278 por %xmm1, %xmm3 3279 3280 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 3281 movdqa %xmm4, %xmm1 3282 pandn %xmm2, %xmm0 3283 movdqa %xmm4, %xmm2 3284 pandn %xmm3, %xmm1 3285 movdqa %xmm4, %xmm3 3286 pand $in1_x(%rsp), %xmm2 3287 pand $in1_x+0x10(%rsp), %xmm3 3288 por %xmm0, %xmm2 3289 por %xmm1, %xmm3 3290 movdqu %xmm2, 0x00($r_ptr) 3291 movdqu %xmm3, 0x10($r_ptr) 3292 3293 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 3294 movdqa %xmm5, %xmm1 3295 pandn $res_y(%rsp), %xmm0 3296 movdqa %xmm5, %xmm2 3297 pandn $res_y+0x10(%rsp), %xmm1 3298 movdqa %xmm5, %xmm3 3299 pand $in2_y(%rsp), %xmm2 3300 pand $in2_y+0x10(%rsp), %xmm3 3301 por %xmm0, %xmm2 3302 por %xmm1, %xmm3 3303 3304 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); 3305 movdqa %xmm4, %xmm1 3306 pandn %xmm2, %xmm0 3307 movdqa %xmm4, %xmm2 3308 pandn %xmm3, %xmm1 3309 movdqa %xmm4, %xmm3 3310 pand $in1_y(%rsp), %xmm2 3311 pand $in1_y+0x10(%rsp), %xmm3 3312 por %xmm0, %xmm2 3313 por %xmm1, %xmm3 3314 movdqu %xmm2, 0x20($r_ptr) 3315 movdqu %xmm3, 0x30($r_ptr) 3316 3317.Ladd_done$x: 3318 lea 32*18+56(%rsp), %rsi 3319.cfi_def_cfa %rsi,8 3320 mov -48(%rsi),%r15 3321.cfi_restore %r15 3322 mov -40(%rsi),%r14 3323.cfi_restore %r14 3324 mov -32(%rsi),%r13 3325.cfi_restore %r13 3326 mov -24(%rsi),%r12 3327.cfi_restore %r12 3328 mov -16(%rsi),%rbx 3329.cfi_restore %rbx 3330 mov -8(%rsi),%rbp 3331.cfi_restore %rbp 3332 lea (%rsi),%rsp 3333.cfi_def_cfa_register %rsp 3334.Lpoint_add${x}_epilogue: 3335 ret 3336.cfi_endproc 3337.size 
ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx 3338___ 3339} 3340&gen_add("q"); 3341 3342sub gen_add_affine () { 3343 my $x = shift; 3344 my ($src0,$sfx,$bias); 3345 my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, 3346 $res_x,$res_y,$res_z, 3347 $in1_x,$in1_y,$in1_z, 3348 $in2_x,$in2_y)=map(32*$_,(0..14)); 3349 my $Z1sqr = $S2; 3350 3351 if ($x ne "x") { 3352 $src0 = "%rax"; 3353 $sfx = ""; 3354 $bias = 0; 3355 3356$code.=<<___; 3357.globl ecp_nistz256_point_add_affine 3358.type ecp_nistz256_point_add_affine,\@function,3 3359.align 32 3360ecp_nistz256_point_add_affine: 3361.cfi_startproc 3362___ 3363$code.=<<___ if ($addx); 3364 leaq OPENSSL_ia32cap_P(%rip), %rcx 3365 mov 8(%rcx), %rcx 3366 and \$0x80100, %ecx 3367 cmp \$0x80100, %ecx 3368 je .Lpoint_add_affinex 3369___ 3370 } else { 3371 $src0 = "%rdx"; 3372 $sfx = "x"; 3373 $bias = 128; 3374 3375$code.=<<___; 3376.type ecp_nistz256_point_add_affinex,\@function,3 3377.align 32 3378ecp_nistz256_point_add_affinex: 3379.cfi_startproc 3380.Lpoint_add_affinex: 3381___ 3382 } 3383$code.=<<___; 3384 push %rbp 3385.cfi_push %rbp 3386 push %rbx 3387.cfi_push %rbx 3388 push %r12 3389.cfi_push %r12 3390 push %r13 3391.cfi_push %r13 3392 push %r14 3393.cfi_push %r14 3394 push %r15 3395.cfi_push %r15 3396 sub \$32*15+8, %rsp 3397.cfi_adjust_cfa_offset 32*15+8 3398.Ladd_affine${x}_body: 3399 3400 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 3401 mov $b_org, $b_ptr # reassign 3402 movdqu 0x10($a_ptr), %xmm1 3403 movdqu 0x20($a_ptr), %xmm2 3404 movdqu 0x30($a_ptr), %xmm3 3405 movdqu 0x40($a_ptr), %xmm4 3406 movdqu 0x50($a_ptr), %xmm5 3407 mov 0x40+8*0($a_ptr), $src0 # load original in1_z 3408 mov 0x40+8*1($a_ptr), $acc6 3409 mov 0x40+8*2($a_ptr), $acc7 3410 mov 0x40+8*3($a_ptr), $acc0 3411 movdqa %xmm0, $in1_x(%rsp) 3412 movdqa %xmm1, $in1_x+0x10(%rsp) 3413 movdqa %xmm2, $in1_y(%rsp) 3414 movdqa %xmm3, $in1_y+0x10(%rsp) 3415 movdqa %xmm4, $in1_z(%rsp) 3416 movdqa %xmm5, $in1_z+0x10(%rsp) 3417 por %xmm4, %xmm5 3418 3419 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr 3420 pshufd \$0xb1, %xmm5, %xmm3 3421 movdqu 0x10($b_ptr), %xmm1 3422 movdqu 0x20($b_ptr), %xmm2 3423 por %xmm3, %xmm5 3424 movdqu 0x30($b_ptr), %xmm3 3425 movdqa %xmm0, $in2_x(%rsp) 3426 pshufd \$0x1e, %xmm5, %xmm4 3427 movdqa %xmm1, $in2_x+0x10(%rsp) 3428 por %xmm0, %xmm1 3429 movq $r_ptr, %xmm0 # save $r_ptr 3430 movdqa %xmm2, $in2_y(%rsp) 3431 movdqa %xmm3, $in2_y+0x10(%rsp) 3432 por %xmm2, %xmm3 3433 por %xmm4, %xmm5 3434 pxor %xmm4, %xmm4 3435 por %xmm1, %xmm3 3436 3437 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 3438 lea $Z1sqr(%rsp), $r_ptr # Z1^2 3439 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 3440 3441 pcmpeqd %xmm4, %xmm5 3442 pshufd \$0xb1, %xmm3, %xmm4 3443 mov 0x00($b_ptr), $src0 # $b_ptr is still valid 3444 #lea 0x00($b_ptr), $b_ptr 3445 mov $acc4, $acc1 # harmonize sqr output and mul input 3446 por %xmm3, %xmm4 3447 pshufd \$0, %xmm5, %xmm5 # in1infty 3448 pshufd \$0x1e, %xmm4, %xmm3 3449 mov $acc5, $acc2 3450 por %xmm3, %xmm4 3451 pxor %xmm3, %xmm3 3452 mov $acc6, $acc3 3453 pcmpeqd %xmm3, %xmm4 3454 pshufd \$0, %xmm4, %xmm4 # in2infty 3455 3456 lea $Z1sqr-$bias(%rsp), $a_ptr 3457 mov $acc7, $acc4 3458 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 3459 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); 3460 3461 lea $in1_x(%rsp), $b_ptr 3462 lea $H(%rsp), $r_ptr # H = U2 - U1 3463 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); 3464 3465 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 3466 lea 
$S2(%rsp), $r_ptr # S2 = Z1^3 3467 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 3468 3469 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 3470 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3471 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 3472 3473 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 3474 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 3475 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 3476 3477 lea $in1_y(%rsp), $b_ptr 3478 lea $R(%rsp), $r_ptr # R = S2 - S1 3479 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); 3480 3481 `&load_for_sqr("$H(%rsp)", "$src0")` 3482 lea $Hsqr(%rsp), $r_ptr # H^2 3483 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 3484 3485 `&load_for_sqr("$R(%rsp)", "$src0")` 3486 lea $Rsqr(%rsp), $r_ptr # R^2 3487 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 3488 3489 `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` 3490 lea $Hcub(%rsp), $r_ptr # H^3 3491 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 3492 3493 `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` 3494 lea $U2(%rsp), $r_ptr # U1*H^2 3495 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); 3496___ 3497{ 3498####################################################################### 3499# operate in 4-5-0-1 "name space" that matches multiplication output 3500# 3501my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3502my ($poly1, $poly3)=($acc6,$acc7); 3503 3504$code.=<<___; 3505 #lea $U2(%rsp), $a_ptr 3506 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 3507 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 3508 3509 xor $t4, $t4 3510 add $acc0, $acc0 # a0:a3+a0:a3 3511 lea $Rsqr(%rsp), $a_ptr 3512 adc $acc1, $acc1 3513 mov $acc0, $t0 3514 adc $acc2, $acc2 3515 adc $acc3, $acc3 3516 mov $acc1, $t1 3517 adc \$0, $t4 3518 3519 sub \$-1, $acc0 3520 mov $acc2, $t2 3521 sbb $poly1, $acc1 3522 sbb \$0, $acc2 3523 mov $acc3, $t3 3524 sbb $poly3, $acc3 3525 sbb \$0, $t4 3526 3527 cmovc $t0, $acc0 3528 mov 8*0($a_ptr), $t0 3529 cmovc $t1, $acc1 3530 mov 8*1($a_ptr), $t1 3531 cmovc $t2, $acc2 3532 mov 8*2($a_ptr), $t2 3533 cmovc $t3, $acc3 3534 mov 8*3($a_ptr), $t3 3535 3536 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 3537 3538 lea $Hcub(%rsp), $b_ptr 3539 lea $res_x(%rsp), $r_ptr 3540 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 3541 3542 mov $U2+8*0(%rsp), $t0 3543 mov $U2+8*1(%rsp), $t1 3544 mov $U2+8*2(%rsp), $t2 3545 mov $U2+8*3(%rsp), $t3 3546 lea $H(%rsp), $r_ptr 3547 3548 call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); 3549 3550 mov $acc0, 8*0($r_ptr) # save the result, as 3551 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 3552 mov $acc2, 8*2($r_ptr) 3553 mov $acc3, 8*3($r_ptr) 3554___ 3555} 3556$code.=<<___; 3557 `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` 3558 lea $S2(%rsp), $r_ptr 3559 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); 3560 3561 `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` 3562 lea $H(%rsp), $r_ptr 3563 call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); 3564 3565 lea $S2(%rsp), $b_ptr 3566 lea $res_y(%rsp), $r_ptr 3567 call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); 3568 3569 movq %xmm0, $r_ptr # restore $r_ptr 3570 3571 movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); 3572 movdqa %xmm5, %xmm1 3573 pandn $res_z(%rsp), %xmm0 3574 movdqa %xmm5, %xmm2 3575 pandn $res_z+0x10(%rsp), %xmm1 3576 movdqa %xmm5, %xmm3 3577 pand .LONE_mont(%rip), %xmm2 
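	# in1infty selects 1 in Montgomery form for the result Z coordinate,
	# so when the first input is the point at infinity the output becomes
	# (X2, Y2, 1), i.e. the affine second input promoted to projective.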
3578 pand .LONE_mont+0x10(%rip), %xmm3 3579 por %xmm0, %xmm2 3580 por %xmm1, %xmm3 3581 3582 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 3583 movdqa %xmm4, %xmm1 3584 pandn %xmm2, %xmm0 3585 movdqa %xmm4, %xmm2 3586 pandn %xmm3, %xmm1 3587 movdqa %xmm4, %xmm3 3588 pand $in1_z(%rsp), %xmm2 3589 pand $in1_z+0x10(%rsp), %xmm3 3590 por %xmm0, %xmm2 3591 por %xmm1, %xmm3 3592 movdqu %xmm2, 0x40($r_ptr) 3593 movdqu %xmm3, 0x50($r_ptr) 3594 3595 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 3596 movdqa %xmm5, %xmm1 3597 pandn $res_x(%rsp), %xmm0 3598 movdqa %xmm5, %xmm2 3599 pandn $res_x+0x10(%rsp), %xmm1 3600 movdqa %xmm5, %xmm3 3601 pand $in2_x(%rsp), %xmm2 3602 pand $in2_x+0x10(%rsp), %xmm3 3603 por %xmm0, %xmm2 3604 por %xmm1, %xmm3 3605 3606 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 3607 movdqa %xmm4, %xmm1 3608 pandn %xmm2, %xmm0 3609 movdqa %xmm4, %xmm2 3610 pandn %xmm3, %xmm1 3611 movdqa %xmm4, %xmm3 3612 pand $in1_x(%rsp), %xmm2 3613 pand $in1_x+0x10(%rsp), %xmm3 3614 por %xmm0, %xmm2 3615 por %xmm1, %xmm3 3616 movdqu %xmm2, 0x00($r_ptr) 3617 movdqu %xmm3, 0x10($r_ptr) 3618 3619 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 3620 movdqa %xmm5, %xmm1 3621 pandn $res_y(%rsp), %xmm0 3622 movdqa %xmm5, %xmm2 3623 pandn $res_y+0x10(%rsp), %xmm1 3624 movdqa %xmm5, %xmm3 3625 pand $in2_y(%rsp), %xmm2 3626 pand $in2_y+0x10(%rsp), %xmm3 3627 por %xmm0, %xmm2 3628 por %xmm1, %xmm3 3629 3630 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); 3631 movdqa %xmm4, %xmm1 3632 pandn %xmm2, %xmm0 3633 movdqa %xmm4, %xmm2 3634 pandn %xmm3, %xmm1 3635 movdqa %xmm4, %xmm3 3636 pand $in1_y(%rsp), %xmm2 3637 pand $in1_y+0x10(%rsp), %xmm3 3638 por %xmm0, %xmm2 3639 por %xmm1, %xmm3 3640 movdqu %xmm2, 0x20($r_ptr) 3641 movdqu %xmm3, 0x30($r_ptr) 3642 3643 lea 32*15+56(%rsp), %rsi 3644.cfi_def_cfa %rsi,8 3645 mov -48(%rsi),%r15 3646.cfi_restore %r15 3647 mov -40(%rsi),%r14 3648.cfi_restore %r14 3649 mov -32(%rsi),%r13 3650.cfi_restore %r13 3651 mov -24(%rsi),%r12 3652.cfi_restore %r12 3653 mov -16(%rsi),%rbx 3654.cfi_restore %rbx 3655 mov -8(%rsi),%rbp 3656.cfi_restore %rbp 3657 lea (%rsi),%rsp 3658.cfi_def_cfa_register %rsp 3659.Ladd_affine${x}_epilogue: 3660 ret 3661.cfi_endproc 3662.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx 3663___ 3664} 3665&gen_add_affine("q"); 3666 3667######################################################################## 3668# AD*X magic 3669# 3670if ($addx) { { 3671######################################################################## 3672# operate in 4-5-0-1 "name space" that matches multiplication output 3673# 3674my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3675 3676$code.=<<___; 3677.type __ecp_nistz256_add_tox,\@abi-omnipotent 3678.align 32 3679__ecp_nistz256_add_tox: 3680.cfi_startproc 3681 xor $t4, $t4 3682 adc 8*0($b_ptr), $a0 3683 adc 8*1($b_ptr), $a1 3684 mov $a0, $t0 3685 adc 8*2($b_ptr), $a2 3686 adc 8*3($b_ptr), $a3 3687 mov $a1, $t1 3688 adc \$0, $t4 3689 3690 xor $t3, $t3 3691 sbb \$-1, $a0 3692 mov $a2, $t2 3693 sbb $poly1, $a1 3694 sbb \$0, $a2 3695 mov $a3, $t3 3696 sbb $poly3, $a3 3697 sbb \$0, $t4 3698 3699 cmovc $t0, $a0 3700 cmovc $t1, $a1 3701 mov $a0, 8*0($r_ptr) 3702 cmovc $t2, $a2 3703 mov $a1, 8*1($r_ptr) 3704 cmovc $t3, $a3 3705 mov $a2, 8*2($r_ptr) 3706 mov $a3, 8*3($r_ptr) 3707 3708 ret 3709.cfi_endproc 3710.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox 3711 3712.type 
__ecp_nistz256_sub_fromx,\@abi-omnipotent 3713.align 32 3714__ecp_nistz256_sub_fromx: 3715.cfi_startproc 3716 xor $t4, $t4 3717 sbb 8*0($b_ptr), $a0 3718 sbb 8*1($b_ptr), $a1 3719 mov $a0, $t0 3720 sbb 8*2($b_ptr), $a2 3721 sbb 8*3($b_ptr), $a3 3722 mov $a1, $t1 3723 sbb \$0, $t4 3724 3725 xor $t3, $t3 3726 adc \$-1, $a0 3727 mov $a2, $t2 3728 adc $poly1, $a1 3729 adc \$0, $a2 3730 mov $a3, $t3 3731 adc $poly3, $a3 3732 3733 bt \$0, $t4 3734 cmovnc $t0, $a0 3735 cmovnc $t1, $a1 3736 mov $a0, 8*0($r_ptr) 3737 cmovnc $t2, $a2 3738 mov $a1, 8*1($r_ptr) 3739 cmovnc $t3, $a3 3740 mov $a2, 8*2($r_ptr) 3741 mov $a3, 8*3($r_ptr) 3742 3743 ret 3744.cfi_endproc 3745.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx 3746 3747.type __ecp_nistz256_subx,\@abi-omnipotent 3748.align 32 3749__ecp_nistz256_subx: 3750.cfi_startproc 3751 xor $t4, $t4 3752 sbb $a0, $t0 3753 sbb $a1, $t1 3754 mov $t0, $a0 3755 sbb $a2, $t2 3756 sbb $a3, $t3 3757 mov $t1, $a1 3758 sbb \$0, $t4 3759 3760 xor $a3 ,$a3 3761 adc \$-1, $t0 3762 mov $t2, $a2 3763 adc $poly1, $t1 3764 adc \$0, $t2 3765 mov $t3, $a3 3766 adc $poly3, $t3 3767 3768 bt \$0, $t4 3769 cmovc $t0, $a0 3770 cmovc $t1, $a1 3771 cmovc $t2, $a2 3772 cmovc $t3, $a3 3773 3774 ret 3775.cfi_endproc 3776.size __ecp_nistz256_subx,.-__ecp_nistz256_subx 3777 3778.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent 3779.align 32 3780__ecp_nistz256_mul_by_2x: 3781.cfi_startproc 3782 xor $t4, $t4 3783 adc $a0, $a0 # a0:a3+a0:a3 3784 adc $a1, $a1 3785 mov $a0, $t0 3786 adc $a2, $a2 3787 adc $a3, $a3 3788 mov $a1, $t1 3789 adc \$0, $t4 3790 3791 xor $t3, $t3 3792 sbb \$-1, $a0 3793 mov $a2, $t2 3794 sbb $poly1, $a1 3795 sbb \$0, $a2 3796 mov $a3, $t3 3797 sbb $poly3, $a3 3798 sbb \$0, $t4 3799 3800 cmovc $t0, $a0 3801 cmovc $t1, $a1 3802 mov $a0, 8*0($r_ptr) 3803 cmovc $t2, $a2 3804 mov $a1, 8*1($r_ptr) 3805 cmovc $t3, $a3 3806 mov $a2, 8*2($r_ptr) 3807 mov $a3, 8*3($r_ptr) 3808 3809 ret 3810.cfi_endproc 3811.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x 3812___ 3813 } 3814&gen_double("x"); 3815&gen_add("x"); 3816&gen_add_affine("x"); 3817} 3818}}} 3819 3820# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3821# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3822if ($win64) { 3823$rec="%rcx"; 3824$frame="%rdx"; 3825$context="%r8"; 3826$disp="%r9"; 3827 3828$code.=<<___; 3829.extern __imp_RtlVirtualUnwind 3830 3831.type short_handler,\@abi-omnipotent 3832.align 16 3833short_handler: 3834 push %rsi 3835 push %rdi 3836 push %rbx 3837 push %rbp 3838 push %r12 3839 push %r13 3840 push %r14 3841 push %r15 3842 pushfq 3843 sub \$64,%rsp 3844 3845 mov 120($context),%rax # pull context->Rax 3846 mov 248($context),%rbx # pull context->Rip 3847 3848 mov 8($disp),%rsi # disp->ImageBase 3849 mov 56($disp),%r11 # disp->HandlerData 3850 3851 mov 0(%r11),%r10d # HandlerData[0] 3852 lea (%rsi,%r10),%r10 # end of prologue label 3853 cmp %r10,%rbx # context->Rip<end of prologue label 3854 jb .Lcommon_seh_tail 3855 3856 mov 152($context),%rax # pull context->Rsp 3857 3858 mov 4(%r11),%r10d # HandlerData[1] 3859 lea (%rsi,%r10),%r10 # epilogue label 3860 cmp %r10,%rbx # context->Rip>=epilogue label 3861 jae .Lcommon_seh_tail 3862 3863 lea 16(%rax),%rax 3864 3865 mov -8(%rax),%r12 3866 mov -16(%rax),%r13 3867 mov %r12,216($context) # restore context->R12 3868 mov %r13,224($context) # restore context->R13 3869 3870 jmp .Lcommon_seh_tail 3871.size short_handler,.-short_handler 3872 3873.type full_handler,\@abi-omnipotent 3874.align 16 3875full_handler: 3876 push %rsi 3877 
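	# full_handler differs from short_handler above in that it also
	# restores rbp, rbx and r12-r15, using HandlerData[2] to locate the
	# saved registers on the stack; the pushes here merely preserve the
	# handler's own non-volatile state.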
push %rdi 3878 push %rbx 3879 push %rbp 3880 push %r12 3881 push %r13 3882 push %r14 3883 push %r15 3884 pushfq 3885 sub \$64,%rsp 3886 3887 mov 120($context),%rax # pull context->Rax 3888 mov 248($context),%rbx # pull context->Rip 3889 3890 mov 8($disp),%rsi # disp->ImageBase 3891 mov 56($disp),%r11 # disp->HandlerData 3892 3893 mov 0(%r11),%r10d # HandlerData[0] 3894 lea (%rsi,%r10),%r10 # end of prologue label 3895 cmp %r10,%rbx # context->Rip<end of prologue label 3896 jb .Lcommon_seh_tail 3897 3898 mov 152($context),%rax # pull context->Rsp 3899 3900 mov 4(%r11),%r10d # HandlerData[1] 3901 lea (%rsi,%r10),%r10 # epilogue label 3902 cmp %r10,%rbx # context->Rip>=epilogue label 3903 jae .Lcommon_seh_tail 3904 3905 mov 8(%r11),%r10d # HandlerData[2] 3906 lea (%rax,%r10),%rax 3907 3908 mov -8(%rax),%rbp 3909 mov -16(%rax),%rbx 3910 mov -24(%rax),%r12 3911 mov -32(%rax),%r13 3912 mov -40(%rax),%r14 3913 mov -48(%rax),%r15 3914 mov %rbx,144($context) # restore context->Rbx 3915 mov %rbp,160($context) # restore context->Rbp 3916 mov %r12,216($context) # restore context->R12 3917 mov %r13,224($context) # restore context->R13 3918 mov %r14,232($context) # restore context->R14 3919 mov %r15,240($context) # restore context->R15 3920 3921.Lcommon_seh_tail: 3922 mov 8(%rax),%rdi 3923 mov 16(%rax),%rsi 3924 mov %rax,152($context) # restore context->Rsp 3925 mov %rsi,168($context) # restore context->Rsi 3926 mov %rdi,176($context) # restore context->Rdi 3927 3928 mov 40($disp),%rdi # disp->ContextRecord 3929 mov $context,%rsi # context 3930 mov \$154,%ecx # sizeof(CONTEXT) 3931 .long 0xa548f3fc # cld; rep movsq 3932 3933 mov $disp,%rsi 3934 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3935 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3936 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3937 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3938 mov 40(%rsi),%r10 # disp->ContextRecord 3939 lea 56(%rsi),%r11 # &disp->HandlerData 3940 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3941 mov %r10,32(%rsp) # arg5 3942 mov %r11,40(%rsp) # arg6 3943 mov %r12,48(%rsp) # arg7 3944 mov %rcx,56(%rsp) # arg8, (NULL) 3945 call *__imp_RtlVirtualUnwind(%rip) 3946 3947 mov \$1,%eax # ExceptionContinueSearch 3948 add \$64,%rsp 3949 popfq 3950 pop %r15 3951 pop %r14 3952 pop %r13 3953 pop %r12 3954 pop %rbp 3955 pop %rbx 3956 pop %rdi 3957 pop %rsi 3958 ret 3959.size full_handler,.-full_handler 3960 3961.section .pdata 3962.align 4 3963 .rva .LSEH_begin_ecp_nistz256_neg 3964 .rva .LSEH_end_ecp_nistz256_neg 3965 .rva .LSEH_info_ecp_nistz256_neg 3966 3967 .rva .LSEH_begin_ecp_nistz256_ord_mul_mont 3968 .rva .LSEH_end_ecp_nistz256_ord_mul_mont 3969 .rva .LSEH_info_ecp_nistz256_ord_mul_mont 3970 3971 .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont 3972 .rva .LSEH_end_ecp_nistz256_ord_sqr_mont 3973 .rva .LSEH_info_ecp_nistz256_ord_sqr_mont 3974___ 3975$code.=<<___ if ($addx); 3976 .rva .LSEH_begin_ecp_nistz256_ord_mul_montx 3977 .rva .LSEH_end_ecp_nistz256_ord_mul_montx 3978 .rva .LSEH_info_ecp_nistz256_ord_mul_montx 3979 3980 .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx 3981 .rva .LSEH_end_ecp_nistz256_ord_sqr_montx 3982 .rva .LSEH_info_ecp_nistz256_ord_sqr_montx 3983___ 3984$code.=<<___; 3985 .rva .LSEH_begin_ecp_nistz256_mul_mont 3986 .rva .LSEH_end_ecp_nistz256_mul_mont 3987 .rva .LSEH_info_ecp_nistz256_mul_mont 3988 3989 .rva .LSEH_begin_ecp_nistz256_sqr_mont 3990 .rva .LSEH_end_ecp_nistz256_sqr_mont 3991 .rva .LSEH_info_ecp_nistz256_sqr_mont 3992 3993 .rva .LSEH_begin_ecp_nistz256_select_w5 3994 .rva .LSEH_end_ecp_nistz256_select_w5 3995 .rva 
.LSEH_info_ecp_nistz256_select_wX 3996 3997 .rva .LSEH_begin_ecp_nistz256_select_w7 3998 .rva .LSEH_end_ecp_nistz256_select_w7 3999 .rva .LSEH_info_ecp_nistz256_select_wX 4000___ 4001$code.=<<___ if ($avx>1); 4002 .rva .LSEH_begin_ecp_nistz256_avx2_select_w5 4003 .rva .LSEH_end_ecp_nistz256_avx2_select_w5 4004 .rva .LSEH_info_ecp_nistz256_avx2_select_wX 4005 4006 .rva .LSEH_begin_ecp_nistz256_avx2_select_w7 4007 .rva .LSEH_end_ecp_nistz256_avx2_select_w7 4008 .rva .LSEH_info_ecp_nistz256_avx2_select_wX 4009___ 4010$code.=<<___; 4011 .rva .LSEH_begin_ecp_nistz256_point_double 4012 .rva .LSEH_end_ecp_nistz256_point_double 4013 .rva .LSEH_info_ecp_nistz256_point_double 4014 4015 .rva .LSEH_begin_ecp_nistz256_point_add 4016 .rva .LSEH_end_ecp_nistz256_point_add 4017 .rva .LSEH_info_ecp_nistz256_point_add 4018 4019 .rva .LSEH_begin_ecp_nistz256_point_add_affine 4020 .rva .LSEH_end_ecp_nistz256_point_add_affine 4021 .rva .LSEH_info_ecp_nistz256_point_add_affine 4022___ 4023$code.=<<___ if ($addx); 4024 .rva .LSEH_begin_ecp_nistz256_point_doublex 4025 .rva .LSEH_end_ecp_nistz256_point_doublex 4026 .rva .LSEH_info_ecp_nistz256_point_doublex 4027 4028 .rva .LSEH_begin_ecp_nistz256_point_addx 4029 .rva .LSEH_end_ecp_nistz256_point_addx 4030 .rva .LSEH_info_ecp_nistz256_point_addx 4031 4032 .rva .LSEH_begin_ecp_nistz256_point_add_affinex 4033 .rva .LSEH_end_ecp_nistz256_point_add_affinex 4034 .rva .LSEH_info_ecp_nistz256_point_add_affinex 4035___ 4036$code.=<<___; 4037 4038.section .xdata 4039.align 8 4040.LSEH_info_ecp_nistz256_neg: 4041 .byte 9,0,0,0 4042 .rva short_handler 4043 .rva .Lneg_body,.Lneg_epilogue # HandlerData[] 4044.LSEH_info_ecp_nistz256_ord_mul_mont: 4045 .byte 9,0,0,0 4046 .rva full_handler 4047 .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[] 4048 .long 48,0 4049.LSEH_info_ecp_nistz256_ord_sqr_mont: 4050 .byte 9,0,0,0 4051 .rva full_handler 4052 .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[] 4053 .long 48,0 4054___ 4055$code.=<<___ if ($addx); 4056.LSEH_info_ecp_nistz256_ord_mul_montx: 4057 .byte 9,0,0,0 4058 .rva full_handler 4059 .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[] 4060 .long 48,0 4061.LSEH_info_ecp_nistz256_ord_sqr_montx: 4062 .byte 9,0,0,0 4063 .rva full_handler 4064 .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[] 4065 .long 48,0 4066___ 4067$code.=<<___; 4068.LSEH_info_ecp_nistz256_mul_mont: 4069 .byte 9,0,0,0 4070 .rva full_handler 4071 .rva .Lmul_body,.Lmul_epilogue # HandlerData[] 4072 .long 48,0 4073.LSEH_info_ecp_nistz256_sqr_mont: 4074 .byte 9,0,0,0 4075 .rva full_handler 4076 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] 4077 .long 48,0 4078.LSEH_info_ecp_nistz256_select_wX: 4079 .byte 0x01,0x33,0x16,0x00 4080 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 4081 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 4082 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 4083 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 4084 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 4085 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 4086 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 4087 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 4088 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 4089 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 4090 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 4091 .align 8 4092___ 4093$code.=<<___ if ($avx>1); 4094.LSEH_info_ecp_nistz256_avx2_select_wX: 4095 .byte 0x01,0x36,0x17,0x0b 4096 .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 4097 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 
4098 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 4099 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 4100 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 4101 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 4102 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 4103 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 4104 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 4105 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 4106 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 4107 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 4108 .align 8 4109___ 4110$code.=<<___; 4111.LSEH_info_ecp_nistz256_point_double: 4112 .byte 9,0,0,0 4113 .rva full_handler 4114 .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[] 4115 .long 32*5+56,0 4116.LSEH_info_ecp_nistz256_point_add: 4117 .byte 9,0,0,0 4118 .rva full_handler 4119 .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[] 4120 .long 32*18+56,0 4121.LSEH_info_ecp_nistz256_point_add_affine: 4122 .byte 9,0,0,0 4123 .rva full_handler 4124 .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[] 4125 .long 32*15+56,0 4126___ 4127$code.=<<___ if ($addx); 4128.align 8 4129.LSEH_info_ecp_nistz256_point_doublex: 4130 .byte 9,0,0,0 4131 .rva full_handler 4132 .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[] 4133 .long 32*5+56,0 4134.LSEH_info_ecp_nistz256_point_addx: 4135 .byte 9,0,0,0 4136 .rva full_handler 4137 .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[] 4138 .long 32*18+56,0 4139.LSEH_info_ecp_nistz256_point_add_affinex: 4140 .byte 9,0,0,0 4141 .rva full_handler 4142 .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[] 4143 .long 32*15+56,0 4144___ 4145} 4146 4147$code =~ s/\`([^\`]*)\`/eval $1/gem; 4148print $code; 4149close STDOUT; 4150
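
# Note on the substitution a few lines above: every `...` span embedded in
# $code is evaluated as Perl and the result spliced into the assembly stream;
# that is how the &load_for_mul()/&load_for_sqr() snippets inside the heredocs
# are expanded before the text is piped through x86_64-xlate.pl. A minimal
# illustration of the same idiom (hypothetical, not part of this module):
#
#   my $example = "lea `2+2`(%rsp), %rax\n";
#   $example =~ s/\`([^\`]*)\`/eval $1/gem;   # yields "lea 4(%rsp), %rax"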