#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
# Copyright (c) 2015 CloudFlare, Inc.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
# (3) CloudFlare, Inc.
#
# Reference:
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
# 256 Bit Primes"

# Further optimization by <appro@openssl.org>:
#
#		this/original	with/without -DECP_NISTZ256_ASM(*)
# Opteron	+15-49%		+150-195%
# Bulldozer	+18-45%		+175-240%
# P4		+24-46%		+100-150%
# Westmere	+18-34%		+87-160%
# Sandy Bridge	+14-35%		+120-185%
# Ivy Bridge	+11-35%		+125-180%
# Haswell	+10-37%		+160-200%
# Broadwell	+24-58%		+210-270%
# Atom		+20-50%		+180-240%
# VIA Nano	+50-160%	+480-480%
#
# (*) "without -DECP_NISTZ256_ASM" refers to a build with
#     "enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. In the "this/original" column the lower coefficient is
# for ECDSA sign, while in "with/without" it is for ECDH key agreement;
# the higher coefficient is for ECDSA sign, the relatively fastest
# server-side operation. Keep in mind that +100% means a 2x improvement.

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$avx = 2;
$addx = 1;

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

# The polynomial
.align 64
.Lpoly:
.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

.LOne:
.long 1,1,1,1,1,1,1,1
.LTwo:
.long 2,2,2,2,2,2,2,2
.LThree:
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe

# Constants for computations modulo ord(p256)
.Lord:
.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
.LordK:
.quad 0xccd1c8aaee00bc4f
___

{
my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");

$code.=<<___;

################################################################################
# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
.globl ecp_nistz256_neg
.type	ecp_nistz256_neg,\@function,2
.align	32
ecp_nistz256_neg:
.cfi_startproc
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
.Lneg_body:

	xor	$a0, $a0
	xor	$a1, $a1
	xor	$a2, $a2
	xor	$a3, $a3
	xor	$t4, $t4

	sub	8*0($a_ptr), $a0
	sbb	8*1($a_ptr), $a1
	sbb	8*2($a_ptr), $a2
	mov	$a0, $t0
	sbb	8*3($a_ptr), $a3
	lea	.Lpoly(%rip), $a_ptr
	mov	$a1, $t1
	sbb	\$0, $t4

	add	8*0($a_ptr), $a0
	mov	$a2, $t2
	adc
8*1($a_ptr), $a1 123 adc 8*2($a_ptr), $a2 124 mov $a3, $t3 125 adc 8*3($a_ptr), $a3 126 test $t4, $t4 127 128 cmovz $t0, $a0 129 cmovz $t1, $a1 130 mov $a0, 8*0($r_ptr) 131 cmovz $t2, $a2 132 mov $a1, 8*1($r_ptr) 133 cmovz $t3, $a3 134 mov $a2, 8*2($r_ptr) 135 mov $a3, 8*3($r_ptr) 136 137 mov 0(%rsp),%r13 138.cfi_restore %r13 139 mov 8(%rsp),%r12 140.cfi_restore %r12 141 lea 16(%rsp),%rsp 142.cfi_adjust_cfa_offset -16 143.Lneg_epilogue: 144 ret 145.cfi_endproc 146.size ecp_nistz256_neg,.-ecp_nistz256_neg 147___ 148} 149{ 150my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); 151my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); 152my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); 153my ($poly1,$poly3)=($acc6,$acc7); 154 155$code.=<<___; 156################################################################################ 157# void ecp_nistz256_ord_mul_mont( 158# uint64_t res[4], 159# uint64_t a[4], 160# uint64_t b[4]); 161 162.globl ecp_nistz256_ord_mul_mont 163.type ecp_nistz256_ord_mul_mont,\@function,3 164.align 32 165ecp_nistz256_ord_mul_mont: 166.cfi_startproc 167___ 168$code.=<<___ if ($addx); 169 leaq OPENSSL_ia32cap_P(%rip), %rcx 170 mov 8(%rcx), %rcx 171 and \$0x80100, %ecx 172 cmp \$0x80100, %ecx 173 je .Lecp_nistz256_ord_mul_montx 174___ 175$code.=<<___; 176 push %rbp 177.cfi_push %rbp 178 push %rbx 179.cfi_push %rbx 180 push %r12 181.cfi_push %r12 182 push %r13 183.cfi_push %r13 184 push %r14 185.cfi_push %r14 186 push %r15 187.cfi_push %r15 188.Lord_mul_body: 189 190 mov 8*0($b_org), %rax 191 mov $b_org, $b_ptr 192 lea .Lord(%rip), %r14 193 mov .LordK(%rip), %r15 194 195 ################################# * b[0] 196 mov %rax, $t0 197 mulq 8*0($a_ptr) 198 mov %rax, $acc0 199 mov $t0, %rax 200 mov %rdx, $acc1 201 202 mulq 8*1($a_ptr) 203 add %rax, $acc1 204 mov $t0, %rax 205 adc \$0, %rdx 206 mov %rdx, $acc2 207 208 mulq 8*2($a_ptr) 209 add %rax, $acc2 210 mov $t0, %rax 211 adc \$0, %rdx 212 213 mov $acc0, $acc5 214 imulq %r15,$acc0 215 216 mov %rdx, $acc3 217 mulq 8*3($a_ptr) 218 add %rax, $acc3 219 mov $acc0, %rax 220 adc \$0, %rdx 221 mov %rdx, $acc4 222 223 ################################# First reduction step 224 mulq 8*0(%r14) 225 mov $acc0, $t1 226 add %rax, $acc5 # guaranteed to be zero 227 mov $acc0, %rax 228 adc \$0, %rdx 229 mov %rdx, $t0 230 231 sub $acc0, $acc2 232 sbb \$0, $acc0 # can't borrow 233 234 mulq 8*1(%r14) 235 add $t0, $acc1 236 adc \$0, %rdx 237 add %rax, $acc1 238 mov $t1, %rax 239 adc %rdx, $acc2 240 mov $t1, %rdx 241 adc \$0, $acc0 # can't overflow 242 243 shl \$32, %rax 244 shr \$32, %rdx 245 sub %rax, $acc3 246 mov 8*1($b_ptr), %rax 247 sbb %rdx, $t1 # can't borrow 248 249 add $acc0, $acc3 250 adc $t1, $acc4 251 adc \$0, $acc5 252 253 ################################# * b[1] 254 mov %rax, $t0 255 mulq 8*0($a_ptr) 256 add %rax, $acc1 257 mov $t0, %rax 258 adc \$0, %rdx 259 mov %rdx, $t1 260 261 mulq 8*1($a_ptr) 262 add $t1, $acc2 263 adc \$0, %rdx 264 add %rax, $acc2 265 mov $t0, %rax 266 adc \$0, %rdx 267 mov %rdx, $t1 268 269 mulq 8*2($a_ptr) 270 add $t1, $acc3 271 adc \$0, %rdx 272 add %rax, $acc3 273 mov $t0, %rax 274 adc \$0, %rdx 275 276 mov $acc1, $t0 277 imulq %r15, $acc1 278 279 mov %rdx, $t1 280 mulq 8*3($a_ptr) 281 add $t1, $acc4 282 adc \$0, %rdx 283 xor $acc0, $acc0 284 add %rax, $acc4 285 mov $acc1, %rax 286 adc %rdx, $acc5 287 adc \$0, $acc0 288 289 ################################# Second reduction step 290 mulq 8*0(%r14) 291 mov $acc1, $t1 292 add %rax, $t0 # guaranteed to be zero 293 mov 
$acc1, %rax 294 adc %rdx, $t0 295 296 sub $acc1, $acc3 297 sbb \$0, $acc1 # can't borrow 298 299 mulq 8*1(%r14) 300 add $t0, $acc2 301 adc \$0, %rdx 302 add %rax, $acc2 303 mov $t1, %rax 304 adc %rdx, $acc3 305 mov $t1, %rdx 306 adc \$0, $acc1 # can't overflow 307 308 shl \$32, %rax 309 shr \$32, %rdx 310 sub %rax, $acc4 311 mov 8*2($b_ptr), %rax 312 sbb %rdx, $t1 # can't borrow 313 314 add $acc1, $acc4 315 adc $t1, $acc5 316 adc \$0, $acc0 317 318 ################################## * b[2] 319 mov %rax, $t0 320 mulq 8*0($a_ptr) 321 add %rax, $acc2 322 mov $t0, %rax 323 adc \$0, %rdx 324 mov %rdx, $t1 325 326 mulq 8*1($a_ptr) 327 add $t1, $acc3 328 adc \$0, %rdx 329 add %rax, $acc3 330 mov $t0, %rax 331 adc \$0, %rdx 332 mov %rdx, $t1 333 334 mulq 8*2($a_ptr) 335 add $t1, $acc4 336 adc \$0, %rdx 337 add %rax, $acc4 338 mov $t0, %rax 339 adc \$0, %rdx 340 341 mov $acc2, $t0 342 imulq %r15, $acc2 343 344 mov %rdx, $t1 345 mulq 8*3($a_ptr) 346 add $t1, $acc5 347 adc \$0, %rdx 348 xor $acc1, $acc1 349 add %rax, $acc5 350 mov $acc2, %rax 351 adc %rdx, $acc0 352 adc \$0, $acc1 353 354 ################################# Third reduction step 355 mulq 8*0(%r14) 356 mov $acc2, $t1 357 add %rax, $t0 # guaranteed to be zero 358 mov $acc2, %rax 359 adc %rdx, $t0 360 361 sub $acc2, $acc4 362 sbb \$0, $acc2 # can't borrow 363 364 mulq 8*1(%r14) 365 add $t0, $acc3 366 adc \$0, %rdx 367 add %rax, $acc3 368 mov $t1, %rax 369 adc %rdx, $acc4 370 mov $t1, %rdx 371 adc \$0, $acc2 # can't overflow 372 373 shl \$32, %rax 374 shr \$32, %rdx 375 sub %rax, $acc5 376 mov 8*3($b_ptr), %rax 377 sbb %rdx, $t1 # can't borrow 378 379 add $acc2, $acc5 380 adc $t1, $acc0 381 adc \$0, $acc1 382 383 ################################# * b[3] 384 mov %rax, $t0 385 mulq 8*0($a_ptr) 386 add %rax, $acc3 387 mov $t0, %rax 388 adc \$0, %rdx 389 mov %rdx, $t1 390 391 mulq 8*1($a_ptr) 392 add $t1, $acc4 393 adc \$0, %rdx 394 add %rax, $acc4 395 mov $t0, %rax 396 adc \$0, %rdx 397 mov %rdx, $t1 398 399 mulq 8*2($a_ptr) 400 add $t1, $acc5 401 adc \$0, %rdx 402 add %rax, $acc5 403 mov $t0, %rax 404 adc \$0, %rdx 405 406 mov $acc3, $t0 407 imulq %r15, $acc3 408 409 mov %rdx, $t1 410 mulq 8*3($a_ptr) 411 add $t1, $acc0 412 adc \$0, %rdx 413 xor $acc2, $acc2 414 add %rax, $acc0 415 mov $acc3, %rax 416 adc %rdx, $acc1 417 adc \$0, $acc2 418 419 ################################# Last reduction step 420 mulq 8*0(%r14) 421 mov $acc3, $t1 422 add %rax, $t0 # guaranteed to be zero 423 mov $acc3, %rax 424 adc %rdx, $t0 425 426 sub $acc3, $acc5 427 sbb \$0, $acc3 # can't borrow 428 429 mulq 8*1(%r14) 430 add $t0, $acc4 431 adc \$0, %rdx 432 add %rax, $acc4 433 mov $t1, %rax 434 adc %rdx, $acc5 435 mov $t1, %rdx 436 adc \$0, $acc3 # can't overflow 437 438 shl \$32, %rax 439 shr \$32, %rdx 440 sub %rax, $acc0 441 sbb %rdx, $t1 # can't borrow 442 443 add $acc3, $acc0 444 adc $t1, $acc1 445 adc \$0, $acc2 446 447 ################################# Subtract ord 448 mov $acc4, $a_ptr 449 sub 8*0(%r14), $acc4 450 mov $acc5, $acc3 451 sbb 8*1(%r14), $acc5 452 mov $acc0, $t0 453 sbb 8*2(%r14), $acc0 454 mov $acc1, $t1 455 sbb 8*3(%r14), $acc1 456 sbb \$0, $acc2 457 458 cmovc $a_ptr, $acc4 459 cmovc $acc3, $acc5 460 cmovc $t0, $acc0 461 cmovc $t1, $acc1 462 463 mov $acc4, 8*0($r_ptr) 464 mov $acc5, 8*1($r_ptr) 465 mov $acc0, 8*2($r_ptr) 466 mov $acc1, 8*3($r_ptr) 467 468 mov 0(%rsp),%r15 469.cfi_restore %r15 470 mov 8(%rsp),%r14 471.cfi_restore %r14 472 mov 16(%rsp),%r13 473.cfi_restore %r13 474 mov 24(%rsp),%r12 475.cfi_restore %r12 476 mov 32(%rsp),%rbx 
477.cfi_restore %rbx 478 mov 40(%rsp),%rbp 479.cfi_restore %rbp 480 lea 48(%rsp),%rsp 481.cfi_adjust_cfa_offset -48 482.Lord_mul_epilogue: 483 ret 484.cfi_endproc 485.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont 486 487################################################################################ 488# void ecp_nistz256_ord_sqr_mont( 489# uint64_t res[4], 490# uint64_t a[4], 491# uint64_t rep); 492 493.globl ecp_nistz256_ord_sqr_mont 494.type ecp_nistz256_ord_sqr_mont,\@function,3 495.align 32 496ecp_nistz256_ord_sqr_mont: 497.cfi_startproc 498___ 499$code.=<<___ if ($addx); 500 leaq OPENSSL_ia32cap_P(%rip), %rcx 501 mov 8(%rcx), %rcx 502 and \$0x80100, %ecx 503 cmp \$0x80100, %ecx 504 je .Lecp_nistz256_ord_sqr_montx 505___ 506$code.=<<___; 507 push %rbp 508.cfi_push %rbp 509 push %rbx 510.cfi_push %rbx 511 push %r12 512.cfi_push %r12 513 push %r13 514.cfi_push %r13 515 push %r14 516.cfi_push %r14 517 push %r15 518.cfi_push %r15 519.Lord_sqr_body: 520 521 mov 8*0($a_ptr), $acc0 522 mov 8*1($a_ptr), %rax 523 mov 8*2($a_ptr), $acc6 524 mov 8*3($a_ptr), $acc7 525 lea .Lord(%rip), $a_ptr # pointer to modulus 526 mov $b_org, $b_ptr 527 jmp .Loop_ord_sqr 528 529.align 32 530.Loop_ord_sqr: 531 ################################# a[1:] * a[0] 532 mov %rax, $t1 # put aside a[1] 533 mul $acc0 # a[1] * a[0] 534 mov %rax, $acc1 535 movq $t1, %xmm1 # offload a[1] 536 mov $acc6, %rax 537 mov %rdx, $acc2 538 539 mul $acc0 # a[2] * a[0] 540 add %rax, $acc2 541 mov $acc7, %rax 542 movq $acc6, %xmm2 # offload a[2] 543 adc \$0, %rdx 544 mov %rdx, $acc3 545 546 mul $acc0 # a[3] * a[0] 547 add %rax, $acc3 548 mov $acc7, %rax 549 movq $acc7, %xmm3 # offload a[3] 550 adc \$0, %rdx 551 mov %rdx, $acc4 552 553 ################################# a[3] * a[2] 554 mul $acc6 # a[3] * a[2] 555 mov %rax, $acc5 556 mov $acc6, %rax 557 mov %rdx, $acc6 558 559 ################################# a[2:] * a[1] 560 mul $t1 # a[2] * a[1] 561 add %rax, $acc3 562 mov $acc7, %rax 563 adc \$0, %rdx 564 mov %rdx, $acc7 565 566 mul $t1 # a[3] * a[1] 567 add %rax, $acc4 568 adc \$0, %rdx 569 570 add $acc7, $acc4 571 adc %rdx, $acc5 572 adc \$0, $acc6 # can't overflow 573 574 ################################# *2 575 xor $acc7, $acc7 576 mov $acc0, %rax 577 add $acc1, $acc1 578 adc $acc2, $acc2 579 adc $acc3, $acc3 580 adc $acc4, $acc4 581 adc $acc5, $acc5 582 adc $acc6, $acc6 583 adc \$0, $acc7 584 585 ################################# Missing products 586 mul %rax # a[0] * a[0] 587 mov %rax, $acc0 588 movq %xmm1, %rax 589 mov %rdx, $t1 590 591 mul %rax # a[1] * a[1] 592 add $t1, $acc1 593 adc %rax, $acc2 594 movq %xmm2, %rax 595 adc \$0, %rdx 596 mov %rdx, $t1 597 598 mul %rax # a[2] * a[2] 599 add $t1, $acc3 600 adc %rax, $acc4 601 movq %xmm3, %rax 602 adc \$0, %rdx 603 mov %rdx, $t1 604 605 mov $acc0, $t0 606 imulq 8*4($a_ptr), $acc0 # *= .LordK 607 608 mul %rax # a[3] * a[3] 609 add $t1, $acc5 610 adc %rax, $acc6 611 mov 8*0($a_ptr), %rax # modulus[0] 612 adc %rdx, $acc7 # can't overflow 613 614 ################################# First reduction step 615 mul $acc0 616 mov $acc0, $t1 617 add %rax, $t0 # guaranteed to be zero 618 mov 8*1($a_ptr), %rax # modulus[1] 619 adc %rdx, $t0 620 621 sub $acc0, $acc2 622 sbb \$0, $t1 # can't borrow 623 624 mul $acc0 625 add $t0, $acc1 626 adc \$0, %rdx 627 add %rax, $acc1 628 mov $acc0, %rax 629 adc %rdx, $acc2 630 mov $acc0, %rdx 631 adc \$0, $t1 # can't overflow 632 633 mov $acc1, $t0 634 imulq 8*4($a_ptr), $acc1 # *= .LordK 635 636 shl \$32, %rax 637 shr \$32, %rdx 638 sub %rax, 
$acc3 639 mov 8*0($a_ptr), %rax 640 sbb %rdx, $acc0 # can't borrow 641 642 add $t1, $acc3 643 adc \$0, $acc0 # can't overflow 644 645 ################################# Second reduction step 646 mul $acc1 647 mov $acc1, $t1 648 add %rax, $t0 # guaranteed to be zero 649 mov 8*1($a_ptr), %rax 650 adc %rdx, $t0 651 652 sub $acc1, $acc3 653 sbb \$0, $t1 # can't borrow 654 655 mul $acc1 656 add $t0, $acc2 657 adc \$0, %rdx 658 add %rax, $acc2 659 mov $acc1, %rax 660 adc %rdx, $acc3 661 mov $acc1, %rdx 662 adc \$0, $t1 # can't overflow 663 664 mov $acc2, $t0 665 imulq 8*4($a_ptr), $acc2 # *= .LordK 666 667 shl \$32, %rax 668 shr \$32, %rdx 669 sub %rax, $acc0 670 mov 8*0($a_ptr), %rax 671 sbb %rdx, $acc1 # can't borrow 672 673 add $t1, $acc0 674 adc \$0, $acc1 # can't overflow 675 676 ################################# Third reduction step 677 mul $acc2 678 mov $acc2, $t1 679 add %rax, $t0 # guaranteed to be zero 680 mov 8*1($a_ptr), %rax 681 adc %rdx, $t0 682 683 sub $acc2, $acc0 684 sbb \$0, $t1 # can't borrow 685 686 mul $acc2 687 add $t0, $acc3 688 adc \$0, %rdx 689 add %rax, $acc3 690 mov $acc2, %rax 691 adc %rdx, $acc0 692 mov $acc2, %rdx 693 adc \$0, $t1 # can't overflow 694 695 mov $acc3, $t0 696 imulq 8*4($a_ptr), $acc3 # *= .LordK 697 698 shl \$32, %rax 699 shr \$32, %rdx 700 sub %rax, $acc1 701 mov 8*0($a_ptr), %rax 702 sbb %rdx, $acc2 # can't borrow 703 704 add $t1, $acc1 705 adc \$0, $acc2 # can't overflow 706 707 ################################# Last reduction step 708 mul $acc3 709 mov $acc3, $t1 710 add %rax, $t0 # guaranteed to be zero 711 mov 8*1($a_ptr), %rax 712 adc %rdx, $t0 713 714 sub $acc3, $acc1 715 sbb \$0, $t1 # can't borrow 716 717 mul $acc3 718 add $t0, $acc0 719 adc \$0, %rdx 720 add %rax, $acc0 721 mov $acc3, %rax 722 adc %rdx, $acc1 723 mov $acc3, %rdx 724 adc \$0, $t1 # can't overflow 725 726 shl \$32, %rax 727 shr \$32, %rdx 728 sub %rax, $acc2 729 sbb %rdx, $acc3 # can't borrow 730 731 add $t1, $acc2 732 adc \$0, $acc3 # can't overflow 733 734 ################################# Add bits [511:256] of the sqr result 735 xor %rdx, %rdx 736 add $acc4, $acc0 737 adc $acc5, $acc1 738 mov $acc0, $acc4 739 adc $acc6, $acc2 740 adc $acc7, $acc3 741 mov $acc1, %rax 742 adc \$0, %rdx 743 744 ################################# Compare to modulus 745 sub 8*0($a_ptr), $acc0 746 mov $acc2, $acc6 747 sbb 8*1($a_ptr), $acc1 748 sbb 8*2($a_ptr), $acc2 749 mov $acc3, $acc7 750 sbb 8*3($a_ptr), $acc3 751 sbb \$0, %rdx 752 753 cmovc $acc4, $acc0 754 cmovnc $acc1, %rax 755 cmovnc $acc2, $acc6 756 cmovnc $acc3, $acc7 757 758 dec $b_ptr 759 jnz .Loop_ord_sqr 760 761 mov $acc0, 8*0($r_ptr) 762 mov %rax, 8*1($r_ptr) 763 pxor %xmm1, %xmm1 764 mov $acc6, 8*2($r_ptr) 765 pxor %xmm2, %xmm2 766 mov $acc7, 8*3($r_ptr) 767 pxor %xmm3, %xmm3 768 769 mov 0(%rsp),%r15 770.cfi_restore %r15 771 mov 8(%rsp),%r14 772.cfi_restore %r14 773 mov 16(%rsp),%r13 774.cfi_restore %r13 775 mov 24(%rsp),%r12 776.cfi_restore %r12 777 mov 32(%rsp),%rbx 778.cfi_restore %rbx 779 mov 40(%rsp),%rbp 780.cfi_restore %rbp 781 lea 48(%rsp),%rsp 782.cfi_adjust_cfa_offset -48 783.Lord_sqr_epilogue: 784 ret 785.cfi_endproc 786.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont 787___ 788 789$code.=<<___ if ($addx); 790################################################################################ 791.type ecp_nistz256_ord_mul_montx,\@function,3 792.align 32 793ecp_nistz256_ord_mul_montx: 794.cfi_startproc 795.Lecp_nistz256_ord_mul_montx: 796 push %rbp 797.cfi_push %rbp 798 push %rbx 799.cfi_push %rbx 800 push %r12 
801.cfi_push %r12 802 push %r13 803.cfi_push %r13 804 push %r14 805.cfi_push %r14 806 push %r15 807.cfi_push %r15 808.Lord_mulx_body: 809 810 mov $b_org, $b_ptr 811 mov 8*0($b_org), %rdx 812 mov 8*0($a_ptr), $acc1 813 mov 8*1($a_ptr), $acc2 814 mov 8*2($a_ptr), $acc3 815 mov 8*3($a_ptr), $acc4 816 lea -128($a_ptr), $a_ptr # control u-op density 817 lea .Lord-128(%rip), %r14 818 mov .LordK(%rip), %r15 819 820 ################################# Multiply by b[0] 821 mulx $acc1, $acc0, $acc1 822 mulx $acc2, $t0, $acc2 823 mulx $acc3, $t1, $acc3 824 add $t0, $acc1 825 mulx $acc4, $t0, $acc4 826 mov $acc0, %rdx 827 mulx %r15, %rdx, %rax 828 adc $t1, $acc2 829 adc $t0, $acc3 830 adc \$0, $acc4 831 832 ################################# reduction 833 xor $acc5, $acc5 # $acc5=0, cf=0, of=0 834 mulx 8*0+128(%r14), $t0, $t1 835 adcx $t0, $acc0 # guaranteed to be zero 836 adox $t1, $acc1 837 838 mulx 8*1+128(%r14), $t0, $t1 839 adcx $t0, $acc1 840 adox $t1, $acc2 841 842 mulx 8*2+128(%r14), $t0, $t1 843 adcx $t0, $acc2 844 adox $t1, $acc3 845 846 mulx 8*3+128(%r14), $t0, $t1 847 mov 8*1($b_ptr), %rdx 848 adcx $t0, $acc3 849 adox $t1, $acc4 850 adcx $acc0, $acc4 851 adox $acc0, $acc5 852 adc \$0, $acc5 # cf=0, of=0 853 854 ################################# Multiply by b[1] 855 mulx 8*0+128($a_ptr), $t0, $t1 856 adcx $t0, $acc1 857 adox $t1, $acc2 858 859 mulx 8*1+128($a_ptr), $t0, $t1 860 adcx $t0, $acc2 861 adox $t1, $acc3 862 863 mulx 8*2+128($a_ptr), $t0, $t1 864 adcx $t0, $acc3 865 adox $t1, $acc4 866 867 mulx 8*3+128($a_ptr), $t0, $t1 868 mov $acc1, %rdx 869 mulx %r15, %rdx, %rax 870 adcx $t0, $acc4 871 adox $t1, $acc5 872 873 adcx $acc0, $acc5 874 adox $acc0, $acc0 875 adc \$0, $acc0 # cf=0, of=0 876 877 ################################# reduction 878 mulx 8*0+128(%r14), $t0, $t1 879 adcx $t0, $acc1 # guaranteed to be zero 880 adox $t1, $acc2 881 882 mulx 8*1+128(%r14), $t0, $t1 883 adcx $t0, $acc2 884 adox $t1, $acc3 885 886 mulx 8*2+128(%r14), $t0, $t1 887 adcx $t0, $acc3 888 adox $t1, $acc4 889 890 mulx 8*3+128(%r14), $t0, $t1 891 mov 8*2($b_ptr), %rdx 892 adcx $t0, $acc4 893 adox $t1, $acc5 894 adcx $acc1, $acc5 895 adox $acc1, $acc0 896 adc \$0, $acc0 # cf=0, of=0 897 898 ################################# Multiply by b[2] 899 mulx 8*0+128($a_ptr), $t0, $t1 900 adcx $t0, $acc2 901 adox $t1, $acc3 902 903 mulx 8*1+128($a_ptr), $t0, $t1 904 adcx $t0, $acc3 905 adox $t1, $acc4 906 907 mulx 8*2+128($a_ptr), $t0, $t1 908 adcx $t0, $acc4 909 adox $t1, $acc5 910 911 mulx 8*3+128($a_ptr), $t0, $t1 912 mov $acc2, %rdx 913 mulx %r15, %rdx, %rax 914 adcx $t0, $acc5 915 adox $t1, $acc0 916 917 adcx $acc1, $acc0 918 adox $acc1, $acc1 919 adc \$0, $acc1 # cf=0, of=0 920 921 ################################# reduction 922 mulx 8*0+128(%r14), $t0, $t1 923 adcx $t0, $acc2 # guaranteed to be zero 924 adox $t1, $acc3 925 926 mulx 8*1+128(%r14), $t0, $t1 927 adcx $t0, $acc3 928 adox $t1, $acc4 929 930 mulx 8*2+128(%r14), $t0, $t1 931 adcx $t0, $acc4 932 adox $t1, $acc5 933 934 mulx 8*3+128(%r14), $t0, $t1 935 mov 8*3($b_ptr), %rdx 936 adcx $t0, $acc5 937 adox $t1, $acc0 938 adcx $acc2, $acc0 939 adox $acc2, $acc1 940 adc \$0, $acc1 # cf=0, of=0 941 942 ################################# Multiply by b[3] 943 mulx 8*0+128($a_ptr), $t0, $t1 944 adcx $t0, $acc3 945 adox $t1, $acc4 946 947 mulx 8*1+128($a_ptr), $t0, $t1 948 adcx $t0, $acc4 949 adox $t1, $acc5 950 951 mulx 8*2+128($a_ptr), $t0, $t1 952 adcx $t0, $acc5 953 adox $t1, $acc0 954 955 mulx 8*3+128($a_ptr), $t0, $t1 956 mov $acc3, %rdx 957 mulx %r15, %rdx, %rax 
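	# Note: %rdx now holds $acc3*.LordK mod 2^64, the Montgomery
	# multiplier for the last reduction step; adding multiplier*ord
	# below makes the low limb vanish. %rax takes the unneeded high
	# half of the product.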
958 adcx $t0, $acc0 959 adox $t1, $acc1 960 961 adcx $acc2, $acc1 962 adox $acc2, $acc2 963 adc \$0, $acc2 # cf=0, of=0 964 965 ################################# reduction 966 mulx 8*0+128(%r14), $t0, $t1 967 adcx $t0, $acc3 # guranteed to be zero 968 adox $t1, $acc4 969 970 mulx 8*1+128(%r14), $t0, $t1 971 adcx $t0, $acc4 972 adox $t1, $acc5 973 974 mulx 8*2+128(%r14), $t0, $t1 975 adcx $t0, $acc5 976 adox $t1, $acc0 977 978 mulx 8*3+128(%r14), $t0, $t1 979 lea 128(%r14),%r14 980 mov $acc4, $t2 981 adcx $t0, $acc0 982 adox $t1, $acc1 983 mov $acc5, $t3 984 adcx $acc3, $acc1 985 adox $acc3, $acc2 986 adc \$0, $acc2 987 988 ################################# 989 # Branch-less conditional subtraction of P 990 mov $acc0, $t0 991 sub 8*0(%r14), $acc4 992 sbb 8*1(%r14), $acc5 993 sbb 8*2(%r14), $acc0 994 mov $acc1, $t1 995 sbb 8*3(%r14), $acc1 996 sbb \$0, $acc2 997 998 cmovc $t2, $acc4 999 cmovc $t3, $acc5 1000 cmovc $t0, $acc0 1001 cmovc $t1, $acc1 1002 1003 mov $acc4, 8*0($r_ptr) 1004 mov $acc5, 8*1($r_ptr) 1005 mov $acc0, 8*2($r_ptr) 1006 mov $acc1, 8*3($r_ptr) 1007 1008 mov 0(%rsp),%r15 1009.cfi_restore %r15 1010 mov 8(%rsp),%r14 1011.cfi_restore %r14 1012 mov 16(%rsp),%r13 1013.cfi_restore %r13 1014 mov 24(%rsp),%r12 1015.cfi_restore %r12 1016 mov 32(%rsp),%rbx 1017.cfi_restore %rbx 1018 mov 40(%rsp),%rbp 1019.cfi_restore %rbp 1020 lea 48(%rsp),%rsp 1021.cfi_adjust_cfa_offset -48 1022.Lord_mulx_epilogue: 1023 ret 1024.cfi_endproc 1025.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx 1026 1027.type ecp_nistz256_ord_sqr_montx,\@function,3 1028.align 32 1029ecp_nistz256_ord_sqr_montx: 1030.cfi_startproc 1031.Lecp_nistz256_ord_sqr_montx: 1032 push %rbp 1033.cfi_push %rbp 1034 push %rbx 1035.cfi_push %rbx 1036 push %r12 1037.cfi_push %r12 1038 push %r13 1039.cfi_push %r13 1040 push %r14 1041.cfi_push %r14 1042 push %r15 1043.cfi_push %r15 1044.Lord_sqrx_body: 1045 1046 mov $b_org, $b_ptr 1047 mov 8*0($a_ptr), %rdx 1048 mov 8*1($a_ptr), $acc6 1049 mov 8*2($a_ptr), $acc7 1050 mov 8*3($a_ptr), $acc0 1051 lea .Lord(%rip), $a_ptr 1052 jmp .Loop_ord_sqrx 1053 1054.align 32 1055.Loop_ord_sqrx: 1056 mulx $acc6, $acc1, $acc2 # a[0]*a[1] 1057 mulx $acc7, $t0, $acc3 # a[0]*a[2] 1058 mov %rdx, %rax # offload a[0] 1059 movq $acc6, %xmm1 # offload a[1] 1060 mulx $acc0, $t1, $acc4 # a[0]*a[3] 1061 mov $acc6, %rdx 1062 add $t0, $acc2 1063 movq $acc7, %xmm2 # offload a[2] 1064 adc $t1, $acc3 1065 adc \$0, $acc4 1066 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 1067 ################################# 1068 mulx $acc7, $t0, $t1 # a[1]*a[2] 1069 adcx $t0, $acc3 1070 adox $t1, $acc4 1071 1072 mulx $acc0, $t0, $t1 # a[1]*a[3] 1073 mov $acc7, %rdx 1074 adcx $t0, $acc4 1075 adox $t1, $acc5 1076 adc \$0, $acc5 1077 ################################# 1078 mulx $acc0, $t0, $acc6 # a[2]*a[3] 1079 mov %rax, %rdx 1080 movq $acc0, %xmm3 # offload a[3] 1081 xor $acc7, $acc7 # $acc7=0,cf=0,of=0 1082 adcx $acc1, $acc1 # acc1:6<<1 1083 adox $t0, $acc5 1084 adcx $acc2, $acc2 1085 adox $acc7, $acc6 # of=0 1086 1087 ################################# a[i]*a[i] 1088 mulx %rdx, $acc0, $t1 1089 movq %xmm1, %rdx 1090 adcx $acc3, $acc3 1091 adox $t1, $acc1 1092 adcx $acc4, $acc4 1093 mulx %rdx, $t0, $t4 1094 movq %xmm2, %rdx 1095 adcx $acc5, $acc5 1096 adox $t0, $acc2 1097 adcx $acc6, $acc6 1098 mulx %rdx, $t0, $t1 1099 .byte 0x67 1100 movq %xmm3, %rdx 1101 adox $t4, $acc3 1102 adcx $acc7, $acc7 1103 adox $t0, $acc4 1104 adox $t1, $acc5 1105 mulx %rdx, $t0, $t4 1106 adox $t0, $acc6 1107 adox $t4, $acc7 1108 1109 
################################# reduction 1110 mov $acc0, %rdx 1111 mulx 8*4($a_ptr), %rdx, $t0 1112 1113 xor %rax, %rax # cf=0, of=0 1114 mulx 8*0($a_ptr), $t0, $t1 1115 adcx $t0, $acc0 # guaranteed to be zero 1116 adox $t1, $acc1 1117 mulx 8*1($a_ptr), $t0, $t1 1118 adcx $t0, $acc1 1119 adox $t1, $acc2 1120 mulx 8*2($a_ptr), $t0, $t1 1121 adcx $t0, $acc2 1122 adox $t1, $acc3 1123 mulx 8*3($a_ptr), $t0, $t1 1124 adcx $t0, $acc3 1125 adox $t1, $acc0 # of=0 1126 adcx %rax, $acc0 # cf=0 1127 1128 ################################# 1129 mov $acc1, %rdx 1130 mulx 8*4($a_ptr), %rdx, $t0 1131 1132 mulx 8*0($a_ptr), $t0, $t1 1133 adox $t0, $acc1 # guaranteed to be zero 1134 adcx $t1, $acc2 1135 mulx 8*1($a_ptr), $t0, $t1 1136 adox $t0, $acc2 1137 adcx $t1, $acc3 1138 mulx 8*2($a_ptr), $t0, $t1 1139 adox $t0, $acc3 1140 adcx $t1, $acc0 1141 mulx 8*3($a_ptr), $t0, $t1 1142 adox $t0, $acc0 1143 adcx $t1, $acc1 # cf=0 1144 adox %rax, $acc1 # of=0 1145 1146 ################################# 1147 mov $acc2, %rdx 1148 mulx 8*4($a_ptr), %rdx, $t0 1149 1150 mulx 8*0($a_ptr), $t0, $t1 1151 adcx $t0, $acc2 # guaranteed to be zero 1152 adox $t1, $acc3 1153 mulx 8*1($a_ptr), $t0, $t1 1154 adcx $t0, $acc3 1155 adox $t1, $acc0 1156 mulx 8*2($a_ptr), $t0, $t1 1157 adcx $t0, $acc0 1158 adox $t1, $acc1 1159 mulx 8*3($a_ptr), $t0, $t1 1160 adcx $t0, $acc1 1161 adox $t1, $acc2 # of=0 1162 adcx %rax, $acc2 # cf=0 1163 1164 ################################# 1165 mov $acc3, %rdx 1166 mulx 8*4($a_ptr), %rdx, $t0 1167 1168 mulx 8*0($a_ptr), $t0, $t1 1169 adox $t0, $acc3 # guaranteed to be zero 1170 adcx $t1, $acc0 1171 mulx 8*1($a_ptr), $t0, $t1 1172 adox $t0, $acc0 1173 adcx $t1, $acc1 1174 mulx 8*2($a_ptr), $t0, $t1 1175 adox $t0, $acc1 1176 adcx $t1, $acc2 1177 mulx 8*3($a_ptr), $t0, $t1 1178 adox $t0, $acc2 1179 adcx $t1, $acc3 1180 adox %rax, $acc3 1181 1182 ################################# accumulate upper half 1183 add $acc0, $acc4 # add $acc4, $acc0 1184 adc $acc5, $acc1 1185 mov $acc4, %rdx 1186 adc $acc6, $acc2 1187 adc $acc7, $acc3 1188 mov $acc1, $acc6 1189 adc \$0, %rax 1190 1191 ################################# compare to modulus 1192 sub 8*0($a_ptr), $acc4 1193 mov $acc2, $acc7 1194 sbb 8*1($a_ptr), $acc1 1195 sbb 8*2($a_ptr), $acc2 1196 mov $acc3, $acc0 1197 sbb 8*3($a_ptr), $acc3 1198 sbb \$0, %rax 1199 1200 cmovnc $acc4, %rdx 1201 cmovnc $acc1, $acc6 1202 cmovnc $acc2, $acc7 1203 cmovnc $acc3, $acc0 1204 1205 dec $b_ptr 1206 jnz .Loop_ord_sqrx 1207 1208 mov %rdx, 8*0($r_ptr) 1209 mov $acc6, 8*1($r_ptr) 1210 pxor %xmm1, %xmm1 1211 mov $acc7, 8*2($r_ptr) 1212 pxor %xmm2, %xmm2 1213 mov $acc0, 8*3($r_ptr) 1214 pxor %xmm3, %xmm3 1215 1216 mov 0(%rsp),%r15 1217.cfi_restore %r15 1218 mov 8(%rsp),%r14 1219.cfi_restore %r14 1220 mov 16(%rsp),%r13 1221.cfi_restore %r13 1222 mov 24(%rsp),%r12 1223.cfi_restore %r12 1224 mov 32(%rsp),%rbx 1225.cfi_restore %rbx 1226 mov 40(%rsp),%rbp 1227.cfi_restore %rbp 1228 lea 48(%rsp),%rsp 1229.cfi_adjust_cfa_offset -48 1230.Lord_sqrx_epilogue: 1231 ret 1232.cfi_endproc 1233.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx 1234___ 1235 1236$code.=<<___; 1237################################################################################ 1238# void ecp_nistz256_mul_mont( 1239# uint64_t res[4], 1240# uint64_t a[4], 1241# uint64_t b[4]); 1242 1243.globl ecp_nistz256_mul_mont 1244.type ecp_nistz256_mul_mont,\@function,3 1245.align 32 1246ecp_nistz256_mul_mont: 1247.cfi_startproc 1248___ 1249$code.=<<___ if ($addx); 1250 leaq OPENSSL_ia32cap_P(%rip), %rcx 1251 mov 
8(%rcx), %rcx 1252 and \$0x80100, %ecx 1253___ 1254$code.=<<___; 1255.Lmul_mont: 1256 push %rbp 1257.cfi_push %rbp 1258 push %rbx 1259.cfi_push %rbx 1260 push %r12 1261.cfi_push %r12 1262 push %r13 1263.cfi_push %r13 1264 push %r14 1265.cfi_push %r14 1266 push %r15 1267.cfi_push %r15 1268.Lmul_body: 1269___ 1270$code.=<<___ if ($addx); 1271 cmp \$0x80100, %ecx 1272 je .Lmul_montx 1273___ 1274$code.=<<___; 1275 mov $b_org, $b_ptr 1276 mov 8*0($b_org), %rax 1277 mov 8*0($a_ptr), $acc1 1278 mov 8*1($a_ptr), $acc2 1279 mov 8*2($a_ptr), $acc3 1280 mov 8*3($a_ptr), $acc4 1281 1282 call __ecp_nistz256_mul_montq 1283___ 1284$code.=<<___ if ($addx); 1285 jmp .Lmul_mont_done 1286 1287.align 32 1288.Lmul_montx: 1289 mov $b_org, $b_ptr 1290 mov 8*0($b_org), %rdx 1291 mov 8*0($a_ptr), $acc1 1292 mov 8*1($a_ptr), $acc2 1293 mov 8*2($a_ptr), $acc3 1294 mov 8*3($a_ptr), $acc4 1295 lea -128($a_ptr), $a_ptr # control u-op density 1296 1297 call __ecp_nistz256_mul_montx 1298___ 1299$code.=<<___; 1300.Lmul_mont_done: 1301 mov 0(%rsp),%r15 1302.cfi_restore %r15 1303 mov 8(%rsp),%r14 1304.cfi_restore %r14 1305 mov 16(%rsp),%r13 1306.cfi_restore %r13 1307 mov 24(%rsp),%r12 1308.cfi_restore %r12 1309 mov 32(%rsp),%rbx 1310.cfi_restore %rbx 1311 mov 40(%rsp),%rbp 1312.cfi_restore %rbp 1313 lea 48(%rsp),%rsp 1314.cfi_adjust_cfa_offset -48 1315.Lmul_epilogue: 1316 ret 1317.cfi_endproc 1318.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont 1319 1320.type __ecp_nistz256_mul_montq,\@abi-omnipotent 1321.align 32 1322__ecp_nistz256_mul_montq: 1323.cfi_startproc 1324 ######################################################################## 1325 # Multiply a by b[0] 1326 mov %rax, $t1 1327 mulq $acc1 1328 mov .Lpoly+8*1(%rip),$poly1 1329 mov %rax, $acc0 1330 mov $t1, %rax 1331 mov %rdx, $acc1 1332 1333 mulq $acc2 1334 mov .Lpoly+8*3(%rip),$poly3 1335 add %rax, $acc1 1336 mov $t1, %rax 1337 adc \$0, %rdx 1338 mov %rdx, $acc2 1339 1340 mulq $acc3 1341 add %rax, $acc2 1342 mov $t1, %rax 1343 adc \$0, %rdx 1344 mov %rdx, $acc3 1345 1346 mulq $acc4 1347 add %rax, $acc3 1348 mov $acc0, %rax 1349 adc \$0, %rdx 1350 xor $acc5, $acc5 1351 mov %rdx, $acc4 1352 1353 ######################################################################## 1354 # First reduction step 1355 # Basically now we want to multiply acc[0] by p256, 1356 # and add the result to the acc. 
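	# (p256 = 2^256 - 2^224 + 2^192 + 2^96 - 1; its lowest 64-bit word
	#  is 2^64 - 1, so the Montgomery multiplier is just acc[0] itself,
	#  and its two lowest words together form 2^96 - 1.)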
1357 # Due to the special form of p256 we do some optimizations 1358 # 1359 # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] 1360 # then we add acc[0] and get acc[0] x 2^96 1361 1362 mov $acc0, $t1 1363 shl \$32, $acc0 1364 mulq $poly3 1365 shr \$32, $t1 1366 add $acc0, $acc1 # +=acc[0]<<96 1367 adc $t1, $acc2 1368 adc %rax, $acc3 1369 mov 8*1($b_ptr), %rax 1370 adc %rdx, $acc4 1371 adc \$0, $acc5 1372 xor $acc0, $acc0 1373 1374 ######################################################################## 1375 # Multiply by b[1] 1376 mov %rax, $t1 1377 mulq 8*0($a_ptr) 1378 add %rax, $acc1 1379 mov $t1, %rax 1380 adc \$0, %rdx 1381 mov %rdx, $t0 1382 1383 mulq 8*1($a_ptr) 1384 add $t0, $acc2 1385 adc \$0, %rdx 1386 add %rax, $acc2 1387 mov $t1, %rax 1388 adc \$0, %rdx 1389 mov %rdx, $t0 1390 1391 mulq 8*2($a_ptr) 1392 add $t0, $acc3 1393 adc \$0, %rdx 1394 add %rax, $acc3 1395 mov $t1, %rax 1396 adc \$0, %rdx 1397 mov %rdx, $t0 1398 1399 mulq 8*3($a_ptr) 1400 add $t0, $acc4 1401 adc \$0, %rdx 1402 add %rax, $acc4 1403 mov $acc1, %rax 1404 adc %rdx, $acc5 1405 adc \$0, $acc0 1406 1407 ######################################################################## 1408 # Second reduction step 1409 mov $acc1, $t1 1410 shl \$32, $acc1 1411 mulq $poly3 1412 shr \$32, $t1 1413 add $acc1, $acc2 1414 adc $t1, $acc3 1415 adc %rax, $acc4 1416 mov 8*2($b_ptr), %rax 1417 adc %rdx, $acc5 1418 adc \$0, $acc0 1419 xor $acc1, $acc1 1420 1421 ######################################################################## 1422 # Multiply by b[2] 1423 mov %rax, $t1 1424 mulq 8*0($a_ptr) 1425 add %rax, $acc2 1426 mov $t1, %rax 1427 adc \$0, %rdx 1428 mov %rdx, $t0 1429 1430 mulq 8*1($a_ptr) 1431 add $t0, $acc3 1432 adc \$0, %rdx 1433 add %rax, $acc3 1434 mov $t1, %rax 1435 adc \$0, %rdx 1436 mov %rdx, $t0 1437 1438 mulq 8*2($a_ptr) 1439 add $t0, $acc4 1440 adc \$0, %rdx 1441 add %rax, $acc4 1442 mov $t1, %rax 1443 adc \$0, %rdx 1444 mov %rdx, $t0 1445 1446 mulq 8*3($a_ptr) 1447 add $t0, $acc5 1448 adc \$0, %rdx 1449 add %rax, $acc5 1450 mov $acc2, %rax 1451 adc %rdx, $acc0 1452 adc \$0, $acc1 1453 1454 ######################################################################## 1455 # Third reduction step 1456 mov $acc2, $t1 1457 shl \$32, $acc2 1458 mulq $poly3 1459 shr \$32, $t1 1460 add $acc2, $acc3 1461 adc $t1, $acc4 1462 adc %rax, $acc5 1463 mov 8*3($b_ptr), %rax 1464 adc %rdx, $acc0 1465 adc \$0, $acc1 1466 xor $acc2, $acc2 1467 1468 ######################################################################## 1469 # Multiply by b[3] 1470 mov %rax, $t1 1471 mulq 8*0($a_ptr) 1472 add %rax, $acc3 1473 mov $t1, %rax 1474 adc \$0, %rdx 1475 mov %rdx, $t0 1476 1477 mulq 8*1($a_ptr) 1478 add $t0, $acc4 1479 adc \$0, %rdx 1480 add %rax, $acc4 1481 mov $t1, %rax 1482 adc \$0, %rdx 1483 mov %rdx, $t0 1484 1485 mulq 8*2($a_ptr) 1486 add $t0, $acc5 1487 adc \$0, %rdx 1488 add %rax, $acc5 1489 mov $t1, %rax 1490 adc \$0, %rdx 1491 mov %rdx, $t0 1492 1493 mulq 8*3($a_ptr) 1494 add $t0, $acc0 1495 adc \$0, %rdx 1496 add %rax, $acc0 1497 mov $acc3, %rax 1498 adc %rdx, $acc1 1499 adc \$0, $acc2 1500 1501 ######################################################################## 1502 # Final reduction step 1503 mov $acc3, $t1 1504 shl \$32, $acc3 1505 mulq $poly3 1506 shr \$32, $t1 1507 add $acc3, $acc4 1508 adc $t1, $acc5 1509 mov $acc4, $t0 1510 adc %rax, $acc0 1511 adc %rdx, $acc1 1512 mov $acc5, $t1 1513 adc \$0, $acc2 1514 1515 ######################################################################## 1516 # Branch-less conditional subtraction of 
P 1517 sub \$-1, $acc4 # .Lpoly[0] 1518 mov $acc0, $t2 1519 sbb $poly1, $acc5 # .Lpoly[1] 1520 sbb \$0, $acc0 # .Lpoly[2] 1521 mov $acc1, $t3 1522 sbb $poly3, $acc1 # .Lpoly[3] 1523 sbb \$0, $acc2 1524 1525 cmovc $t0, $acc4 1526 cmovc $t1, $acc5 1527 mov $acc4, 8*0($r_ptr) 1528 cmovc $t2, $acc0 1529 mov $acc5, 8*1($r_ptr) 1530 cmovc $t3, $acc1 1531 mov $acc0, 8*2($r_ptr) 1532 mov $acc1, 8*3($r_ptr) 1533 1534 ret 1535.cfi_endproc 1536.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq 1537 1538################################################################################ 1539# void ecp_nistz256_sqr_mont( 1540# uint64_t res[4], 1541# uint64_t a[4]); 1542 1543# we optimize the square according to S.Gueron and V.Krasnov, 1544# "Speeding up Big-Number Squaring" 1545.globl ecp_nistz256_sqr_mont 1546.type ecp_nistz256_sqr_mont,\@function,2 1547.align 32 1548ecp_nistz256_sqr_mont: 1549.cfi_startproc 1550___ 1551$code.=<<___ if ($addx); 1552 leaq OPENSSL_ia32cap_P(%rip), %rcx 1553 mov 8(%rcx), %rcx 1554 and \$0x80100, %ecx 1555___ 1556$code.=<<___; 1557 push %rbp 1558.cfi_push %rbp 1559 push %rbx 1560.cfi_push %rbx 1561 push %r12 1562.cfi_push %r12 1563 push %r13 1564.cfi_push %r13 1565 push %r14 1566.cfi_push %r14 1567 push %r15 1568.cfi_push %r15 1569.Lsqr_body: 1570___ 1571$code.=<<___ if ($addx); 1572 cmp \$0x80100, %ecx 1573 je .Lsqr_montx 1574___ 1575$code.=<<___; 1576 mov 8*0($a_ptr), %rax 1577 mov 8*1($a_ptr), $acc6 1578 mov 8*2($a_ptr), $acc7 1579 mov 8*3($a_ptr), $acc0 1580 1581 call __ecp_nistz256_sqr_montq 1582___ 1583$code.=<<___ if ($addx); 1584 jmp .Lsqr_mont_done 1585 1586.align 32 1587.Lsqr_montx: 1588 mov 8*0($a_ptr), %rdx 1589 mov 8*1($a_ptr), $acc6 1590 mov 8*2($a_ptr), $acc7 1591 mov 8*3($a_ptr), $acc0 1592 lea -128($a_ptr), $a_ptr # control u-op density 1593 1594 call __ecp_nistz256_sqr_montx 1595___ 1596$code.=<<___; 1597.Lsqr_mont_done: 1598 mov 0(%rsp),%r15 1599.cfi_restore %r15 1600 mov 8(%rsp),%r14 1601.cfi_restore %r14 1602 mov 16(%rsp),%r13 1603.cfi_restore %r13 1604 mov 24(%rsp),%r12 1605.cfi_restore %r12 1606 mov 32(%rsp),%rbx 1607.cfi_restore %rbx 1608 mov 40(%rsp),%rbp 1609.cfi_restore %rbp 1610 lea 48(%rsp),%rsp 1611.cfi_adjust_cfa_offset -48 1612.Lsqr_epilogue: 1613 ret 1614.cfi_endproc 1615.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont 1616 1617.type __ecp_nistz256_sqr_montq,\@abi-omnipotent 1618.align 32 1619__ecp_nistz256_sqr_montq: 1620.cfi_startproc 1621 mov %rax, $acc5 1622 mulq $acc6 # a[1]*a[0] 1623 mov %rax, $acc1 1624 mov $acc7, %rax 1625 mov %rdx, $acc2 1626 1627 mulq $acc5 # a[0]*a[2] 1628 add %rax, $acc2 1629 mov $acc0, %rax 1630 adc \$0, %rdx 1631 mov %rdx, $acc3 1632 1633 mulq $acc5 # a[0]*a[3] 1634 add %rax, $acc3 1635 mov $acc7, %rax 1636 adc \$0, %rdx 1637 mov %rdx, $acc4 1638 1639 ################################# 1640 mulq $acc6 # a[1]*a[2] 1641 add %rax, $acc3 1642 mov $acc0, %rax 1643 adc \$0, %rdx 1644 mov %rdx, $t1 1645 1646 mulq $acc6 # a[1]*a[3] 1647 add %rax, $acc4 1648 mov $acc0, %rax 1649 adc \$0, %rdx 1650 add $t1, $acc4 1651 mov %rdx, $acc5 1652 adc \$0, $acc5 1653 1654 ################################# 1655 mulq $acc7 # a[2]*a[3] 1656 xor $acc7, $acc7 1657 add %rax, $acc5 1658 mov 8*0($a_ptr), %rax 1659 mov %rdx, $acc6 1660 adc \$0, $acc6 1661 1662 add $acc1, $acc1 # acc1:6<<1 1663 adc $acc2, $acc2 1664 adc $acc3, $acc3 1665 adc $acc4, $acc4 1666 adc $acc5, $acc5 1667 adc $acc6, $acc6 1668 adc \$0, $acc7 1669 1670 mulq %rax 1671 mov %rax, $acc0 1672 mov 8*1($a_ptr), %rax 1673 mov %rdx, $t0 1674 1675 mulq %rax 1676 add 
$t0, $acc1 1677 adc %rax, $acc2 1678 mov 8*2($a_ptr), %rax 1679 adc \$0, %rdx 1680 mov %rdx, $t0 1681 1682 mulq %rax 1683 add $t0, $acc3 1684 adc %rax, $acc4 1685 mov 8*3($a_ptr), %rax 1686 adc \$0, %rdx 1687 mov %rdx, $t0 1688 1689 mulq %rax 1690 add $t0, $acc5 1691 adc %rax, $acc6 1692 mov $acc0, %rax 1693 adc %rdx, $acc7 1694 1695 mov .Lpoly+8*1(%rip), $a_ptr 1696 mov .Lpoly+8*3(%rip), $t1 1697 1698 ########################################## 1699 # Now the reduction 1700 # First iteration 1701 mov $acc0, $t0 1702 shl \$32, $acc0 1703 mulq $t1 1704 shr \$32, $t0 1705 add $acc0, $acc1 # +=acc[0]<<96 1706 adc $t0, $acc2 1707 adc %rax, $acc3 1708 mov $acc1, %rax 1709 adc \$0, %rdx 1710 1711 ########################################## 1712 # Second iteration 1713 mov $acc1, $t0 1714 shl \$32, $acc1 1715 mov %rdx, $acc0 1716 mulq $t1 1717 shr \$32, $t0 1718 add $acc1, $acc2 1719 adc $t0, $acc3 1720 adc %rax, $acc0 1721 mov $acc2, %rax 1722 adc \$0, %rdx 1723 1724 ########################################## 1725 # Third iteration 1726 mov $acc2, $t0 1727 shl \$32, $acc2 1728 mov %rdx, $acc1 1729 mulq $t1 1730 shr \$32, $t0 1731 add $acc2, $acc3 1732 adc $t0, $acc0 1733 adc %rax, $acc1 1734 mov $acc3, %rax 1735 adc \$0, %rdx 1736 1737 ########################################### 1738 # Last iteration 1739 mov $acc3, $t0 1740 shl \$32, $acc3 1741 mov %rdx, $acc2 1742 mulq $t1 1743 shr \$32, $t0 1744 add $acc3, $acc0 1745 adc $t0, $acc1 1746 adc %rax, $acc2 1747 adc \$0, %rdx 1748 xor $acc3, $acc3 1749 1750 ############################################ 1751 # Add the rest of the acc 1752 add $acc0, $acc4 1753 adc $acc1, $acc5 1754 mov $acc4, $acc0 1755 adc $acc2, $acc6 1756 adc %rdx, $acc7 1757 mov $acc5, $acc1 1758 adc \$0, $acc3 1759 1760 sub \$-1, $acc4 # .Lpoly[0] 1761 mov $acc6, $acc2 1762 sbb $a_ptr, $acc5 # .Lpoly[1] 1763 sbb \$0, $acc6 # .Lpoly[2] 1764 mov $acc7, $t0 1765 sbb $t1, $acc7 # .Lpoly[3] 1766 sbb \$0, $acc3 1767 1768 cmovc $acc0, $acc4 1769 cmovc $acc1, $acc5 1770 mov $acc4, 8*0($r_ptr) 1771 cmovc $acc2, $acc6 1772 mov $acc5, 8*1($r_ptr) 1773 cmovc $t0, $acc7 1774 mov $acc6, 8*2($r_ptr) 1775 mov $acc7, 8*3($r_ptr) 1776 1777 ret 1778.cfi_endproc 1779.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq 1780___ 1781 1782if ($addx) { 1783$code.=<<___; 1784.type __ecp_nistz256_mul_montx,\@abi-omnipotent 1785.align 32 1786__ecp_nistz256_mul_montx: 1787.cfi_startproc 1788 ######################################################################## 1789 # Multiply by b[0] 1790 mulx $acc1, $acc0, $acc1 1791 mulx $acc2, $t0, $acc2 1792 mov \$32, $poly1 1793 xor $acc5, $acc5 # cf=0 1794 mulx $acc3, $t1, $acc3 1795 mov .Lpoly+8*3(%rip), $poly3 1796 adc $t0, $acc1 1797 mulx $acc4, $t0, $acc4 1798 mov $acc0, %rdx 1799 adc $t1, $acc2 1800 shlx $poly1,$acc0,$t1 1801 adc $t0, $acc3 1802 shrx $poly1,$acc0,$t0 1803 adc \$0, $acc4 1804 1805 ######################################################################## 1806 # First reduction step 1807 add $t1, $acc1 1808 adc $t0, $acc2 1809 1810 mulx $poly3, $t0, $t1 1811 mov 8*1($b_ptr), %rdx 1812 adc $t0, $acc3 1813 adc $t1, $acc4 1814 adc \$0, $acc5 1815 xor $acc0, $acc0 # $acc0=0,cf=0,of=0 1816 1817 ######################################################################## 1818 # Multiply by b[1] 1819 mulx 8*0+128($a_ptr), $t0, $t1 1820 adcx $t0, $acc1 1821 adox $t1, $acc2 1822 1823 mulx 8*1+128($a_ptr), $t0, $t1 1824 adcx $t0, $acc2 1825 adox $t1, $acc3 1826 1827 mulx 8*2+128($a_ptr), $t0, $t1 1828 adcx $t0, $acc3 1829 adox $t1, $acc4 1830 1831 mulx 
8*3+128($a_ptr), $t0, $t1 1832 mov $acc1, %rdx 1833 adcx $t0, $acc4 1834 shlx $poly1, $acc1, $t0 1835 adox $t1, $acc5 1836 shrx $poly1, $acc1, $t1 1837 1838 adcx $acc0, $acc5 1839 adox $acc0, $acc0 1840 adc \$0, $acc0 1841 1842 ######################################################################## 1843 # Second reduction step 1844 add $t0, $acc2 1845 adc $t1, $acc3 1846 1847 mulx $poly3, $t0, $t1 1848 mov 8*2($b_ptr), %rdx 1849 adc $t0, $acc4 1850 adc $t1, $acc5 1851 adc \$0, $acc0 1852 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 1853 1854 ######################################################################## 1855 # Multiply by b[2] 1856 mulx 8*0+128($a_ptr), $t0, $t1 1857 adcx $t0, $acc2 1858 adox $t1, $acc3 1859 1860 mulx 8*1+128($a_ptr), $t0, $t1 1861 adcx $t0, $acc3 1862 adox $t1, $acc4 1863 1864 mulx 8*2+128($a_ptr), $t0, $t1 1865 adcx $t0, $acc4 1866 adox $t1, $acc5 1867 1868 mulx 8*3+128($a_ptr), $t0, $t1 1869 mov $acc2, %rdx 1870 adcx $t0, $acc5 1871 shlx $poly1, $acc2, $t0 1872 adox $t1, $acc0 1873 shrx $poly1, $acc2, $t1 1874 1875 adcx $acc1, $acc0 1876 adox $acc1, $acc1 1877 adc \$0, $acc1 1878 1879 ######################################################################## 1880 # Third reduction step 1881 add $t0, $acc3 1882 adc $t1, $acc4 1883 1884 mulx $poly3, $t0, $t1 1885 mov 8*3($b_ptr), %rdx 1886 adc $t0, $acc5 1887 adc $t1, $acc0 1888 adc \$0, $acc1 1889 xor $acc2, $acc2 # $acc2=0,cf=0,of=0 1890 1891 ######################################################################## 1892 # Multiply by b[3] 1893 mulx 8*0+128($a_ptr), $t0, $t1 1894 adcx $t0, $acc3 1895 adox $t1, $acc4 1896 1897 mulx 8*1+128($a_ptr), $t0, $t1 1898 adcx $t0, $acc4 1899 adox $t1, $acc5 1900 1901 mulx 8*2+128($a_ptr), $t0, $t1 1902 adcx $t0, $acc5 1903 adox $t1, $acc0 1904 1905 mulx 8*3+128($a_ptr), $t0, $t1 1906 mov $acc3, %rdx 1907 adcx $t0, $acc0 1908 shlx $poly1, $acc3, $t0 1909 adox $t1, $acc1 1910 shrx $poly1, $acc3, $t1 1911 1912 adcx $acc2, $acc1 1913 adox $acc2, $acc2 1914 adc \$0, $acc2 1915 1916 ######################################################################## 1917 # Fourth reduction step 1918 add $t0, $acc4 1919 adc $t1, $acc5 1920 1921 mulx $poly3, $t0, $t1 1922 mov $acc4, $t2 1923 mov .Lpoly+8*1(%rip), $poly1 1924 adc $t0, $acc0 1925 mov $acc5, $t3 1926 adc $t1, $acc1 1927 adc \$0, $acc2 1928 1929 ######################################################################## 1930 # Branch-less conditional subtraction of P 1931 xor %eax, %eax 1932 mov $acc0, $t0 1933 sbb \$-1, $acc4 # .Lpoly[0] 1934 sbb $poly1, $acc5 # .Lpoly[1] 1935 sbb \$0, $acc0 # .Lpoly[2] 1936 mov $acc1, $t1 1937 sbb $poly3, $acc1 # .Lpoly[3] 1938 sbb \$0, $acc2 1939 1940 cmovc $t2, $acc4 1941 cmovc $t3, $acc5 1942 mov $acc4, 8*0($r_ptr) 1943 cmovc $t0, $acc0 1944 mov $acc5, 8*1($r_ptr) 1945 cmovc $t1, $acc1 1946 mov $acc0, 8*2($r_ptr) 1947 mov $acc1, 8*3($r_ptr) 1948 1949 ret 1950.cfi_endproc 1951.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx 1952 1953.type __ecp_nistz256_sqr_montx,\@abi-omnipotent 1954.align 32 1955__ecp_nistz256_sqr_montx: 1956.cfi_startproc 1957 mulx $acc6, $acc1, $acc2 # a[0]*a[1] 1958 mulx $acc7, $t0, $acc3 # a[0]*a[2] 1959 xor %eax, %eax 1960 adc $t0, $acc2 1961 mulx $acc0, $t1, $acc4 # a[0]*a[3] 1962 mov $acc6, %rdx 1963 adc $t1, $acc3 1964 adc \$0, $acc4 1965 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 1966 1967 ################################# 1968 mulx $acc7, $t0, $t1 # a[1]*a[2] 1969 adcx $t0, $acc3 1970 adox $t1, $acc4 1971 1972 mulx $acc0, $t0, $t1 # a[1]*a[3] 1973 mov $acc7, %rdx 1974 
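	# (the cross products a[i]*a[j], i<j, are collected first; they are
	#  doubled below before the diagonal squares a[i]*a[i] are added in)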
adcx $t0, $acc4 1975 adox $t1, $acc5 1976 adc \$0, $acc5 1977 1978 ################################# 1979 mulx $acc0, $t0, $acc6 # a[2]*a[3] 1980 mov 8*0+128($a_ptr), %rdx 1981 xor $acc7, $acc7 # $acc7=0,cf=0,of=0 1982 adcx $acc1, $acc1 # acc1:6<<1 1983 adox $t0, $acc5 1984 adcx $acc2, $acc2 1985 adox $acc7, $acc6 # of=0 1986 1987 mulx %rdx, $acc0, $t1 1988 mov 8*1+128($a_ptr), %rdx 1989 adcx $acc3, $acc3 1990 adox $t1, $acc1 1991 adcx $acc4, $acc4 1992 mulx %rdx, $t0, $t4 1993 mov 8*2+128($a_ptr), %rdx 1994 adcx $acc5, $acc5 1995 adox $t0, $acc2 1996 adcx $acc6, $acc6 1997 .byte 0x67 1998 mulx %rdx, $t0, $t1 1999 mov 8*3+128($a_ptr), %rdx 2000 adox $t4, $acc3 2001 adcx $acc7, $acc7 2002 adox $t0, $acc4 2003 mov \$32, $a_ptr 2004 adox $t1, $acc5 2005 .byte 0x67,0x67 2006 mulx %rdx, $t0, $t4 2007 mov .Lpoly+8*3(%rip), %rdx 2008 adox $t0, $acc6 2009 shlx $a_ptr, $acc0, $t0 2010 adox $t4, $acc7 2011 shrx $a_ptr, $acc0, $t4 2012 mov %rdx,$t1 2013 2014 # reduction step 1 2015 add $t0, $acc1 2016 adc $t4, $acc2 2017 2018 mulx $acc0, $t0, $acc0 2019 adc $t0, $acc3 2020 shlx $a_ptr, $acc1, $t0 2021 adc \$0, $acc0 2022 shrx $a_ptr, $acc1, $t4 2023 2024 # reduction step 2 2025 add $t0, $acc2 2026 adc $t4, $acc3 2027 2028 mulx $acc1, $t0, $acc1 2029 adc $t0, $acc0 2030 shlx $a_ptr, $acc2, $t0 2031 adc \$0, $acc1 2032 shrx $a_ptr, $acc2, $t4 2033 2034 # reduction step 3 2035 add $t0, $acc3 2036 adc $t4, $acc0 2037 2038 mulx $acc2, $t0, $acc2 2039 adc $t0, $acc1 2040 shlx $a_ptr, $acc3, $t0 2041 adc \$0, $acc2 2042 shrx $a_ptr, $acc3, $t4 2043 2044 # reduction step 4 2045 add $t0, $acc0 2046 adc $t4, $acc1 2047 2048 mulx $acc3, $t0, $acc3 2049 adc $t0, $acc2 2050 adc \$0, $acc3 2051 2052 xor $t3, $t3 2053 add $acc0, $acc4 # accumulate upper half 2054 mov .Lpoly+8*1(%rip), $a_ptr 2055 adc $acc1, $acc5 2056 mov $acc4, $acc0 2057 adc $acc2, $acc6 2058 adc $acc3, $acc7 2059 mov $acc5, $acc1 2060 adc \$0, $t3 2061 2062 sub \$-1, $acc4 # .Lpoly[0] 2063 mov $acc6, $acc2 2064 sbb $a_ptr, $acc5 # .Lpoly[1] 2065 sbb \$0, $acc6 # .Lpoly[2] 2066 mov $acc7, $acc3 2067 sbb $t1, $acc7 # .Lpoly[3] 2068 sbb \$0, $t3 2069 2070 cmovc $acc0, $acc4 2071 cmovc $acc1, $acc5 2072 mov $acc4, 8*0($r_ptr) 2073 cmovc $acc2, $acc6 2074 mov $acc5, 8*1($r_ptr) 2075 cmovc $acc3, $acc7 2076 mov $acc6, 8*2($r_ptr) 2077 mov $acc7, 8*3($r_ptr) 2078 2079 ret 2080.cfi_endproc 2081.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx 2082___ 2083} 2084} 2085{ 2086my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2087my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); 2088my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); 2089my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); 2090 2091$code.=<<___; 2092################################################################################ 2093# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); 2094.globl ecp_nistz256_select_w5 2095.type ecp_nistz256_select_w5,\@abi-omnipotent 2096.align 32 2097ecp_nistz256_select_w5: 2098.cfi_startproc 2099___ 2100$code.=<<___ if ($avx>1); 2101 leaq OPENSSL_ia32cap_P(%rip), %rax 2102 mov 8(%rax), %rax 2103 test \$`1<<5`, %eax 2104 jnz .Lavx2_select_w5 2105___ 2106$code.=<<___ if ($win64); 2107 lea -0x88(%rsp), %rax 2108.LSEH_begin_ecp_nistz256_select_w5: 2109 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 2110 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 2111 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 2112 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 
0(%rax) 2113 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) 2114 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 2115 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 2116 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 2117 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 2118 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 2119 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 2120___ 2121$code.=<<___; 2122 movdqa .LOne(%rip), $ONE 2123 movd $index, $INDEX 2124 2125 pxor $Ra, $Ra 2126 pxor $Rb, $Rb 2127 pxor $Rc, $Rc 2128 pxor $Rd, $Rd 2129 pxor $Re, $Re 2130 pxor $Rf, $Rf 2131 2132 movdqa $ONE, $M0 2133 pshufd \$0, $INDEX, $INDEX 2134 2135 mov \$16, %rax 2136.Lselect_loop_sse_w5: 2137 2138 movdqa $M0, $TMP0 2139 paddd $ONE, $M0 2140 pcmpeqd $INDEX, $TMP0 2141 2142 movdqa 16*0($in_t), $T0a 2143 movdqa 16*1($in_t), $T0b 2144 movdqa 16*2($in_t), $T0c 2145 movdqa 16*3($in_t), $T0d 2146 movdqa 16*4($in_t), $T0e 2147 movdqa 16*5($in_t), $T0f 2148 lea 16*6($in_t), $in_t 2149 2150 pand $TMP0, $T0a 2151 pand $TMP0, $T0b 2152 por $T0a, $Ra 2153 pand $TMP0, $T0c 2154 por $T0b, $Rb 2155 pand $TMP0, $T0d 2156 por $T0c, $Rc 2157 pand $TMP0, $T0e 2158 por $T0d, $Rd 2159 pand $TMP0, $T0f 2160 por $T0e, $Re 2161 por $T0f, $Rf 2162 2163 dec %rax 2164 jnz .Lselect_loop_sse_w5 2165 2166 movdqu $Ra, 16*0($val) 2167 movdqu $Rb, 16*1($val) 2168 movdqu $Rc, 16*2($val) 2169 movdqu $Rd, 16*3($val) 2170 movdqu $Re, 16*4($val) 2171 movdqu $Rf, 16*5($val) 2172___ 2173$code.=<<___ if ($win64); 2174 movaps (%rsp), %xmm6 2175 movaps 0x10(%rsp), %xmm7 2176 movaps 0x20(%rsp), %xmm8 2177 movaps 0x30(%rsp), %xmm9 2178 movaps 0x40(%rsp), %xmm10 2179 movaps 0x50(%rsp), %xmm11 2180 movaps 0x60(%rsp), %xmm12 2181 movaps 0x70(%rsp), %xmm13 2182 movaps 0x80(%rsp), %xmm14 2183 movaps 0x90(%rsp), %xmm15 2184 lea 0xa8(%rsp), %rsp 2185___ 2186$code.=<<___; 2187 ret 2188.cfi_endproc 2189.LSEH_end_ecp_nistz256_select_w5: 2190.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 2191 2192################################################################################ 2193# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); 2194.globl ecp_nistz256_select_w7 2195.type ecp_nistz256_select_w7,\@abi-omnipotent 2196.align 32 2197ecp_nistz256_select_w7: 2198.cfi_startproc 2199___ 2200$code.=<<___ if ($avx>1); 2201 leaq OPENSSL_ia32cap_P(%rip), %rax 2202 mov 8(%rax), %rax 2203 test \$`1<<5`, %eax 2204 jnz .Lavx2_select_w7 2205___ 2206$code.=<<___ if ($win64); 2207 lea -0x88(%rsp), %rax 2208.LSEH_begin_ecp_nistz256_select_w7: 2209 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 2210 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 2211 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 2212 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) 2213 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) 2214 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 2215 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 2216 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 2217 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 2218 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 2219 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 2220___ 2221$code.=<<___; 2222 movdqa .LOne(%rip), $M0 2223 movd $index, $INDEX 2224 2225 pxor $Ra, $Ra 2226 pxor $Rb, $Rb 2227 pxor $Rc, $Rc 2228 pxor $Rd, $Rd 2229 2230 movdqa $M0, $ONE 2231 pshufd \$0, $INDEX, $INDEX 2232 mov \$64, %rax 2233 2234.Lselect_loop_sse_w7: 2235 movdqa $M0, 
$TMP0 2236 paddd $ONE, $M0 2237 movdqa 16*0($in_t), $T0a 2238 movdqa 16*1($in_t), $T0b 2239 pcmpeqd $INDEX, $TMP0 2240 movdqa 16*2($in_t), $T0c 2241 movdqa 16*3($in_t), $T0d 2242 lea 16*4($in_t), $in_t 2243 2244 pand $TMP0, $T0a 2245 pand $TMP0, $T0b 2246 por $T0a, $Ra 2247 pand $TMP0, $T0c 2248 por $T0b, $Rb 2249 pand $TMP0, $T0d 2250 por $T0c, $Rc 2251 prefetcht0 255($in_t) 2252 por $T0d, $Rd 2253 2254 dec %rax 2255 jnz .Lselect_loop_sse_w7 2256 2257 movdqu $Ra, 16*0($val) 2258 movdqu $Rb, 16*1($val) 2259 movdqu $Rc, 16*2($val) 2260 movdqu $Rd, 16*3($val) 2261___ 2262$code.=<<___ if ($win64); 2263 movaps (%rsp), %xmm6 2264 movaps 0x10(%rsp), %xmm7 2265 movaps 0x20(%rsp), %xmm8 2266 movaps 0x30(%rsp), %xmm9 2267 movaps 0x40(%rsp), %xmm10 2268 movaps 0x50(%rsp), %xmm11 2269 movaps 0x60(%rsp), %xmm12 2270 movaps 0x70(%rsp), %xmm13 2271 movaps 0x80(%rsp), %xmm14 2272 movaps 0x90(%rsp), %xmm15 2273 lea 0xa8(%rsp), %rsp 2274___ 2275$code.=<<___; 2276 ret 2277.cfi_endproc 2278.LSEH_end_ecp_nistz256_select_w7: 2279.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 2280___ 2281} 2282if ($avx>1) { 2283my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2284my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); 2285my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); 2286my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); 2287 2288$code.=<<___; 2289################################################################################ 2290# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index); 2291.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent 2292.align 32 2293ecp_nistz256_avx2_select_w5: 2294.cfi_startproc 2295.Lavx2_select_w5: 2296 vzeroupper 2297___ 2298$code.=<<___ if ($win64); 2299 lea -0x88(%rsp), %rax 2300 mov %rsp,%r11 2301.LSEH_begin_ecp_nistz256_avx2_select_w5: 2302 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp 2303 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) 2304 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) 2305 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) 2306 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) 2307 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) 2308 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) 2309 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) 2310 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) 2311 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) 2312 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) 2313___ 2314$code.=<<___; 2315 vmovdqa .LTwo(%rip), $TWO 2316 2317 vpxor $Ra, $Ra, $Ra 2318 vpxor $Rb, $Rb, $Rb 2319 vpxor $Rc, $Rc, $Rc 2320 2321 vmovdqa .LOne(%rip), $M0 2322 vmovdqa .LTwo(%rip), $M1 2323 2324 vmovd $index, %xmm1 2325 vpermd $INDEX, $Ra, $INDEX 2326 2327 mov \$8, %rax 2328.Lselect_loop_avx2_w5: 2329 2330 vmovdqa 32*0($in_t), $T0a 2331 vmovdqa 32*1($in_t), $T0b 2332 vmovdqa 32*2($in_t), $T0c 2333 2334 vmovdqa 32*3($in_t), $T1a 2335 vmovdqa 32*4($in_t), $T1b 2336 vmovdqa 32*5($in_t), $T1c 2337 2338 vpcmpeqd $INDEX, $M0, $TMP0 2339 vpcmpeqd $INDEX, $M1, $TMP1 2340 2341 vpaddd $TWO, $M0, $M0 2342 vpaddd $TWO, $M1, $M1 2343 lea 32*6($in_t), $in_t 2344 2345 vpand $TMP0, $T0a, $T0a 2346 vpand $TMP0, $T0b, $T0b 2347 vpand $TMP0, $T0c, $T0c 2348 vpand $TMP1, $T1a, $T1a 2349 vpand $TMP1, $T1b, $T1b 2350 vpand $TMP1, $T1c, $T1c 2351 2352 vpxor $T0a, $Ra, $Ra 2353 vpxor $T0b, $Rb, $Rb 2354 vpxor $T0c, $Rc, $Rc 2355 vpxor $T1a, $Ra, $Ra 2356 vpxor $T1b, $Rb, $Rb 2357 vpxor $T1c, $Rc, $Rc 2358 
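	# All 16 table entries are read on every call; only the entry whose
	# position matches $INDEX survives the vpcmpeqd/vpand masking, so no
	# memory access depends on the secret index.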
2359 dec %rax 2360 jnz .Lselect_loop_avx2_w5 2361 2362 vmovdqu $Ra, 32*0($val) 2363 vmovdqu $Rb, 32*1($val) 2364 vmovdqu $Rc, 32*2($val) 2365 vzeroupper 2366___ 2367$code.=<<___ if ($win64); 2368 movaps (%rsp), %xmm6 2369 movaps 0x10(%rsp), %xmm7 2370 movaps 0x20(%rsp), %xmm8 2371 movaps 0x30(%rsp), %xmm9 2372 movaps 0x40(%rsp), %xmm10 2373 movaps 0x50(%rsp), %xmm11 2374 movaps 0x60(%rsp), %xmm12 2375 movaps 0x70(%rsp), %xmm13 2376 movaps 0x80(%rsp), %xmm14 2377 movaps 0x90(%rsp), %xmm15 2378 lea (%r11), %rsp 2379___ 2380$code.=<<___; 2381 ret 2382.cfi_endproc 2383.LSEH_end_ecp_nistz256_avx2_select_w5: 2384.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 2385___ 2386} 2387if ($avx>1) { 2388my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2389my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); 2390my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); 2391my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); 2392my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); 2393 2394$code.=<<___; 2395 2396################################################################################ 2397# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index); 2398.globl ecp_nistz256_avx2_select_w7 2399.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent 2400.align 32 2401ecp_nistz256_avx2_select_w7: 2402.cfi_startproc 2403.Lavx2_select_w7: 2404 vzeroupper 2405___ 2406$code.=<<___ if ($win64); 2407 mov %rsp,%r11 2408 lea -0x88(%rsp), %rax 2409.LSEH_begin_ecp_nistz256_avx2_select_w7: 2410 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp 2411 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) 2412 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) 2413 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) 2414 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) 2415 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) 2416 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) 2417 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) 2418 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) 2419 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) 2420 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) 2421___ 2422$code.=<<___; 2423 vmovdqa .LThree(%rip), $THREE 2424 2425 vpxor $Ra, $Ra, $Ra 2426 vpxor $Rb, $Rb, $Rb 2427 2428 vmovdqa .LOne(%rip), $M0 2429 vmovdqa .LTwo(%rip), $M1 2430 vmovdqa .LThree(%rip), $M2 2431 2432 vmovd $index, %xmm1 2433 vpermd $INDEX, $Ra, $INDEX 2434 # Skip index = 0, because it is implicitly the point at infinity 2435 2436 mov \$21, %rax 2437.Lselect_loop_avx2_w7: 2438 2439 vmovdqa 32*0($in_t), $T0a 2440 vmovdqa 32*1($in_t), $T0b 2441 2442 vmovdqa 32*2($in_t), $T1a 2443 vmovdqa 32*3($in_t), $T1b 2444 2445 vmovdqa 32*4($in_t), $T2a 2446 vmovdqa 32*5($in_t), $T2b 2447 2448 vpcmpeqd $INDEX, $M0, $TMP0 2449 vpcmpeqd $INDEX, $M1, $TMP1 2450 vpcmpeqd $INDEX, $M2, $TMP2 2451 2452 vpaddd $THREE, $M0, $M0 2453 vpaddd $THREE, $M1, $M1 2454 vpaddd $THREE, $M2, $M2 2455 lea 32*6($in_t), $in_t 2456 2457 vpand $TMP0, $T0a, $T0a 2458 vpand $TMP0, $T0b, $T0b 2459 vpand $TMP1, $T1a, $T1a 2460 vpand $TMP1, $T1b, $T1b 2461 vpand $TMP2, $T2a, $T2a 2462 vpand $TMP2, $T2b, $T2b 2463 2464 vpxor $T0a, $Ra, $Ra 2465 vpxor $T0b, $Rb, $Rb 2466 vpxor $T1a, $Ra, $Ra 2467 vpxor $T1b, $Rb, $Rb 2468 vpxor $T2a, $Ra, $Ra 2469 vpxor $T2b, $Rb, $Rb 2470 2471 dec %rax 2472 jnz .Lselect_loop_avx2_w7 2473 2474 2475 vmovdqa 32*0($in_t), $T0a 2476 vmovdqa 32*1($in_t), $T0b 2477 2478 vpcmpeqd $INDEX, $M0, $TMP0 2479 2480 
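	# The loop above compared mask values 1..63 against $INDEX (21
	# iterations, three comparisons each); $M0 is now 64, so the compare
	# above and the vpand/vpxor below mask in the last of the 64 entries.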
	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb

	vmovdqu	$Ra, 32*0($val)
	vmovdqu	$Rb, 32*1($val)
	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	(%r11), %rsp
___
$code.=<<___;
	ret
.cfi_endproc
.LSEH_end_ecp_nistz256_avx2_select_w7:
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
} else {
$code.=<<___;
.globl	ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,\@function,3
.align	32
ecp_nistz256_avx2_select_w7:
	.byte	0x0f,0x0b	# ud2
	ret
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
___
}
{{{
########################################################################
# This block implements the higher-level point_double, point_add and
# point_add_affine operations. The key to performance here is to let the
# out-of-order execution logic overlap computations from the next step
# with tail processing from the current step. By using a tailored calling
# sequence we minimize inter-step overhead and give the processor a
# better shot at overlapping operations...
#
# You will notice that the input data is copied to the stack. Trouble is
# that there are no registers to spare for holding the original pointers,
# and reloading them would create undesired dependencies on the
# effective-address calculation paths. In other words, copying to the
# stack is a deliberate choice made in favour of the out-of-order
# execution logic.
#						<appro@openssl.org>

my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
my ($poly1,$poly3)=($acc6,$acc7);

sub load_for_mul () {
my ($a,$b,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;

"	mov	$b, $src0
	lea	$b, $b_ptr
	mov	8*0+$a, $acc1
	mov	8*1+$a, $acc2
	lea	$bias+$a, $a_ptr
	mov	8*2+$a, $acc3
	mov	8*3+$a, $acc4"
}

sub load_for_sqr () {
my ($a,$src0) = @_;
my $bias = $src0 eq "%rax" ?
0 : -128; 2557 2558" mov 8*0+$a, $src0 2559 mov 8*1+$a, $acc6 2560 lea $bias+$a, $a_ptr 2561 mov 8*2+$a, $acc7 2562 mov 8*3+$a, $acc0" 2563} 2564 2565 { 2566######################################################################## 2567# operate in 4-5-0-1 "name space" that matches multiplication output 2568# 2569my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 2570 2571$code.=<<___; 2572.type __ecp_nistz256_add_toq,\@abi-omnipotent 2573.align 32 2574__ecp_nistz256_add_toq: 2575.cfi_startproc 2576 xor $t4,$t4 2577 add 8*0($b_ptr), $a0 2578 adc 8*1($b_ptr), $a1 2579 mov $a0, $t0 2580 adc 8*2($b_ptr), $a2 2581 adc 8*3($b_ptr), $a3 2582 mov $a1, $t1 2583 adc \$0, $t4 2584 2585 sub \$-1, $a0 2586 mov $a2, $t2 2587 sbb $poly1, $a1 2588 sbb \$0, $a2 2589 mov $a3, $t3 2590 sbb $poly3, $a3 2591 sbb \$0, $t4 2592 2593 cmovc $t0, $a0 2594 cmovc $t1, $a1 2595 mov $a0, 8*0($r_ptr) 2596 cmovc $t2, $a2 2597 mov $a1, 8*1($r_ptr) 2598 cmovc $t3, $a3 2599 mov $a2, 8*2($r_ptr) 2600 mov $a3, 8*3($r_ptr) 2601 2602 ret 2603.cfi_endproc 2604.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq 2605 2606.type __ecp_nistz256_sub_fromq,\@abi-omnipotent 2607.align 32 2608__ecp_nistz256_sub_fromq: 2609.cfi_startproc 2610 sub 8*0($b_ptr), $a0 2611 sbb 8*1($b_ptr), $a1 2612 mov $a0, $t0 2613 sbb 8*2($b_ptr), $a2 2614 sbb 8*3($b_ptr), $a3 2615 mov $a1, $t1 2616 sbb $t4, $t4 2617 2618 add \$-1, $a0 2619 mov $a2, $t2 2620 adc $poly1, $a1 2621 adc \$0, $a2 2622 mov $a3, $t3 2623 adc $poly3, $a3 2624 test $t4, $t4 2625 2626 cmovz $t0, $a0 2627 cmovz $t1, $a1 2628 mov $a0, 8*0($r_ptr) 2629 cmovz $t2, $a2 2630 mov $a1, 8*1($r_ptr) 2631 cmovz $t3, $a3 2632 mov $a2, 8*2($r_ptr) 2633 mov $a3, 8*3($r_ptr) 2634 2635 ret 2636.cfi_endproc 2637.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq 2638 2639.type __ecp_nistz256_subq,\@abi-omnipotent 2640.align 32 2641__ecp_nistz256_subq: 2642.cfi_startproc 2643 sub $a0, $t0 2644 sbb $a1, $t1 2645 mov $t0, $a0 2646 sbb $a2, $t2 2647 sbb $a3, $t3 2648 mov $t1, $a1 2649 sbb $t4, $t4 2650 2651 add \$-1, $t0 2652 mov $t2, $a2 2653 adc $poly1, $t1 2654 adc \$0, $t2 2655 mov $t3, $a3 2656 adc $poly3, $t3 2657 test $t4, $t4 2658 2659 cmovnz $t0, $a0 2660 cmovnz $t1, $a1 2661 cmovnz $t2, $a2 2662 cmovnz $t3, $a3 2663 2664 ret 2665.cfi_endproc 2666.size __ecp_nistz256_subq,.-__ecp_nistz256_subq 2667 2668.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent 2669.align 32 2670__ecp_nistz256_mul_by_2q: 2671.cfi_startproc 2672 xor $t4, $t4 2673 add $a0, $a0 # a0:a3+a0:a3 2674 adc $a1, $a1 2675 mov $a0, $t0 2676 adc $a2, $a2 2677 adc $a3, $a3 2678 mov $a1, $t1 2679 adc \$0, $t4 2680 2681 sub \$-1, $a0 2682 mov $a2, $t2 2683 sbb $poly1, $a1 2684 sbb \$0, $a2 2685 mov $a3, $t3 2686 sbb $poly3, $a3 2687 sbb \$0, $t4 2688 2689 cmovc $t0, $a0 2690 cmovc $t1, $a1 2691 mov $a0, 8*0($r_ptr) 2692 cmovc $t2, $a2 2693 mov $a1, 8*1($r_ptr) 2694 cmovc $t3, $a3 2695 mov $a2, 8*2($r_ptr) 2696 mov $a3, 8*3($r_ptr) 2697 2698 ret 2699.cfi_endproc 2700.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q 2701___ 2702 } 2703sub gen_double () { 2704 my $x = shift; 2705 my ($src0,$sfx,$bias); 2706 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); 2707 2708 if ($x ne "x") { 2709 $src0 = "%rax"; 2710 $sfx = ""; 2711 $bias = 0; 2712 2713$code.=<<___; 2714.globl ecp_nistz256_point_double 2715.type ecp_nistz256_point_double,\@function,2 2716.align 32 2717ecp_nistz256_point_double: 2718.cfi_startproc 2719___ 2720$code.=<<___ if ($addx); 2721 leaq OPENSSL_ia32cap_P(%rip), %rcx 2722 mov 8(%rcx), %rcx 2723 and 
\$0x80100, %ecx 2724 cmp \$0x80100, %ecx 2725 je .Lpoint_doublex 2726___ 2727 } else { 2728 $src0 = "%rdx"; 2729 $sfx = "x"; 2730 $bias = 128; 2731 2732$code.=<<___; 2733.type ecp_nistz256_point_doublex,\@function,2 2734.align 32 2735ecp_nistz256_point_doublex: 2736.cfi_startproc 2737.Lpoint_doublex: 2738___ 2739 } 2740$code.=<<___; 2741 push %rbp 2742.cfi_push %rbp 2743 push %rbx 2744.cfi_push %rbx 2745 push %r12 2746.cfi_push %r12 2747 push %r13 2748.cfi_push %r13 2749 push %r14 2750.cfi_push %r14 2751 push %r15 2752.cfi_push %r15 2753 sub \$32*5+8, %rsp 2754.cfi_adjust_cfa_offset 32*5+8 2755.Lpoint_double${x}_body: 2756 2757.Lpoint_double_shortcut$x: 2758 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x 2759 mov $a_ptr, $b_ptr # backup copy 2760 movdqu 0x10($a_ptr), %xmm1 2761 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order 2762 mov 0x20+8*1($a_ptr), $acc5 2763 mov 0x20+8*2($a_ptr), $acc0 2764 mov 0x20+8*3($a_ptr), $acc1 2765 mov .Lpoly+8*1(%rip), $poly1 2766 mov .Lpoly+8*3(%rip), $poly3 2767 movdqa %xmm0, $in_x(%rsp) 2768 movdqa %xmm1, $in_x+0x10(%rsp) 2769 lea 0x20($r_ptr), $acc2 2770 lea 0x40($r_ptr), $acc3 2771 movq $r_ptr, %xmm0 2772 movq $acc2, %xmm1 2773 movq $acc3, %xmm2 2774 2775 lea $S(%rsp), $r_ptr 2776 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); 2777 2778 mov 0x40+8*0($a_ptr), $src0 2779 mov 0x40+8*1($a_ptr), $acc6 2780 mov 0x40+8*2($a_ptr), $acc7 2781 mov 0x40+8*3($a_ptr), $acc0 2782 lea 0x40-$bias($a_ptr), $a_ptr 2783 lea $Zsqr(%rsp), $r_ptr 2784 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); 2785 2786 `&load_for_sqr("$S(%rsp)", "$src0")` 2787 lea $S(%rsp), $r_ptr 2788 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); 2789 2790 mov 0x20($b_ptr), $src0 # $b_ptr is still valid 2791 mov 0x40+8*0($b_ptr), $acc1 2792 mov 0x40+8*1($b_ptr), $acc2 2793 mov 0x40+8*2($b_ptr), $acc3 2794 mov 0x40+8*3($b_ptr), $acc4 2795 lea 0x40-$bias($b_ptr), $a_ptr 2796 lea 0x20($b_ptr), $b_ptr 2797 movq %xmm2, $r_ptr 2798 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); 2799 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); 2800 2801 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 2802 mov $in_x+8*1(%rsp), $acc5 2803 lea $Zsqr(%rsp), $b_ptr 2804 mov $in_x+8*2(%rsp), $acc0 2805 mov $in_x+8*3(%rsp), $acc1 2806 lea $M(%rsp), $r_ptr 2807 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); 2808 2809 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 2810 mov $in_x+8*1(%rsp), $acc5 2811 lea $Zsqr(%rsp), $b_ptr 2812 mov $in_x+8*2(%rsp), $acc0 2813 mov $in_x+8*3(%rsp), $acc1 2814 lea $Zsqr(%rsp), $r_ptr 2815 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); 2816 2817 `&load_for_sqr("$S(%rsp)", "$src0")` 2818 movq %xmm1, $r_ptr 2819 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); 2820___ 2821{ 2822######## ecp_nistz256_div_by_2(res_y, res_y); ########################## 2823# operate in 4-5-6-7 "name space" that matches squaring output 2824# 2825my ($poly1,$poly3)=($a_ptr,$t1); 2826my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); 2827 2828$code.=<<___; 2829 xor $t4, $t4 2830 mov $a0, $t0 2831 add \$-1, $a0 2832 mov $a1, $t1 2833 adc $poly1, $a1 2834 mov $a2, $t2 2835 adc \$0, $a2 2836 mov $a3, $t3 2837 adc $poly3, $a3 2838 adc \$0, $t4 2839 xor $a_ptr, $a_ptr # borrow $a_ptr 2840 test \$1, $t0 2841 2842 cmovz $t0, $a0 2843 cmovz $t1, $a1 2844 cmovz $t2, $a2 2845 cmovz $t3, $a3 2846 cmovz $a_ptr, $t4 2847 2848 mov $a1, $t0 # a0:a3>>1 2849 shr \$1, $a0 2850 shl \$63, $t0 2851 mov $a2, 
$t1 2852 shr \$1, $a1 2853 or $t0, $a0 2854 shl \$63, $t1 2855 mov $a3, $t2 2856 shr \$1, $a2 2857 or $t1, $a1 2858 shl \$63, $t2 2859 mov $a0, 8*0($r_ptr) 2860 shr \$1, $a3 2861 mov $a1, 8*1($r_ptr) 2862 shl \$63, $t4 2863 or $t2, $a2 2864 or $t4, $a3 2865 mov $a2, 8*2($r_ptr) 2866 mov $a3, 8*3($r_ptr) 2867___ 2868} 2869$code.=<<___; 2870 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` 2871 lea $M(%rsp), $r_ptr 2872 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); 2873 2874 lea $tmp0(%rsp), $r_ptr 2875 call __ecp_nistz256_mul_by_2$x 2876 2877 lea $M(%rsp), $b_ptr 2878 lea $M(%rsp), $r_ptr 2879 call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); 2880 2881 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` 2882 lea $S(%rsp), $r_ptr 2883 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); 2884 2885 lea $tmp0(%rsp), $r_ptr 2886 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); 2887 2888 `&load_for_sqr("$M(%rsp)", "$src0")` 2889 movq %xmm0, $r_ptr 2890 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); 2891 2892 lea $tmp0(%rsp), $b_ptr 2893 mov $acc6, $acc0 # harmonize sqr output and sub input 2894 mov $acc7, $acc1 2895 mov $a_ptr, $poly1 2896 mov $t1, $poly3 2897 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); 2898 2899 mov $S+8*0(%rsp), $t0 2900 mov $S+8*1(%rsp), $t1 2901 mov $S+8*2(%rsp), $t2 2902 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order 2903 lea $S(%rsp), $r_ptr 2904 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); 2905 2906 mov $M(%rsp), $src0 2907 lea $M(%rsp), $b_ptr 2908 mov $acc4, $acc6 # harmonize sub output and mul input 2909 xor %ecx, %ecx 2910 mov $acc4, $S+8*0(%rsp) # have to save:-( 2911 mov $acc5, $acc2 2912 mov $acc5, $S+8*1(%rsp) 2913 cmovz $acc0, $acc3 2914 mov $acc0, $S+8*2(%rsp) 2915 lea $S-$bias(%rsp), $a_ptr 2916 cmovz $acc1, $acc4 2917 mov $acc1, $S+8*3(%rsp) 2918 mov $acc6, $acc1 2919 lea $S(%rsp), $r_ptr 2920 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); 2921 2922 movq %xmm1, $b_ptr 2923 movq %xmm1, $r_ptr 2924 call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); 2925 2926 lea 32*5+56(%rsp), %rsi 2927.cfi_def_cfa %rsi,8 2928 mov -48(%rsi),%r15 2929.cfi_restore %r15 2930 mov -40(%rsi),%r14 2931.cfi_restore %r14 2932 mov -32(%rsi),%r13 2933.cfi_restore %r13 2934 mov -24(%rsi),%r12 2935.cfi_restore %r12 2936 mov -16(%rsi),%rbx 2937.cfi_restore %rbx 2938 mov -8(%rsi),%rbp 2939.cfi_restore %rbp 2940 lea (%rsi),%rsp 2941.cfi_def_cfa_register %rsp 2942.Lpoint_double${x}_epilogue: 2943 ret 2944.cfi_endproc 2945.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx 2946___ 2947} 2948&gen_double("q"); 2949 2950sub gen_add () { 2951 my $x = shift; 2952 my ($src0,$sfx,$bias); 2953 my ($H,$Hsqr,$R,$Rsqr,$Hcub, 2954 $U1,$U2,$S1,$S2, 2955 $res_x,$res_y,$res_z, 2956 $in1_x,$in1_y,$in1_z, 2957 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); 2958 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 2959 2960 if ($x ne "x") { 2961 $src0 = "%rax"; 2962 $sfx = ""; 2963 $bias = 0; 2964 2965$code.=<<___; 2966.globl ecp_nistz256_point_add 2967.type ecp_nistz256_point_add,\@function,3 2968.align 32 2969ecp_nistz256_point_add: 2970.cfi_startproc 2971___ 2972$code.=<<___ if ($addx); 2973 leaq OPENSSL_ia32cap_P(%rip), %rcx 2974 mov 8(%rcx), %rcx 2975 and \$0x80100, %ecx 2976 cmp \$0x80100, %ecx 2977 je .Lpoint_addx 2978___ 2979 } else { 2980 $src0 = "%rdx"; 2981 $sfx = "x"; 2982 $bias = 128; 2983 2984$code.=<<___; 2985.type ecp_nistz256_point_addx,\@function,3 2986.align 32 2987ecp_nistz256_point_addx: 2988.cfi_startproc 
2989.Lpoint_addx: 2990___ 2991 } 2992$code.=<<___; 2993 push %rbp 2994.cfi_push %rbp 2995 push %rbx 2996.cfi_push %rbx 2997 push %r12 2998.cfi_push %r12 2999 push %r13 3000.cfi_push %r13 3001 push %r14 3002.cfi_push %r14 3003 push %r15 3004.cfi_push %r15 3005 sub \$32*18+8, %rsp 3006.cfi_adjust_cfa_offset 32*18+8 3007.Lpoint_add${x}_body: 3008 3009 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 3010 movdqu 0x10($a_ptr), %xmm1 3011 movdqu 0x20($a_ptr), %xmm2 3012 movdqu 0x30($a_ptr), %xmm3 3013 movdqu 0x40($a_ptr), %xmm4 3014 movdqu 0x50($a_ptr), %xmm5 3015 mov $a_ptr, $b_ptr # reassign 3016 mov $b_org, $a_ptr # reassign 3017 movdqa %xmm0, $in1_x(%rsp) 3018 movdqa %xmm1, $in1_x+0x10(%rsp) 3019 movdqa %xmm2, $in1_y(%rsp) 3020 movdqa %xmm3, $in1_y+0x10(%rsp) 3021 movdqa %xmm4, $in1_z(%rsp) 3022 movdqa %xmm5, $in1_z+0x10(%rsp) 3023 por %xmm4, %xmm5 3024 3025 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr 3026 pshufd \$0xb1, %xmm5, %xmm3 3027 movdqu 0x10($a_ptr), %xmm1 3028 movdqu 0x20($a_ptr), %xmm2 3029 por %xmm3, %xmm5 3030 movdqu 0x30($a_ptr), %xmm3 3031 mov 0x40+8*0($a_ptr), $src0 # load original in2_z 3032 mov 0x40+8*1($a_ptr), $acc6 3033 mov 0x40+8*2($a_ptr), $acc7 3034 mov 0x40+8*3($a_ptr), $acc0 3035 movdqa %xmm0, $in2_x(%rsp) 3036 pshufd \$0x1e, %xmm5, %xmm4 3037 movdqa %xmm1, $in2_x+0x10(%rsp) 3038 movdqu 0x40($a_ptr),%xmm0 # in2_z again 3039 movdqu 0x50($a_ptr),%xmm1 3040 movdqa %xmm2, $in2_y(%rsp) 3041 movdqa %xmm3, $in2_y+0x10(%rsp) 3042 por %xmm4, %xmm5 3043 pxor %xmm4, %xmm4 3044 por %xmm0, %xmm1 3045 movq $r_ptr, %xmm0 # save $r_ptr 3046 3047 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 3048 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy 3049 mov $acc6, $in2_z+8*1(%rsp) 3050 mov $acc7, $in2_z+8*2(%rsp) 3051 mov $acc0, $in2_z+8*3(%rsp) 3052 lea $Z2sqr(%rsp), $r_ptr # Z2^2 3053 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); 3054 3055 pcmpeqd %xmm4, %xmm5 3056 pshufd \$0xb1, %xmm1, %xmm4 3057 por %xmm1, %xmm4 3058 pshufd \$0, %xmm5, %xmm5 # in1infty 3059 pshufd \$0x1e, %xmm4, %xmm3 3060 por %xmm3, %xmm4 3061 pxor %xmm3, %xmm3 3062 pcmpeqd %xmm3, %xmm4 3063 pshufd \$0, %xmm4, %xmm4 # in2infty 3064 mov 0x40+8*0($b_ptr), $src0 # load original in1_z 3065 mov 0x40+8*1($b_ptr), $acc6 3066 mov 0x40+8*2($b_ptr), $acc7 3067 mov 0x40+8*3($b_ptr), $acc0 3068 movq $b_ptr, %xmm1 3069 3070 lea 0x40-$bias($b_ptr), $a_ptr 3071 lea $Z1sqr(%rsp), $r_ptr # Z1^2 3072 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 3073 3074 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` 3075 lea $S1(%rsp), $r_ptr # S1 = Z2^3 3076 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); 3077 3078 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 3079 lea $S2(%rsp), $r_ptr # S2 = Z1^3 3080 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 3081 3082 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` 3083 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 3084 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); 3085 3086 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 3087 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 3088 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 3089 3090 lea $S1(%rsp), $b_ptr 3091 lea $R(%rsp), $r_ptr # R = S2 - S1 3092 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); 3093 3094 or $acc5, $acc4 # see if result is zero 3095 movdqa %xmm4, %xmm2 3096 or $acc0, $acc4 3097 or $acc1, $acc4 3098 por %xmm5, %xmm2 # in1infty || in2infty 3099 movq $acc4, %xmm3 3100 3101 
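	# %xmm3 now holds the OR of all limbs of R = S2 - S1: it is zero
	# iff S1 == S2. It is examined below together with H = U2 - U1 to
	# sort out the doubling and inverse (A == -B) cases.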
`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` 3102 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 3103 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); 3104 3105 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` 3106 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 3107 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); 3108 3109 lea $U1(%rsp), $b_ptr 3110 lea $H(%rsp), $r_ptr # H = U2 - U1 3111 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); 3112 3113 or $acc5, $acc4 # see if result is zero 3114 or $acc0, $acc4 3115 or $acc1, $acc4 # !is_equal(U1, U2) 3116 3117 movq %xmm2, $acc0 3118 movq %xmm3, $acc1 3119 or $acc0, $acc4 3120 .byte 0x3e # predict taken 3121 jnz .Ladd_proceed$x # !is_equal(U1, U2) || in1infty || in2infty 3122 3123 # We now know A = B or A = -B and neither is infinity. Compare the 3124 # y-coordinates via S1 and S2. 3125 test $acc1, $acc1 3126 jz .Ladd_double$x # is_equal(S1, S2) 3127 3128 # A = -B, so the result is infinity. 3129 # 3130 # TODO(davidben): Does .Ladd_proceed handle this case? It seems to, in 3131 # which case we should eliminate this special-case and simplify the 3132 # timing analysis. 3133 movq %xmm0, $r_ptr # restore $r_ptr 3134 pxor %xmm0, %xmm0 3135 movdqu %xmm0, 0x00($r_ptr) 3136 movdqu %xmm0, 0x10($r_ptr) 3137 movdqu %xmm0, 0x20($r_ptr) 3138 movdqu %xmm0, 0x30($r_ptr) 3139 movdqu %xmm0, 0x40($r_ptr) 3140 movdqu %xmm0, 0x50($r_ptr) 3141 jmp .Ladd_done$x 3142 3143.align 32 3144.Ladd_double$x: 3145 movq %xmm1, $a_ptr # restore $a_ptr 3146 movq %xmm0, $r_ptr # restore $r_ptr 3147 add \$`32*(18-5)`, %rsp # difference in frame sizes 3148.cfi_adjust_cfa_offset `-32*(18-5)` 3149 jmp .Lpoint_double_shortcut$x 3150.cfi_adjust_cfa_offset `32*(18-5)` 3151 3152.align 32 3153.Ladd_proceed$x: 3154 `&load_for_sqr("$R(%rsp)", "$src0")` 3155 lea $Rsqr(%rsp), $r_ptr # R^2 3156 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 3157 3158 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 3159 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3160 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 3161 3162 `&load_for_sqr("$H(%rsp)", "$src0")` 3163 lea $Hsqr(%rsp), $r_ptr # H^2 3164 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 3165 3166 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` 3167 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3168 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); 3169 3170 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` 3171 lea $Hcub(%rsp), $r_ptr # H^3 3172 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 3173 3174 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` 3175 lea $U2(%rsp), $r_ptr # U1*H^2 3176 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); 3177___ 3178{ 3179####################################################################### 3180# operate in 4-5-0-1 "name space" that matches multiplication output 3181# 3182my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3183my ($poly1, $poly3)=($acc6,$acc7); 3184 3185$code.=<<___; 3186 #lea $U2(%rsp), $a_ptr 3187 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 3188 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 3189 3190 xor $t4, $t4 3191 add $acc0, $acc0 # a0:a3+a0:a3 3192 lea $Rsqr(%rsp), $a_ptr 3193 adc $acc1, $acc1 3194 mov $acc0, $t0 3195 adc $acc2, $acc2 3196 adc $acc3, $acc3 3197 mov $acc1, $t1 3198 adc \$0, $t4 3199 3200 sub \$-1, $acc0 3201 mov $acc2, $t2 3202 sbb $poly1, $acc1 3203 sbb \$0, $acc2 3204 mov $acc3, $t3 3205 sbb $poly3, $acc3 
3206 sbb \$0, $t4 3207 3208 cmovc $t0, $acc0 3209 mov 8*0($a_ptr), $t0 3210 cmovc $t1, $acc1 3211 mov 8*1($a_ptr), $t1 3212 cmovc $t2, $acc2 3213 mov 8*2($a_ptr), $t2 3214 cmovc $t3, $acc3 3215 mov 8*3($a_ptr), $t3 3216 3217 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 3218 3219 lea $Hcub(%rsp), $b_ptr 3220 lea $res_x(%rsp), $r_ptr 3221 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 3222 3223 mov $U2+8*0(%rsp), $t0 3224 mov $U2+8*1(%rsp), $t1 3225 mov $U2+8*2(%rsp), $t2 3226 mov $U2+8*3(%rsp), $t3 3227 lea $res_y(%rsp), $r_ptr 3228 3229 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); 3230 3231 mov $acc0, 8*0($r_ptr) # save the result, as 3232 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 3233 mov $acc2, 8*2($r_ptr) 3234 mov $acc3, 8*3($r_ptr) 3235___ 3236} 3237$code.=<<___; 3238 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` 3239 lea $S2(%rsp), $r_ptr 3240 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); 3241 3242 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` 3243 lea $res_y(%rsp), $r_ptr 3244 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); 3245 3246 lea $S2(%rsp), $b_ptr 3247 lea $res_y(%rsp), $r_ptr 3248 call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); 3249 3250 movq %xmm0, $r_ptr # restore $r_ptr 3251 3252 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); 3253 movdqa %xmm5, %xmm1 3254 pandn $res_z(%rsp), %xmm0 3255 movdqa %xmm5, %xmm2 3256 pandn $res_z+0x10(%rsp), %xmm1 3257 movdqa %xmm5, %xmm3 3258 pand $in2_z(%rsp), %xmm2 3259 pand $in2_z+0x10(%rsp), %xmm3 3260 por %xmm0, %xmm2 3261 por %xmm1, %xmm3 3262 3263 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 3264 movdqa %xmm4, %xmm1 3265 pandn %xmm2, %xmm0 3266 movdqa %xmm4, %xmm2 3267 pandn %xmm3, %xmm1 3268 movdqa %xmm4, %xmm3 3269 pand $in1_z(%rsp), %xmm2 3270 pand $in1_z+0x10(%rsp), %xmm3 3271 por %xmm0, %xmm2 3272 por %xmm1, %xmm3 3273 movdqu %xmm2, 0x40($r_ptr) 3274 movdqu %xmm3, 0x50($r_ptr) 3275 3276 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 3277 movdqa %xmm5, %xmm1 3278 pandn $res_x(%rsp), %xmm0 3279 movdqa %xmm5, %xmm2 3280 pandn $res_x+0x10(%rsp), %xmm1 3281 movdqa %xmm5, %xmm3 3282 pand $in2_x(%rsp), %xmm2 3283 pand $in2_x+0x10(%rsp), %xmm3 3284 por %xmm0, %xmm2 3285 por %xmm1, %xmm3 3286 3287 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 3288 movdqa %xmm4, %xmm1 3289 pandn %xmm2, %xmm0 3290 movdqa %xmm4, %xmm2 3291 pandn %xmm3, %xmm1 3292 movdqa %xmm4, %xmm3 3293 pand $in1_x(%rsp), %xmm2 3294 pand $in1_x+0x10(%rsp), %xmm3 3295 por %xmm0, %xmm2 3296 por %xmm1, %xmm3 3297 movdqu %xmm2, 0x00($r_ptr) 3298 movdqu %xmm3, 0x10($r_ptr) 3299 3300 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 3301 movdqa %xmm5, %xmm1 3302 pandn $res_y(%rsp), %xmm0 3303 movdqa %xmm5, %xmm2 3304 pandn $res_y+0x10(%rsp), %xmm1 3305 movdqa %xmm5, %xmm3 3306 pand $in2_y(%rsp), %xmm2 3307 pand $in2_y+0x10(%rsp), %xmm3 3308 por %xmm0, %xmm2 3309 por %xmm1, %xmm3 3310 3311 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); 3312 movdqa %xmm4, %xmm1 3313 pandn %xmm2, %xmm0 3314 movdqa %xmm4, %xmm2 3315 pandn %xmm3, %xmm1 3316 movdqa %xmm4, %xmm3 3317 pand $in1_y(%rsp), %xmm2 3318 pand $in1_y+0x10(%rsp), %xmm3 3319 por %xmm0, %xmm2 3320 por %xmm1, %xmm3 3321 movdqu %xmm2, 0x20($r_ptr) 3322 movdqu %xmm3, 0x30($r_ptr) 3323 3324.Ladd_done$x: 3325 lea 32*18+56(%rsp), %rsi 3326.cfi_def_cfa %rsi,8 3327 mov -48(%rsi),%r15 3328.cfi_restore %r15 3329 mov -40(%rsi),%r14 
3330.cfi_restore %r14 3331 mov -32(%rsi),%r13 3332.cfi_restore %r13 3333 mov -24(%rsi),%r12 3334.cfi_restore %r12 3335 mov -16(%rsi),%rbx 3336.cfi_restore %rbx 3337 mov -8(%rsi),%rbp 3338.cfi_restore %rbp 3339 lea (%rsi),%rsp 3340.cfi_def_cfa_register %rsp 3341.Lpoint_add${x}_epilogue: 3342 ret 3343.cfi_endproc 3344.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx 3345___ 3346} 3347&gen_add("q"); 3348 3349sub gen_add_affine () { 3350 my $x = shift; 3351 my ($src0,$sfx,$bias); 3352 my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, 3353 $res_x,$res_y,$res_z, 3354 $in1_x,$in1_y,$in1_z, 3355 $in2_x,$in2_y)=map(32*$_,(0..14)); 3356 my $Z1sqr = $S2; 3357 3358 if ($x ne "x") { 3359 $src0 = "%rax"; 3360 $sfx = ""; 3361 $bias = 0; 3362 3363$code.=<<___; 3364.globl ecp_nistz256_point_add_affine 3365.type ecp_nistz256_point_add_affine,\@function,3 3366.align 32 3367ecp_nistz256_point_add_affine: 3368.cfi_startproc 3369___ 3370$code.=<<___ if ($addx); 3371 leaq OPENSSL_ia32cap_P(%rip), %rcx 3372 mov 8(%rcx), %rcx 3373 and \$0x80100, %ecx 3374 cmp \$0x80100, %ecx 3375 je .Lpoint_add_affinex 3376___ 3377 } else { 3378 $src0 = "%rdx"; 3379 $sfx = "x"; 3380 $bias = 128; 3381 3382$code.=<<___; 3383.type ecp_nistz256_point_add_affinex,\@function,3 3384.align 32 3385ecp_nistz256_point_add_affinex: 3386.cfi_startproc 3387.Lpoint_add_affinex: 3388___ 3389 } 3390$code.=<<___; 3391 push %rbp 3392.cfi_push %rbp 3393 push %rbx 3394.cfi_push %rbx 3395 push %r12 3396.cfi_push %r12 3397 push %r13 3398.cfi_push %r13 3399 push %r14 3400.cfi_push %r14 3401 push %r15 3402.cfi_push %r15 3403 sub \$32*15+8, %rsp 3404.cfi_adjust_cfa_offset 32*15+8 3405.Ladd_affine${x}_body: 3406 3407 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 3408 mov $b_org, $b_ptr # reassign 3409 movdqu 0x10($a_ptr), %xmm1 3410 movdqu 0x20($a_ptr), %xmm2 3411 movdqu 0x30($a_ptr), %xmm3 3412 movdqu 0x40($a_ptr), %xmm4 3413 movdqu 0x50($a_ptr), %xmm5 3414 mov 0x40+8*0($a_ptr), $src0 # load original in1_z 3415 mov 0x40+8*1($a_ptr), $acc6 3416 mov 0x40+8*2($a_ptr), $acc7 3417 mov 0x40+8*3($a_ptr), $acc0 3418 movdqa %xmm0, $in1_x(%rsp) 3419 movdqa %xmm1, $in1_x+0x10(%rsp) 3420 movdqa %xmm2, $in1_y(%rsp) 3421 movdqa %xmm3, $in1_y+0x10(%rsp) 3422 movdqa %xmm4, $in1_z(%rsp) 3423 movdqa %xmm5, $in1_z+0x10(%rsp) 3424 por %xmm4, %xmm5 3425 3426 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr 3427 pshufd \$0xb1, %xmm5, %xmm3 3428 movdqu 0x10($b_ptr), %xmm1 3429 movdqu 0x20($b_ptr), %xmm2 3430 por %xmm3, %xmm5 3431 movdqu 0x30($b_ptr), %xmm3 3432 movdqa %xmm0, $in2_x(%rsp) 3433 pshufd \$0x1e, %xmm5, %xmm4 3434 movdqa %xmm1, $in2_x+0x10(%rsp) 3435 por %xmm0, %xmm1 3436 movq $r_ptr, %xmm0 # save $r_ptr 3437 movdqa %xmm2, $in2_y(%rsp) 3438 movdqa %xmm3, $in2_y+0x10(%rsp) 3439 por %xmm2, %xmm3 3440 por %xmm4, %xmm5 3441 pxor %xmm4, %xmm4 3442 por %xmm1, %xmm3 3443 3444 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 3445 lea $Z1sqr(%rsp), $r_ptr # Z1^2 3446 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 3447 3448 pcmpeqd %xmm4, %xmm5 3449 pshufd \$0xb1, %xmm3, %xmm4 3450 mov 0x00($b_ptr), $src0 # $b_ptr is still valid 3451 #lea 0x00($b_ptr), $b_ptr 3452 mov $acc4, $acc1 # harmonize sqr output and mul input 3453 por %xmm3, %xmm4 3454 pshufd \$0, %xmm5, %xmm5 # in1infty 3455 pshufd \$0x1e, %xmm4, %xmm3 3456 mov $acc5, $acc2 3457 por %xmm3, %xmm4 3458 pxor %xmm3, %xmm3 3459 mov $acc6, $acc3 3460 pcmpeqd %xmm3, %xmm4 3461 pshufd \$0, %xmm4, %xmm4 # in2infty 3462 3463 lea $Z1sqr-$bias(%rsp), $a_ptr 3464 mov $acc7, $acc4 3465 lea 
$U2(%rsp), $r_ptr # U2 = X2*Z1^2 3466 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); 3467 3468 lea $in1_x(%rsp), $b_ptr 3469 lea $H(%rsp), $r_ptr # H = U2 - U1 3470 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); 3471 3472 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 3473 lea $S2(%rsp), $r_ptr # S2 = Z1^3 3474 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 3475 3476 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 3477 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3478 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 3479 3480 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 3481 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 3482 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 3483 3484 lea $in1_y(%rsp), $b_ptr 3485 lea $R(%rsp), $r_ptr # R = S2 - S1 3486 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); 3487 3488 `&load_for_sqr("$H(%rsp)", "$src0")` 3489 lea $Hsqr(%rsp), $r_ptr # H^2 3490 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 3491 3492 `&load_for_sqr("$R(%rsp)", "$src0")` 3493 lea $Rsqr(%rsp), $r_ptr # R^2 3494 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 3495 3496 `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` 3497 lea $Hcub(%rsp), $r_ptr # H^3 3498 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 3499 3500 `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` 3501 lea $U2(%rsp), $r_ptr # U1*H^2 3502 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); 3503___ 3504{ 3505####################################################################### 3506# operate in 4-5-0-1 "name space" that matches multiplication output 3507# 3508my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3509my ($poly1, $poly3)=($acc6,$acc7); 3510 3511$code.=<<___; 3512 #lea $U2(%rsp), $a_ptr 3513 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 3514 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 3515 3516 xor $t4, $t4 3517 add $acc0, $acc0 # a0:a3+a0:a3 3518 lea $Rsqr(%rsp), $a_ptr 3519 adc $acc1, $acc1 3520 mov $acc0, $t0 3521 adc $acc2, $acc2 3522 adc $acc3, $acc3 3523 mov $acc1, $t1 3524 adc \$0, $t4 3525 3526 sub \$-1, $acc0 3527 mov $acc2, $t2 3528 sbb $poly1, $acc1 3529 sbb \$0, $acc2 3530 mov $acc3, $t3 3531 sbb $poly3, $acc3 3532 sbb \$0, $t4 3533 3534 cmovc $t0, $acc0 3535 mov 8*0($a_ptr), $t0 3536 cmovc $t1, $acc1 3537 mov 8*1($a_ptr), $t1 3538 cmovc $t2, $acc2 3539 mov 8*2($a_ptr), $t2 3540 cmovc $t3, $acc3 3541 mov 8*3($a_ptr), $t3 3542 3543 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 3544 3545 lea $Hcub(%rsp), $b_ptr 3546 lea $res_x(%rsp), $r_ptr 3547 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 3548 3549 mov $U2+8*0(%rsp), $t0 3550 mov $U2+8*1(%rsp), $t1 3551 mov $U2+8*2(%rsp), $t2 3552 mov $U2+8*3(%rsp), $t3 3553 lea $H(%rsp), $r_ptr 3554 3555 call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); 3556 3557 mov $acc0, 8*0($r_ptr) # save the result, as 3558 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 3559 mov $acc2, 8*2($r_ptr) 3560 mov $acc3, 8*3($r_ptr) 3561___ 3562} 3563$code.=<<___; 3564 `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` 3565 lea $S2(%rsp), $r_ptr 3566 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); 3567 3568 `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` 3569 lea $H(%rsp), $r_ptr 3570 call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); 3571 3572 lea $S2(%rsp), $b_ptr 3573 lea $res_y(%rsp), $r_ptr 3574 call __ecp_nistz256_sub_from$x # 
p256_sub(res_y, H, S2); 3575 3576 movq %xmm0, $r_ptr # restore $r_ptr 3577 3578 movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); 3579 movdqa %xmm5, %xmm1 3580 pandn $res_z(%rsp), %xmm0 3581 movdqa %xmm5, %xmm2 3582 pandn $res_z+0x10(%rsp), %xmm1 3583 movdqa %xmm5, %xmm3 3584 pand .LONE_mont(%rip), %xmm2 3585 pand .LONE_mont+0x10(%rip), %xmm3 3586 por %xmm0, %xmm2 3587 por %xmm1, %xmm3 3588 3589 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 3590 movdqa %xmm4, %xmm1 3591 pandn %xmm2, %xmm0 3592 movdqa %xmm4, %xmm2 3593 pandn %xmm3, %xmm1 3594 movdqa %xmm4, %xmm3 3595 pand $in1_z(%rsp), %xmm2 3596 pand $in1_z+0x10(%rsp), %xmm3 3597 por %xmm0, %xmm2 3598 por %xmm1, %xmm3 3599 movdqu %xmm2, 0x40($r_ptr) 3600 movdqu %xmm3, 0x50($r_ptr) 3601 3602 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 3603 movdqa %xmm5, %xmm1 3604 pandn $res_x(%rsp), %xmm0 3605 movdqa %xmm5, %xmm2 3606 pandn $res_x+0x10(%rsp), %xmm1 3607 movdqa %xmm5, %xmm3 3608 pand $in2_x(%rsp), %xmm2 3609 pand $in2_x+0x10(%rsp), %xmm3 3610 por %xmm0, %xmm2 3611 por %xmm1, %xmm3 3612 3613 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 3614 movdqa %xmm4, %xmm1 3615 pandn %xmm2, %xmm0 3616 movdqa %xmm4, %xmm2 3617 pandn %xmm3, %xmm1 3618 movdqa %xmm4, %xmm3 3619 pand $in1_x(%rsp), %xmm2 3620 pand $in1_x+0x10(%rsp), %xmm3 3621 por %xmm0, %xmm2 3622 por %xmm1, %xmm3 3623 movdqu %xmm2, 0x00($r_ptr) 3624 movdqu %xmm3, 0x10($r_ptr) 3625 3626 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 3627 movdqa %xmm5, %xmm1 3628 pandn $res_y(%rsp), %xmm0 3629 movdqa %xmm5, %xmm2 3630 pandn $res_y+0x10(%rsp), %xmm1 3631 movdqa %xmm5, %xmm3 3632 pand $in2_y(%rsp), %xmm2 3633 pand $in2_y+0x10(%rsp), %xmm3 3634 por %xmm0, %xmm2 3635 por %xmm1, %xmm3 3636 3637 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); 3638 movdqa %xmm4, %xmm1 3639 pandn %xmm2, %xmm0 3640 movdqa %xmm4, %xmm2 3641 pandn %xmm3, %xmm1 3642 movdqa %xmm4, %xmm3 3643 pand $in1_y(%rsp), %xmm2 3644 pand $in1_y+0x10(%rsp), %xmm3 3645 por %xmm0, %xmm2 3646 por %xmm1, %xmm3 3647 movdqu %xmm2, 0x20($r_ptr) 3648 movdqu %xmm3, 0x30($r_ptr) 3649 3650 lea 32*15+56(%rsp), %rsi 3651.cfi_def_cfa %rsi,8 3652 mov -48(%rsi),%r15 3653.cfi_restore %r15 3654 mov -40(%rsi),%r14 3655.cfi_restore %r14 3656 mov -32(%rsi),%r13 3657.cfi_restore %r13 3658 mov -24(%rsi),%r12 3659.cfi_restore %r12 3660 mov -16(%rsi),%rbx 3661.cfi_restore %rbx 3662 mov -8(%rsi),%rbp 3663.cfi_restore %rbp 3664 lea (%rsi),%rsp 3665.cfi_def_cfa_register %rsp 3666.Ladd_affine${x}_epilogue: 3667 ret 3668.cfi_endproc 3669.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx 3670___ 3671} 3672&gen_add_affine("q"); 3673 3674######################################################################## 3675# AD*X magic 3676# 3677if ($addx) { { 3678######################################################################## 3679# operate in 4-5-0-1 "name space" that matches multiplication output 3680# 3681my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3682 3683$code.=<<___; 3684.type __ecp_nistz256_add_tox,\@abi-omnipotent 3685.align 32 3686__ecp_nistz256_add_tox: 3687.cfi_startproc 3688 xor $t4, $t4 3689 adc 8*0($b_ptr), $a0 3690 adc 8*1($b_ptr), $a1 3691 mov $a0, $t0 3692 adc 8*2($b_ptr), $a2 3693 adc 8*3($b_ptr), $a3 3694 mov $a1, $t1 3695 adc \$0, $t4 3696 3697 xor $t3, $t3 3698 sbb \$-1, $a0 3699 mov $a2, $t2 3700 sbb $poly1, $a1 3701 sbb \$0, $a2 3702 mov $a3, $t3 3703 sbb $poly3, $a3 3704 sbb \$0, $t4 3705 3706 
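	# A borrow from subtracting the modulus means the sum was already
	# fully reduced; the conditional moves below then keep the
	# pre-subtraction value.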
cmovc $t0, $a0 3707 cmovc $t1, $a1 3708 mov $a0, 8*0($r_ptr) 3709 cmovc $t2, $a2 3710 mov $a1, 8*1($r_ptr) 3711 cmovc $t3, $a3 3712 mov $a2, 8*2($r_ptr) 3713 mov $a3, 8*3($r_ptr) 3714 3715 ret 3716.cfi_endproc 3717.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox 3718 3719.type __ecp_nistz256_sub_fromx,\@abi-omnipotent 3720.align 32 3721__ecp_nistz256_sub_fromx: 3722.cfi_startproc 3723 xor $t4, $t4 3724 sbb 8*0($b_ptr), $a0 3725 sbb 8*1($b_ptr), $a1 3726 mov $a0, $t0 3727 sbb 8*2($b_ptr), $a2 3728 sbb 8*3($b_ptr), $a3 3729 mov $a1, $t1 3730 sbb \$0, $t4 3731 3732 xor $t3, $t3 3733 adc \$-1, $a0 3734 mov $a2, $t2 3735 adc $poly1, $a1 3736 adc \$0, $a2 3737 mov $a3, $t3 3738 adc $poly3, $a3 3739 3740 bt \$0, $t4 3741 cmovnc $t0, $a0 3742 cmovnc $t1, $a1 3743 mov $a0, 8*0($r_ptr) 3744 cmovnc $t2, $a2 3745 mov $a1, 8*1($r_ptr) 3746 cmovnc $t3, $a3 3747 mov $a2, 8*2($r_ptr) 3748 mov $a3, 8*3($r_ptr) 3749 3750 ret 3751.cfi_endproc 3752.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx 3753 3754.type __ecp_nistz256_subx,\@abi-omnipotent 3755.align 32 3756__ecp_nistz256_subx: 3757.cfi_startproc 3758 xor $t4, $t4 3759 sbb $a0, $t0 3760 sbb $a1, $t1 3761 mov $t0, $a0 3762 sbb $a2, $t2 3763 sbb $a3, $t3 3764 mov $t1, $a1 3765 sbb \$0, $t4 3766 3767 xor $a3 ,$a3 3768 adc \$-1, $t0 3769 mov $t2, $a2 3770 adc $poly1, $t1 3771 adc \$0, $t2 3772 mov $t3, $a3 3773 adc $poly3, $t3 3774 3775 bt \$0, $t4 3776 cmovc $t0, $a0 3777 cmovc $t1, $a1 3778 cmovc $t2, $a2 3779 cmovc $t3, $a3 3780 3781 ret 3782.cfi_endproc 3783.size __ecp_nistz256_subx,.-__ecp_nistz256_subx 3784 3785.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent 3786.align 32 3787__ecp_nistz256_mul_by_2x: 3788.cfi_startproc 3789 xor $t4, $t4 3790 adc $a0, $a0 # a0:a3+a0:a3 3791 adc $a1, $a1 3792 mov $a0, $t0 3793 adc $a2, $a2 3794 adc $a3, $a3 3795 mov $a1, $t1 3796 adc \$0, $t4 3797 3798 xor $t3, $t3 3799 sbb \$-1, $a0 3800 mov $a2, $t2 3801 sbb $poly1, $a1 3802 sbb \$0, $a2 3803 mov $a3, $t3 3804 sbb $poly3, $a3 3805 sbb \$0, $t4 3806 3807 cmovc $t0, $a0 3808 cmovc $t1, $a1 3809 mov $a0, 8*0($r_ptr) 3810 cmovc $t2, $a2 3811 mov $a1, 8*1($r_ptr) 3812 cmovc $t3, $a3 3813 mov $a2, 8*2($r_ptr) 3814 mov $a3, 8*3($r_ptr) 3815 3816 ret 3817.cfi_endproc 3818.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x 3819___ 3820 } 3821&gen_double("x"); 3822&gen_add("x"); 3823&gen_add_affine("x"); 3824} 3825}}} 3826 3827# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3828# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3829if ($win64) { 3830$rec="%rcx"; 3831$frame="%rdx"; 3832$context="%r8"; 3833$disp="%r9"; 3834 3835$code.=<<___; 3836.extern __imp_RtlVirtualUnwind 3837 3838.type short_handler,\@abi-omnipotent 3839.align 16 3840short_handler: 3841 push %rsi 3842 push %rdi 3843 push %rbx 3844 push %rbp 3845 push %r12 3846 push %r13 3847 push %r14 3848 push %r15 3849 pushfq 3850 sub \$64,%rsp 3851 3852 mov 120($context),%rax # pull context->Rax 3853 mov 248($context),%rbx # pull context->Rip 3854 3855 mov 8($disp),%rsi # disp->ImageBase 3856 mov 56($disp),%r11 # disp->HandlerData 3857 3858 mov 0(%r11),%r10d # HandlerData[0] 3859 lea (%rsi,%r10),%r10 # end of prologue label 3860 cmp %r10,%rbx # context->Rip<end of prologue label 3861 jb .Lcommon_seh_tail 3862 3863 mov 152($context),%rax # pull context->Rsp 3864 3865 mov 4(%r11),%r10d # HandlerData[1] 3866 lea (%rsi,%r10),%r10 # epilogue label 3867 cmp %r10,%rbx # context->Rip>=epilogue label 3868 jae .Lcommon_seh_tail 3869 3870 lea 16(%rax),%rax 3871 3872 mov -8(%rax),%r12 3873 mov 
-16(%rax),%r13 3874 mov %r12,216($context) # restore context->R12 3875 mov %r13,224($context) # restore context->R13 3876 3877 jmp .Lcommon_seh_tail 3878.size short_handler,.-short_handler 3879 3880.type full_handler,\@abi-omnipotent 3881.align 16 3882full_handler: 3883 push %rsi 3884 push %rdi 3885 push %rbx 3886 push %rbp 3887 push %r12 3888 push %r13 3889 push %r14 3890 push %r15 3891 pushfq 3892 sub \$64,%rsp 3893 3894 mov 120($context),%rax # pull context->Rax 3895 mov 248($context),%rbx # pull context->Rip 3896 3897 mov 8($disp),%rsi # disp->ImageBase 3898 mov 56($disp),%r11 # disp->HandlerData 3899 3900 mov 0(%r11),%r10d # HandlerData[0] 3901 lea (%rsi,%r10),%r10 # end of prologue label 3902 cmp %r10,%rbx # context->Rip<end of prologue label 3903 jb .Lcommon_seh_tail 3904 3905 mov 152($context),%rax # pull context->Rsp 3906 3907 mov 4(%r11),%r10d # HandlerData[1] 3908 lea (%rsi,%r10),%r10 # epilogue label 3909 cmp %r10,%rbx # context->Rip>=epilogue label 3910 jae .Lcommon_seh_tail 3911 3912 mov 8(%r11),%r10d # HandlerData[2] 3913 lea (%rax,%r10),%rax 3914 3915 mov -8(%rax),%rbp 3916 mov -16(%rax),%rbx 3917 mov -24(%rax),%r12 3918 mov -32(%rax),%r13 3919 mov -40(%rax),%r14 3920 mov -48(%rax),%r15 3921 mov %rbx,144($context) # restore context->Rbx 3922 mov %rbp,160($context) # restore context->Rbp 3923 mov %r12,216($context) # restore context->R12 3924 mov %r13,224($context) # restore context->R13 3925 mov %r14,232($context) # restore context->R14 3926 mov %r15,240($context) # restore context->R15 3927 3928.Lcommon_seh_tail: 3929 mov 8(%rax),%rdi 3930 mov 16(%rax),%rsi 3931 mov %rax,152($context) # restore context->Rsp 3932 mov %rsi,168($context) # restore context->Rsi 3933 mov %rdi,176($context) # restore context->Rdi 3934 3935 mov 40($disp),%rdi # disp->ContextRecord 3936 mov $context,%rsi # context 3937 mov \$154,%ecx # sizeof(CONTEXT) 3938 .long 0xa548f3fc # cld; rep movsq 3939 3940 mov $disp,%rsi 3941 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3942 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3943 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3944 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3945 mov 40(%rsi),%r10 # disp->ContextRecord 3946 lea 56(%rsi),%r11 # &disp->HandlerData 3947 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3948 mov %r10,32(%rsp) # arg5 3949 mov %r11,40(%rsp) # arg6 3950 mov %r12,48(%rsp) # arg7 3951 mov %rcx,56(%rsp) # arg8, (NULL) 3952 call *__imp_RtlVirtualUnwind(%rip) 3953 3954 mov \$1,%eax # ExceptionContinueSearch 3955 add \$64,%rsp 3956 popfq 3957 pop %r15 3958 pop %r14 3959 pop %r13 3960 pop %r12 3961 pop %rbp 3962 pop %rbx 3963 pop %rdi 3964 pop %rsi 3965 ret 3966.size full_handler,.-full_handler 3967 3968.section .pdata 3969.align 4 3970 .rva .LSEH_begin_ecp_nistz256_neg 3971 .rva .LSEH_end_ecp_nistz256_neg 3972 .rva .LSEH_info_ecp_nistz256_neg 3973 3974 .rva .LSEH_begin_ecp_nistz256_ord_mul_mont 3975 .rva .LSEH_end_ecp_nistz256_ord_mul_mont 3976 .rva .LSEH_info_ecp_nistz256_ord_mul_mont 3977 3978 .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont 3979 .rva .LSEH_end_ecp_nistz256_ord_sqr_mont 3980 .rva .LSEH_info_ecp_nistz256_ord_sqr_mont 3981___ 3982$code.=<<___ if ($addx); 3983 .rva .LSEH_begin_ecp_nistz256_ord_mul_montx 3984 .rva .LSEH_end_ecp_nistz256_ord_mul_montx 3985 .rva .LSEH_info_ecp_nistz256_ord_mul_montx 3986 3987 .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx 3988 .rva .LSEH_end_ecp_nistz256_ord_sqr_montx 3989 .rva .LSEH_info_ecp_nistz256_ord_sqr_montx 3990___ 3991$code.=<<___; 3992 .rva .LSEH_begin_ecp_nistz256_mul_mont 3993 .rva 
.LSEH_end_ecp_nistz256_mul_mont 3994 .rva .LSEH_info_ecp_nistz256_mul_mont 3995 3996 .rva .LSEH_begin_ecp_nistz256_sqr_mont 3997 .rva .LSEH_end_ecp_nistz256_sqr_mont 3998 .rva .LSEH_info_ecp_nistz256_sqr_mont 3999 4000 .rva .LSEH_begin_ecp_nistz256_select_w5 4001 .rva .LSEH_end_ecp_nistz256_select_w5 4002 .rva .LSEH_info_ecp_nistz256_select_wX 4003 4004 .rva .LSEH_begin_ecp_nistz256_select_w7 4005 .rva .LSEH_end_ecp_nistz256_select_w7 4006 .rva .LSEH_info_ecp_nistz256_select_wX 4007___ 4008$code.=<<___ if ($avx>1); 4009 .rva .LSEH_begin_ecp_nistz256_avx2_select_w5 4010 .rva .LSEH_end_ecp_nistz256_avx2_select_w5 4011 .rva .LSEH_info_ecp_nistz256_avx2_select_wX 4012 4013 .rva .LSEH_begin_ecp_nistz256_avx2_select_w7 4014 .rva .LSEH_end_ecp_nistz256_avx2_select_w7 4015 .rva .LSEH_info_ecp_nistz256_avx2_select_wX 4016___ 4017$code.=<<___; 4018 .rva .LSEH_begin_ecp_nistz256_point_double 4019 .rva .LSEH_end_ecp_nistz256_point_double 4020 .rva .LSEH_info_ecp_nistz256_point_double 4021 4022 .rva .LSEH_begin_ecp_nistz256_point_add 4023 .rva .LSEH_end_ecp_nistz256_point_add 4024 .rva .LSEH_info_ecp_nistz256_point_add 4025 4026 .rva .LSEH_begin_ecp_nistz256_point_add_affine 4027 .rva .LSEH_end_ecp_nistz256_point_add_affine 4028 .rva .LSEH_info_ecp_nistz256_point_add_affine 4029___ 4030$code.=<<___ if ($addx); 4031 .rva .LSEH_begin_ecp_nistz256_point_doublex 4032 .rva .LSEH_end_ecp_nistz256_point_doublex 4033 .rva .LSEH_info_ecp_nistz256_point_doublex 4034 4035 .rva .LSEH_begin_ecp_nistz256_point_addx 4036 .rva .LSEH_end_ecp_nistz256_point_addx 4037 .rva .LSEH_info_ecp_nistz256_point_addx 4038 4039 .rva .LSEH_begin_ecp_nistz256_point_add_affinex 4040 .rva .LSEH_end_ecp_nistz256_point_add_affinex 4041 .rva .LSEH_info_ecp_nistz256_point_add_affinex 4042___ 4043$code.=<<___; 4044 4045.section .xdata 4046.align 8 4047.LSEH_info_ecp_nistz256_neg: 4048 .byte 9,0,0,0 4049 .rva short_handler 4050 .rva .Lneg_body,.Lneg_epilogue # HandlerData[] 4051.LSEH_info_ecp_nistz256_ord_mul_mont: 4052 .byte 9,0,0,0 4053 .rva full_handler 4054 .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[] 4055 .long 48,0 4056.LSEH_info_ecp_nistz256_ord_sqr_mont: 4057 .byte 9,0,0,0 4058 .rva full_handler 4059 .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[] 4060 .long 48,0 4061___ 4062$code.=<<___ if ($addx); 4063.LSEH_info_ecp_nistz256_ord_mul_montx: 4064 .byte 9,0,0,0 4065 .rva full_handler 4066 .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[] 4067 .long 48,0 4068.LSEH_info_ecp_nistz256_ord_sqr_montx: 4069 .byte 9,0,0,0 4070 .rva full_handler 4071 .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[] 4072 .long 48,0 4073___ 4074$code.=<<___; 4075.LSEH_info_ecp_nistz256_mul_mont: 4076 .byte 9,0,0,0 4077 .rva full_handler 4078 .rva .Lmul_body,.Lmul_epilogue # HandlerData[] 4079 .long 48,0 4080.LSEH_info_ecp_nistz256_sqr_mont: 4081 .byte 9,0,0,0 4082 .rva full_handler 4083 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] 4084 .long 48,0 4085.LSEH_info_ecp_nistz256_select_wX: 4086 .byte 0x01,0x33,0x16,0x00 4087 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 4088 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 4089 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 4090 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 4091 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 4092 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 4093 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 4094 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 4095 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 4096 .byte 0x08,0x68,0x00,0x00 
#movaps 0x00(rsp),xmm6 4097 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 4098 .align 8 4099___ 4100$code.=<<___ if ($avx>1); 4101.LSEH_info_ecp_nistz256_avx2_select_wX: 4102 .byte 0x01,0x36,0x17,0x0b 4103 .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 4104 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 4105 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 4106 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 4107 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 4108 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 4109 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 4110 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 4111 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 4112 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 4113 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 4114 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 4115 .align 8 4116___ 4117$code.=<<___; 4118.LSEH_info_ecp_nistz256_point_double: 4119 .byte 9,0,0,0 4120 .rva full_handler 4121 .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[] 4122 .long 32*5+56,0 4123.LSEH_info_ecp_nistz256_point_add: 4124 .byte 9,0,0,0 4125 .rva full_handler 4126 .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[] 4127 .long 32*18+56,0 4128.LSEH_info_ecp_nistz256_point_add_affine: 4129 .byte 9,0,0,0 4130 .rva full_handler 4131 .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[] 4132 .long 32*15+56,0 4133___ 4134$code.=<<___ if ($addx); 4135.align 8 4136.LSEH_info_ecp_nistz256_point_doublex: 4137 .byte 9,0,0,0 4138 .rva full_handler 4139 .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[] 4140 .long 32*5+56,0 4141.LSEH_info_ecp_nistz256_point_addx: 4142 .byte 9,0,0,0 4143 .rva full_handler 4144 .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[] 4145 .long 32*18+56,0 4146.LSEH_info_ecp_nistz256_point_add_affinex: 4147 .byte 9,0,0,0 4148 .rva full_handler 4149 .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[] 4150 .long 32*15+56,0 4151___ 4152} 4153 4154$code =~ s/\`([^\`]*)\`/eval $1/gem; 4155print $code; 4156close STDOUT or die "error closing STDOUT"; 4157
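
########################################################################
# Reference note (a paraphrase for reviewers, not emitted into the
# generated assembly): the point_double path produced by gen_double()
# follows the standard Jacobian doubling sequence. Restating the
# per-call comments above, with every field operation performed modulo
# the P-256 prime in Montgomery form:
#
#	S     = 2*in_y
#	Zsqr  = in_z^2
#	S     = S^2			# 4*Y^2
#	res_z = 2*in_z*in_y
#	M     = in_x + Zsqr
#	Zsqr  = in_x - Zsqr
#	res_y = S^2/2			# 8*Y^4, via the div_by_2 block
#	M     = 3*M*Zsqr		# 3*(X+Z^2)*(X-Z^2)
#	S     = S*in_x			# 4*X*Y^2
#	tmp0  = 2*S
#	res_x = M^2 - tmp0
#	S     = (S - res_x)*M
#	res_y = S - res_y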