1#! /usr/bin/env perl 2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. 3# Copyright (c) 2012, Intel Corporation. All Rights Reserved. 4# 5# Licensed under the OpenSSL license (the "License"). You may not use 6# this file except in compliance with the License. You can obtain a copy 7# in the file LICENSE in the source distribution or at 8# https://www.openssl.org/source/license.html 9# 10# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) 11# (1) Intel Corporation, Israel Development Center, Haifa, Israel 12# (2) University of Haifa, Israel 13# 14# References: 15# [1] S. Gueron, "Efficient Software Implementations of Modular 16# Exponentiation", http://eprint.iacr.org/2011/239 17# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". 18# IEEE Proceedings of 9th International Conference on Information 19# Technology: New Generations (ITNG 2012), 821-823 (2012). 20# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation 21# Journal of Cryptographic Engineering 2:31-43 (2012). 22# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis 23# resistant 512-bit and 1024-bit modular exponentiation for optimizing 24# RSA1024 and RSA2048 on x86_64 platforms", 25# http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest 26# 27# While original submission covers 512- and 1024-bit exponentiation, 28# this module is limited to 512-bit version only (and as such 29# accelerates RSA1024 sign). This is because improvement for longer 30# keys is not high enough to justify the effort, highest measured 31# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming 32# for the moment of this writing!] Nor does this module implement 33# "monolithic" complete exponentiation jumbo-subroutine, but adheres 34# to more modular mixture of C and assembly. And it's optimized even 35# for processors other than Intel Core family (see table below for 36# improvement coefficients). 37# <appro@openssl.org> 38# 39# RSA1024 sign/sec this/original |this/rsax(*) this/fips(*) 40# ----------------+--------------------------- 41# Opteron +13% |+5% +20% 42# Bulldozer -0% |-1% +10% 43# P4 +11% |+7% +8% 44# Westmere +5% |+14% +17% 45# Sandy Bridge +2% |+12% +29% 46# Ivy Bridge +1% |+11% +35% 47# Haswell(**) -0% |+12% +39% 48# Atom +13% |+11% +4% 49# VIA Nano +70% |+9% +25% 50# 51# (*) rsax engine and fips numbers are presented for reference 52# purposes; 53# (**) MULX was attempted, but found to give only marginal improvement; 54 55$flavour = shift; 56$output = shift; 57if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 58 59$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 60 61$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 62( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 63( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 64die "can't locate x86_64-xlate.pl"; 65 66open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 67*STDOUT=*OUT; 68 69if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 70 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 71 $addx = ($1>=2.23); 72} 73 74if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 75 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 76 $addx = ($1>=2.10); 77} 78 79if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 80 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 81 $addx = ($1>=12); 82} 83 84if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { 85 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 86 $addx = ($ver>=3.03); 87} 88 89($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API 90{ 91my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d"); 92 93$code.=<<___; 94.text 95 96.extern OPENSSL_ia32cap_P 97 98.globl rsaz_512_sqr 99.type rsaz_512_sqr,\@function,5 100.align 32 101rsaz_512_sqr: # 25-29% faster than rsaz_512_mul 102.cfi_startproc 103 push %rbx 104.cfi_push %rbx 105 push %rbp 106.cfi_push %rbp 107 push %r12 108.cfi_push %r12 109 push %r13 110.cfi_push %r13 111 push %r14 112.cfi_push %r14 113 push %r15 114.cfi_push %r15 115 116 subq \$128+24, %rsp 117.cfi_adjust_cfa_offset 128+24 118.Lsqr_body: 119 movq $mod, %xmm1 # common off-load 120 movq ($inp), %rdx 121 movq 8($inp), %rax 122 movq $n0, 128(%rsp) 123___ 124$code.=<<___ if ($addx); 125 movl \$0x80100,%r11d 126 andl OPENSSL_ia32cap_P+8(%rip),%r11d 127 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 128 je .Loop_sqrx 129___ 130$code.=<<___; 131 jmp .Loop_sqr 132 133.align 32 134.Loop_sqr: 135 movl $times,128+8(%rsp) 136#first iteration 137 movq %rdx, %rbx # 0($inp) 138 mov %rax, %rbp # 8($inp) 139 mulq %rdx 140 movq %rax, %r8 141 movq 16($inp), %rax 142 movq %rdx, %r9 143 144 mulq %rbx 145 addq %rax, %r9 146 movq 24($inp), %rax 147 movq %rdx, %r10 148 adcq \$0, %r10 149 150 mulq %rbx 151 addq %rax, %r10 152 movq 32($inp), %rax 153 movq %rdx, %r11 154 adcq \$0, %r11 155 156 mulq %rbx 157 addq %rax, %r11 158 movq 40($inp), %rax 159 movq %rdx, %r12 160 adcq \$0, %r12 161 162 mulq %rbx 163 addq %rax, %r12 164 movq 48($inp), %rax 165 movq %rdx, %r13 166 adcq \$0, %r13 167 168 mulq %rbx 169 addq %rax, %r13 170 movq 56($inp), %rax 171 movq %rdx, %r14 172 adcq \$0, %r14 173 174 mulq %rbx 175 addq %rax, %r14 176 movq %rbx, %rax 177 adcq \$0, %rdx 178 179 xorq %rcx,%rcx # rcx:r8 = r8 << 1 180 addq %r8, %r8 181 movq %rdx, %r15 182 adcq \$0, %rcx 183 184 mulq %rax 185 addq %r8, %rdx 186 adcq \$0, %rcx 187 188 movq %rax, (%rsp) 189 movq %rdx, 8(%rsp) 190 191#second iteration 192 movq 16($inp), %rax 193 mulq %rbp 194 addq %rax, %r10 195 movq 24($inp), %rax 196 movq %rdx, %rbx 197 adcq \$0, %rbx 198 199 mulq %rbp 200 addq %rax, %r11 201 movq 32($inp), %rax 202 adcq \$0, %rdx 203 addq %rbx, %r11 204 movq %rdx, %rbx 205 adcq \$0, %rbx 206 207 mulq %rbp 208 addq %rax, %r12 209 movq 40($inp), %rax 210 adcq \$0, %rdx 211 addq %rbx, %r12 212 movq %rdx, %rbx 213 adcq \$0, %rbx 214 215 mulq %rbp 216 addq %rax, %r13 217 movq 48($inp), %rax 218 adcq \$0, %rdx 219 addq %rbx, %r13 220 movq %rdx, %rbx 221 adcq \$0, %rbx 222 223 mulq %rbp 224 addq %rax, %r14 225 movq 56($inp), %rax 226 adcq \$0, %rdx 227 addq %rbx, %r14 228 movq %rdx, %rbx 229 adcq \$0, %rbx 230 231 mulq %rbp 232 addq %rax, %r15 233 movq %rbp, %rax 234 adcq \$0, %rdx 235 addq %rbx, %r15 236 adcq \$0, %rdx 237 238 xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1 239 addq %r9, %r9 240 movq %rdx, %r8 241 adcq %r10, %r10 242 adcq \$0, %rbx 243 244 mulq %rax 245 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 246 addq %rcx, %rax 247 movq 16($inp), %rbp 248 addq %rax, %r9 249 movq 24($inp), %rax 250 adcq %rdx, %r10 251 adcq \$0, %rbx 252 253 movq %r9, 16(%rsp) 254 movq %r10, 24(%rsp) 255 256#third iteration 257 mulq %rbp 258 addq %rax, %r12 259 movq 32($inp), %rax 260 movq %rdx, %rcx 261 adcq \$0, %rcx 262 263 mulq %rbp 264 addq %rax, %r13 265 movq 40($inp), %rax 266 adcq \$0, %rdx 267 addq %rcx, %r13 268 movq %rdx, %rcx 269 adcq \$0, %rcx 270 271 mulq %rbp 272 addq %rax, %r14 273 movq 48($inp), %rax 274 adcq \$0, %rdx 275 addq %rcx, %r14 276 movq %rdx, %rcx 277 adcq \$0, %rcx 278 279 mulq %rbp 280 addq %rax, %r15 281 movq 56($inp), %rax 282 adcq \$0, %rdx 283 addq %rcx, %r15 284 movq %rdx, %rcx 285 adcq \$0, %rcx 286 287 mulq %rbp 288 addq %rax, %r8 289 movq %rbp, %rax 290 adcq \$0, %rdx 291 addq %rcx, %r8 292 adcq \$0, %rdx 293 294 xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1 295 addq %r11, %r11 296 movq %rdx, %r9 297 adcq %r12, %r12 298 adcq \$0, %rcx 299 300 mulq %rax 301 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 302 addq %rbx, %rax 303 movq 24($inp), %r10 304 addq %rax, %r11 305 movq 32($inp), %rax 306 adcq %rdx, %r12 307 adcq \$0, %rcx 308 309 movq %r11, 32(%rsp) 310 movq %r12, 40(%rsp) 311 312#fourth iteration 313 mov %rax, %r11 # 32($inp) 314 mulq %r10 315 addq %rax, %r14 316 movq 40($inp), %rax 317 movq %rdx, %rbx 318 adcq \$0, %rbx 319 320 mov %rax, %r12 # 40($inp) 321 mulq %r10 322 addq %rax, %r15 323 movq 48($inp), %rax 324 adcq \$0, %rdx 325 addq %rbx, %r15 326 movq %rdx, %rbx 327 adcq \$0, %rbx 328 329 mov %rax, %rbp # 48($inp) 330 mulq %r10 331 addq %rax, %r8 332 movq 56($inp), %rax 333 adcq \$0, %rdx 334 addq %rbx, %r8 335 movq %rdx, %rbx 336 adcq \$0, %rbx 337 338 mulq %r10 339 addq %rax, %r9 340 movq %r10, %rax 341 adcq \$0, %rdx 342 addq %rbx, %r9 343 adcq \$0, %rdx 344 345 xorq %rbx, %rbx # rbx:r13:r14 = r13:r14 << 1 346 addq %r13, %r13 347 movq %rdx, %r10 348 adcq %r14, %r14 349 adcq \$0, %rbx 350 351 mulq %rax 352 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 353 addq %rcx, %rax 354 addq %rax, %r13 355 movq %r12, %rax # 40($inp) 356 adcq %rdx, %r14 357 adcq \$0, %rbx 358 359 movq %r13, 48(%rsp) 360 movq %r14, 56(%rsp) 361 362#fifth iteration 363 mulq %r11 364 addq %rax, %r8 365 movq %rbp, %rax # 48($inp) 366 movq %rdx, %rcx 367 adcq \$0, %rcx 368 369 mulq %r11 370 addq %rax, %r9 371 movq 56($inp), %rax 372 adcq \$0, %rdx 373 addq %rcx, %r9 374 movq %rdx, %rcx 375 adcq \$0, %rcx 376 377 mov %rax, %r14 # 56($inp) 378 mulq %r11 379 addq %rax, %r10 380 movq %r11, %rax 381 adcq \$0, %rdx 382 addq %rcx, %r10 383 adcq \$0, %rdx 384 385 xorq %rcx, %rcx # rcx:r8:r15 = r8:r15 << 1 386 addq %r15, %r15 387 movq %rdx, %r11 388 adcq %r8, %r8 389 adcq \$0, %rcx 390 391 mulq %rax 392 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 393 addq %rbx, %rax 394 addq %rax, %r15 395 movq %rbp, %rax # 48($inp) 396 adcq %rdx, %r8 397 adcq \$0, %rcx 398 399 movq %r15, 64(%rsp) 400 movq %r8, 72(%rsp) 401 402#sixth iteration 403 mulq %r12 404 addq %rax, %r10 405 movq %r14, %rax # 56($inp) 406 movq %rdx, %rbx 407 adcq \$0, %rbx 408 409 mulq %r12 410 addq %rax, %r11 411 movq %r12, %rax 412 adcq \$0, %rdx 413 addq %rbx, %r11 414 adcq \$0, %rdx 415 416 xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1 417 addq %r9, %r9 418 movq %rdx, %r12 419 adcq %r10, %r10 420 adcq \$0, %rbx 421 422 mulq %rax 423 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 424 addq %rcx, %rax 425 addq %rax, %r9 426 movq %r14, %rax # 56($inp) 427 adcq %rdx, %r10 428 adcq \$0, %rbx 429 430 movq %r9, 80(%rsp) 431 movq %r10, 88(%rsp) 432 433#seventh iteration 434 mulq %rbp 435 addq %rax, %r12 436 movq %rbp, %rax 437 adcq \$0, %rdx 438 439 xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1 440 addq %r11, %r11 441 movq %rdx, %r13 442 adcq %r12, %r12 443 adcq \$0, %rcx 444 445 mulq %rax 446 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 447 addq %rbx, %rax 448 addq %rax, %r11 449 movq %r14, %rax # 56($inp) 450 adcq %rdx, %r12 451 adcq \$0, %rcx 452 453 movq %r11, 96(%rsp) 454 movq %r12, 104(%rsp) 455 456#eighth iteration 457 xorq %rbx, %rbx # rbx:r13 = r13 << 1 458 addq %r13, %r13 459 adcq \$0, %rbx 460 461 mulq %rax 462 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 463 addq %rcx, %rax 464 addq %r13, %rax 465 adcq %rbx, %rdx 466 467 movq (%rsp), %r8 468 movq 8(%rsp), %r9 469 movq 16(%rsp), %r10 470 movq 24(%rsp), %r11 471 movq 32(%rsp), %r12 472 movq 40(%rsp), %r13 473 movq 48(%rsp), %r14 474 movq 56(%rsp), %r15 475 movq %xmm1, %rbp 476 477 movq %rax, 112(%rsp) 478 movq %rdx, 120(%rsp) 479 480 call __rsaz_512_reduce 481 482 addq 64(%rsp), %r8 483 adcq 72(%rsp), %r9 484 adcq 80(%rsp), %r10 485 adcq 88(%rsp), %r11 486 adcq 96(%rsp), %r12 487 adcq 104(%rsp), %r13 488 adcq 112(%rsp), %r14 489 adcq 120(%rsp), %r15 490 sbbq %rcx, %rcx 491 492 call __rsaz_512_subtract 493 494 movq %r8, %rdx 495 movq %r9, %rax 496 movl 128+8(%rsp), $times 497 movq $out, $inp 498 499 decl $times 500 jnz .Loop_sqr 501___ 502if ($addx) { 503$code.=<<___; 504 jmp .Lsqr_tail 505 506.align 32 507.Loop_sqrx: 508 movl $times,128+8(%rsp) 509 movq $out, %xmm0 # off-load 510#first iteration 511 mulx %rax, %r8, %r9 512 mov %rax, %rbx 513 514 mulx 16($inp), %rcx, %r10 515 xor %rbp, %rbp # cf=0, of=0 516 517 mulx 24($inp), %rax, %r11 518 adcx %rcx, %r9 519 520 .byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($inp), %rcx, %r12 521 adcx %rax, %r10 522 523 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 # mulx 40($inp), %rax, %r13 524 adcx %rcx, %r11 525 526 mulx 48($inp), %rcx, %r14 527 adcx %rax, %r12 528 adcx %rcx, %r13 529 530 mulx 56($inp), %rax, %r15 531 adcx %rax, %r14 532 adcx %rbp, %r15 # %rbp is 0 533 534 mulx %rdx, %rax, $out 535 mov %rbx, %rdx # 8($inp) 536 xor %rcx, %rcx 537 adox %r8, %r8 538 adcx $out, %r8 539 adox %rbp, %rcx 540 adcx %rbp, %rcx 541 542 mov %rax, (%rsp) 543 mov %r8, 8(%rsp) 544 545#second iteration 546 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 # mulx 16($inp), %rax, %rbx 547 adox %rax, %r10 548 adcx %rbx, %r11 549 550 mulx 24($inp), $out, %r8 551 adox $out, %r11 552 .byte 0x66 553 adcx %r8, %r12 554 555 mulx 32($inp), %rax, %rbx 556 adox %rax, %r12 557 adcx %rbx, %r13 558 559 mulx 40($inp), $out, %r8 560 adox $out, %r13 561 adcx %r8, %r14 562 563 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx 564 adox %rax, %r14 565 adcx %rbx, %r15 566 567 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8 568 adox $out, %r15 569 adcx %rbp, %r8 570 mulx %rdx, %rax, $out 571 adox %rbp, %r8 572 .byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 # mov 16($inp), %rdx 573 574 xor %rbx, %rbx 575 adox %r9, %r9 576 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 577 adcx %rcx, %rax 578 adox %r10, %r10 579 adcx %rax, %r9 580 adox %rbp, %rbx 581 adcx $out, %r10 582 adcx %rbp, %rbx 583 584 mov %r9, 16(%rsp) 585 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp) 586 587#third iteration 588 mulx 24($inp), $out, %r9 589 adox $out, %r12 590 adcx %r9, %r13 591 592 mulx 32($inp), %rax, %rcx 593 adox %rax, %r13 594 adcx %rcx, %r14 595 596 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r9 597 adox $out, %r14 598 adcx %r9, %r15 599 600 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx 601 adox %rax, %r15 602 adcx %rcx, %r8 603 604 mulx 56($inp), $out, %r9 605 adox $out, %r8 606 adcx %rbp, %r9 607 mulx %rdx, %rax, $out 608 adox %rbp, %r9 609 mov 24($inp), %rdx 610 611 xor %rcx, %rcx 612 adox %r11, %r11 613 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 614 adcx %rbx, %rax 615 adox %r12, %r12 616 adcx %rax, %r11 617 adox %rbp, %rcx 618 adcx $out, %r12 619 adcx %rbp, %rcx 620 621 mov %r11, 32(%rsp) 622 mov %r12, 40(%rsp) 623 624#fourth iteration 625 mulx 32($inp), %rax, %rbx 626 adox %rax, %r14 627 adcx %rbx, %r15 628 629 mulx 40($inp), $out, %r10 630 adox $out, %r15 631 adcx %r10, %r8 632 633 mulx 48($inp), %rax, %rbx 634 adox %rax, %r8 635 adcx %rbx, %r9 636 637 mulx 56($inp), $out, %r10 638 adox $out, %r9 639 adcx %rbp, %r10 640 mulx %rdx, %rax, $out 641 adox %rbp, %r10 642 mov 32($inp), %rdx 643 644 xor %rbx, %rbx 645 adox %r13, %r13 646 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 647 adcx %rcx, %rax 648 adox %r14, %r14 649 adcx %rax, %r13 650 adox %rbp, %rbx 651 adcx $out, %r14 652 adcx %rbp, %rbx 653 654 mov %r13, 48(%rsp) 655 mov %r14, 56(%rsp) 656 657#fifth iteration 658 mulx 40($inp), $out, %r11 659 adox $out, %r8 660 adcx %r11, %r9 661 662 mulx 48($inp), %rax, %rcx 663 adox %rax, %r9 664 adcx %rcx, %r10 665 666 mulx 56($inp), $out, %r11 667 adox $out, %r10 668 adcx %rbp, %r11 669 mulx %rdx, %rax, $out 670 mov 40($inp), %rdx 671 adox %rbp, %r11 672 673 xor %rcx, %rcx 674 adox %r15, %r15 675 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 676 adcx %rbx, %rax 677 adox %r8, %r8 678 adcx %rax, %r15 679 adox %rbp, %rcx 680 adcx $out, %r8 681 adcx %rbp, %rcx 682 683 mov %r15, 64(%rsp) 684 mov %r8, 72(%rsp) 685 686#sixth iteration 687 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx 688 adox %rax, %r10 689 adcx %rbx, %r11 690 691 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12 692 adox $out, %r11 693 adcx %rbp, %r12 694 mulx %rdx, %rax, $out 695 adox %rbp, %r12 696 mov 48($inp), %rdx 697 698 xor %rbx, %rbx 699 adox %r9, %r9 700 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 701 adcx %rcx, %rax 702 adox %r10, %r10 703 adcx %rax, %r9 704 adcx $out, %r10 705 adox %rbp, %rbx 706 adcx %rbp, %rbx 707 708 mov %r9, 80(%rsp) 709 mov %r10, 88(%rsp) 710 711#seventh iteration 712 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13 713 adox %rax, %r12 714 adox %rbp, %r13 715 716 mulx %rdx, %rax, $out 717 xor %rcx, %rcx 718 mov 56($inp), %rdx 719 adox %r11, %r11 720 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 721 adcx %rbx, %rax 722 adox %r12, %r12 723 adcx %rax, %r11 724 adox %rbp, %rcx 725 adcx $out, %r12 726 adcx %rbp, %rcx 727 728 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp) 729 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp) 730 731#eighth iteration 732 mulx %rdx, %rax, %rdx 733 xor %rbx, %rbx 734 adox %r13, %r13 735 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here 736 adcx %rcx, %rax 737 adox %rbp, %rbx 738 adcx %r13, %rax 739 adcx %rdx, %rbx 740 741 movq %xmm0, $out 742 movq %xmm1, %rbp 743 744 movq 128(%rsp), %rdx # pull $n0 745 movq (%rsp), %r8 746 movq 8(%rsp), %r9 747 movq 16(%rsp), %r10 748 movq 24(%rsp), %r11 749 movq 32(%rsp), %r12 750 movq 40(%rsp), %r13 751 movq 48(%rsp), %r14 752 movq 56(%rsp), %r15 753 754 movq %rax, 112(%rsp) 755 movq %rbx, 120(%rsp) 756 757 call __rsaz_512_reducex 758 759 addq 64(%rsp), %r8 760 adcq 72(%rsp), %r9 761 adcq 80(%rsp), %r10 762 adcq 88(%rsp), %r11 763 adcq 96(%rsp), %r12 764 adcq 104(%rsp), %r13 765 adcq 112(%rsp), %r14 766 adcq 120(%rsp), %r15 767 sbbq %rcx, %rcx 768 769 call __rsaz_512_subtract 770 771 movq %r8, %rdx 772 movq %r9, %rax 773 movl 128+8(%rsp), $times 774 movq $out, $inp 775 776 decl $times 777 jnz .Loop_sqrx 778 779.Lsqr_tail: 780___ 781} 782$code.=<<___; 783 784 leaq 128+24+48(%rsp), %rax 785.cfi_def_cfa %rax,8 786 movq -48(%rax), %r15 787.cfi_restore %r15 788 movq -40(%rax), %r14 789.cfi_restore %r14 790 movq -32(%rax), %r13 791.cfi_restore %r13 792 movq -24(%rax), %r12 793.cfi_restore %r12 794 movq -16(%rax), %rbp 795.cfi_restore %rbp 796 movq -8(%rax), %rbx 797.cfi_restore %rbx 798 leaq (%rax), %rsp 799.cfi_def_cfa_register %rsp 800.Lsqr_epilogue: 801 ret 802.cfi_endproc 803.size rsaz_512_sqr,.-rsaz_512_sqr 804___ 805} 806{ 807my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); 808$code.=<<___; 809.globl rsaz_512_mul 810.type rsaz_512_mul,\@function,5 811.align 32 812rsaz_512_mul: 813.cfi_startproc 814 push %rbx 815.cfi_push %rbx 816 push %rbp 817.cfi_push %rbp 818 push %r12 819.cfi_push %r12 820 push %r13 821.cfi_push %r13 822 push %r14 823.cfi_push %r14 824 push %r15 825.cfi_push %r15 826 827 subq \$128+24, %rsp 828.cfi_adjust_cfa_offset 128+24 829.Lmul_body: 830 movq $out, %xmm0 # off-load arguments 831 movq $mod, %xmm1 832 movq $n0, 128(%rsp) 833___ 834$code.=<<___ if ($addx); 835 movl \$0x80100,%r11d 836 andl OPENSSL_ia32cap_P+8(%rip),%r11d 837 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 838 je .Lmulx 839___ 840$code.=<<___; 841 movq ($bp), %rbx # pass b[0] 842 movq $bp, %rbp # pass argument 843 call __rsaz_512_mul 844 845 movq %xmm0, $out 846 movq %xmm1, %rbp 847 848 movq (%rsp), %r8 849 movq 8(%rsp), %r9 850 movq 16(%rsp), %r10 851 movq 24(%rsp), %r11 852 movq 32(%rsp), %r12 853 movq 40(%rsp), %r13 854 movq 48(%rsp), %r14 855 movq 56(%rsp), %r15 856 857 call __rsaz_512_reduce 858___ 859$code.=<<___ if ($addx); 860 jmp .Lmul_tail 861 862.align 32 863.Lmulx: 864 movq $bp, %rbp # pass argument 865 movq ($bp), %rdx # pass b[0] 866 call __rsaz_512_mulx 867 868 movq %xmm0, $out 869 movq %xmm1, %rbp 870 871 movq 128(%rsp), %rdx # pull $n0 872 movq (%rsp), %r8 873 movq 8(%rsp), %r9 874 movq 16(%rsp), %r10 875 movq 24(%rsp), %r11 876 movq 32(%rsp), %r12 877 movq 40(%rsp), %r13 878 movq 48(%rsp), %r14 879 movq 56(%rsp), %r15 880 881 call __rsaz_512_reducex 882.Lmul_tail: 883___ 884$code.=<<___; 885 addq 64(%rsp), %r8 886 adcq 72(%rsp), %r9 887 adcq 80(%rsp), %r10 888 adcq 88(%rsp), %r11 889 adcq 96(%rsp), %r12 890 adcq 104(%rsp), %r13 891 adcq 112(%rsp), %r14 892 adcq 120(%rsp), %r15 893 sbbq %rcx, %rcx 894 895 call __rsaz_512_subtract 896 897 leaq 128+24+48(%rsp), %rax 898.cfi_def_cfa %rax,8 899 movq -48(%rax), %r15 900.cfi_restore %r15 901 movq -40(%rax), %r14 902.cfi_restore %r14 903 movq -32(%rax), %r13 904.cfi_restore %r13 905 movq -24(%rax), %r12 906.cfi_restore %r12 907 movq -16(%rax), %rbp 908.cfi_restore %rbp 909 movq -8(%rax), %rbx 910.cfi_restore %rbx 911 leaq (%rax), %rsp 912.cfi_def_cfa_register %rsp 913.Lmul_epilogue: 914 ret 915.cfi_endproc 916.size rsaz_512_mul,.-rsaz_512_mul 917___ 918} 919{ 920my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 921$code.=<<___; 922.globl rsaz_512_mul_gather4 923.type rsaz_512_mul_gather4,\@function,6 924.align 32 925rsaz_512_mul_gather4: 926.cfi_startproc 927 push %rbx 928.cfi_push %rbx 929 push %rbp 930.cfi_push %rbp 931 push %r12 932.cfi_push %r12 933 push %r13 934.cfi_push %r13 935 push %r14 936.cfi_push %r14 937 push %r15 938.cfi_push %r15 939 940 subq \$`128+24+($win64?0xb0:0)`, %rsp 941.cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)` 942___ 943$code.=<<___ if ($win64); 944 movaps %xmm6,0xa0(%rsp) 945 movaps %xmm7,0xb0(%rsp) 946 movaps %xmm8,0xc0(%rsp) 947 movaps %xmm9,0xd0(%rsp) 948 movaps %xmm10,0xe0(%rsp) 949 movaps %xmm11,0xf0(%rsp) 950 movaps %xmm12,0x100(%rsp) 951 movaps %xmm13,0x110(%rsp) 952 movaps %xmm14,0x120(%rsp) 953 movaps %xmm15,0x130(%rsp) 954___ 955$code.=<<___; 956.Lmul_gather4_body: 957 movd $pwr,%xmm8 958 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 959 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 960 961 pshufd \$0,%xmm8,%xmm8 # broadcast $power 962 movdqa %xmm1,%xmm7 963 movdqa %xmm1,%xmm2 964___ 965######################################################################## 966# calculate mask by comparing 0..15 to $power 967# 968for($i=0;$i<4;$i++) { 969$code.=<<___; 970 paddd %xmm`$i`,%xmm`$i+1` 971 pcmpeqd %xmm8,%xmm`$i` 972 movdqa %xmm7,%xmm`$i+3` 973___ 974} 975for(;$i<7;$i++) { 976$code.=<<___; 977 paddd %xmm`$i`,%xmm`$i+1` 978 pcmpeqd %xmm8,%xmm`$i` 979___ 980} 981$code.=<<___; 982 pcmpeqd %xmm8,%xmm7 983 984 movdqa 16*0($bp),%xmm8 985 movdqa 16*1($bp),%xmm9 986 movdqa 16*2($bp),%xmm10 987 movdqa 16*3($bp),%xmm11 988 pand %xmm0,%xmm8 989 movdqa 16*4($bp),%xmm12 990 pand %xmm1,%xmm9 991 movdqa 16*5($bp),%xmm13 992 pand %xmm2,%xmm10 993 movdqa 16*6($bp),%xmm14 994 pand %xmm3,%xmm11 995 movdqa 16*7($bp),%xmm15 996 leaq 128($bp), %rbp 997 pand %xmm4,%xmm12 998 pand %xmm5,%xmm13 999 pand %xmm6,%xmm14 1000 pand %xmm7,%xmm15 1001 por %xmm10,%xmm8 1002 por %xmm11,%xmm9 1003 por %xmm12,%xmm8 1004 por %xmm13,%xmm9 1005 por %xmm14,%xmm8 1006 por %xmm15,%xmm9 1007 1008 por %xmm9,%xmm8 1009 pshufd \$0x4e,%xmm8,%xmm9 1010 por %xmm9,%xmm8 1011___ 1012$code.=<<___ if ($addx); 1013 movl \$0x80100,%r11d 1014 andl OPENSSL_ia32cap_P+8(%rip),%r11d 1015 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 1016 je .Lmulx_gather 1017___ 1018$code.=<<___; 1019 movq %xmm8,%rbx 1020 1021 movq $n0, 128(%rsp) # off-load arguments 1022 movq $out, 128+8(%rsp) 1023 movq $mod, 128+16(%rsp) 1024 1025 movq ($ap), %rax 1026 movq 8($ap), %rcx 1027 mulq %rbx # 0 iteration 1028 movq %rax, (%rsp) 1029 movq %rcx, %rax 1030 movq %rdx, %r8 1031 1032 mulq %rbx 1033 addq %rax, %r8 1034 movq 16($ap), %rax 1035 movq %rdx, %r9 1036 adcq \$0, %r9 1037 1038 mulq %rbx 1039 addq %rax, %r9 1040 movq 24($ap), %rax 1041 movq %rdx, %r10 1042 adcq \$0, %r10 1043 1044 mulq %rbx 1045 addq %rax, %r10 1046 movq 32($ap), %rax 1047 movq %rdx, %r11 1048 adcq \$0, %r11 1049 1050 mulq %rbx 1051 addq %rax, %r11 1052 movq 40($ap), %rax 1053 movq %rdx, %r12 1054 adcq \$0, %r12 1055 1056 mulq %rbx 1057 addq %rax, %r12 1058 movq 48($ap), %rax 1059 movq %rdx, %r13 1060 adcq \$0, %r13 1061 1062 mulq %rbx 1063 addq %rax, %r13 1064 movq 56($ap), %rax 1065 movq %rdx, %r14 1066 adcq \$0, %r14 1067 1068 mulq %rbx 1069 addq %rax, %r14 1070 movq ($ap), %rax 1071 movq %rdx, %r15 1072 adcq \$0, %r15 1073 1074 leaq 8(%rsp), %rdi 1075 movl \$7, %ecx 1076 jmp .Loop_mul_gather 1077 1078.align 32 1079.Loop_mul_gather: 1080 movdqa 16*0(%rbp),%xmm8 1081 movdqa 16*1(%rbp),%xmm9 1082 movdqa 16*2(%rbp),%xmm10 1083 movdqa 16*3(%rbp),%xmm11 1084 pand %xmm0,%xmm8 1085 movdqa 16*4(%rbp),%xmm12 1086 pand %xmm1,%xmm9 1087 movdqa 16*5(%rbp),%xmm13 1088 pand %xmm2,%xmm10 1089 movdqa 16*6(%rbp),%xmm14 1090 pand %xmm3,%xmm11 1091 movdqa 16*7(%rbp),%xmm15 1092 leaq 128(%rbp), %rbp 1093 pand %xmm4,%xmm12 1094 pand %xmm5,%xmm13 1095 pand %xmm6,%xmm14 1096 pand %xmm7,%xmm15 1097 por %xmm10,%xmm8 1098 por %xmm11,%xmm9 1099 por %xmm12,%xmm8 1100 por %xmm13,%xmm9 1101 por %xmm14,%xmm8 1102 por %xmm15,%xmm9 1103 1104 por %xmm9,%xmm8 1105 pshufd \$0x4e,%xmm8,%xmm9 1106 por %xmm9,%xmm8 1107 movq %xmm8,%rbx 1108 1109 mulq %rbx 1110 addq %rax, %r8 1111 movq 8($ap), %rax 1112 movq %r8, (%rdi) 1113 movq %rdx, %r8 1114 adcq \$0, %r8 1115 1116 mulq %rbx 1117 addq %rax, %r9 1118 movq 16($ap), %rax 1119 adcq \$0, %rdx 1120 addq %r9, %r8 1121 movq %rdx, %r9 1122 adcq \$0, %r9 1123 1124 mulq %rbx 1125 addq %rax, %r10 1126 movq 24($ap), %rax 1127 adcq \$0, %rdx 1128 addq %r10, %r9 1129 movq %rdx, %r10 1130 adcq \$0, %r10 1131 1132 mulq %rbx 1133 addq %rax, %r11 1134 movq 32($ap), %rax 1135 adcq \$0, %rdx 1136 addq %r11, %r10 1137 movq %rdx, %r11 1138 adcq \$0, %r11 1139 1140 mulq %rbx 1141 addq %rax, %r12 1142 movq 40($ap), %rax 1143 adcq \$0, %rdx 1144 addq %r12, %r11 1145 movq %rdx, %r12 1146 adcq \$0, %r12 1147 1148 mulq %rbx 1149 addq %rax, %r13 1150 movq 48($ap), %rax 1151 adcq \$0, %rdx 1152 addq %r13, %r12 1153 movq %rdx, %r13 1154 adcq \$0, %r13 1155 1156 mulq %rbx 1157 addq %rax, %r14 1158 movq 56($ap), %rax 1159 adcq \$0, %rdx 1160 addq %r14, %r13 1161 movq %rdx, %r14 1162 adcq \$0, %r14 1163 1164 mulq %rbx 1165 addq %rax, %r15 1166 movq ($ap), %rax 1167 adcq \$0, %rdx 1168 addq %r15, %r14 1169 movq %rdx, %r15 1170 adcq \$0, %r15 1171 1172 leaq 8(%rdi), %rdi 1173 1174 decl %ecx 1175 jnz .Loop_mul_gather 1176 1177 movq %r8, (%rdi) 1178 movq %r9, 8(%rdi) 1179 movq %r10, 16(%rdi) 1180 movq %r11, 24(%rdi) 1181 movq %r12, 32(%rdi) 1182 movq %r13, 40(%rdi) 1183 movq %r14, 48(%rdi) 1184 movq %r15, 56(%rdi) 1185 1186 movq 128+8(%rsp), $out 1187 movq 128+16(%rsp), %rbp 1188 1189 movq (%rsp), %r8 1190 movq 8(%rsp), %r9 1191 movq 16(%rsp), %r10 1192 movq 24(%rsp), %r11 1193 movq 32(%rsp), %r12 1194 movq 40(%rsp), %r13 1195 movq 48(%rsp), %r14 1196 movq 56(%rsp), %r15 1197 1198 call __rsaz_512_reduce 1199___ 1200$code.=<<___ if ($addx); 1201 jmp .Lmul_gather_tail 1202 1203.align 32 1204.Lmulx_gather: 1205 movq %xmm8,%rdx 1206 1207 mov $n0, 128(%rsp) # off-load arguments 1208 mov $out, 128+8(%rsp) 1209 mov $mod, 128+16(%rsp) 1210 1211 mulx ($ap), %rbx, %r8 # 0 iteration 1212 mov %rbx, (%rsp) 1213 xor %edi, %edi # cf=0, of=0 1214 1215 mulx 8($ap), %rax, %r9 1216 1217 mulx 16($ap), %rbx, %r10 1218 adcx %rax, %r8 1219 1220 mulx 24($ap), %rax, %r11 1221 adcx %rbx, %r9 1222 1223 mulx 32($ap), %rbx, %r12 1224 adcx %rax, %r10 1225 1226 mulx 40($ap), %rax, %r13 1227 adcx %rbx, %r11 1228 1229 mulx 48($ap), %rbx, %r14 1230 adcx %rax, %r12 1231 1232 mulx 56($ap), %rax, %r15 1233 adcx %rbx, %r13 1234 adcx %rax, %r14 1235 .byte 0x67 1236 mov %r8, %rbx 1237 adcx %rdi, %r15 # %rdi is 0 1238 1239 mov \$-7, %rcx 1240 jmp .Loop_mulx_gather 1241 1242.align 32 1243.Loop_mulx_gather: 1244 movdqa 16*0(%rbp),%xmm8 1245 movdqa 16*1(%rbp),%xmm9 1246 movdqa 16*2(%rbp),%xmm10 1247 movdqa 16*3(%rbp),%xmm11 1248 pand %xmm0,%xmm8 1249 movdqa 16*4(%rbp),%xmm12 1250 pand %xmm1,%xmm9 1251 movdqa 16*5(%rbp),%xmm13 1252 pand %xmm2,%xmm10 1253 movdqa 16*6(%rbp),%xmm14 1254 pand %xmm3,%xmm11 1255 movdqa 16*7(%rbp),%xmm15 1256 leaq 128(%rbp), %rbp 1257 pand %xmm4,%xmm12 1258 pand %xmm5,%xmm13 1259 pand %xmm6,%xmm14 1260 pand %xmm7,%xmm15 1261 por %xmm10,%xmm8 1262 por %xmm11,%xmm9 1263 por %xmm12,%xmm8 1264 por %xmm13,%xmm9 1265 por %xmm14,%xmm8 1266 por %xmm15,%xmm9 1267 1268 por %xmm9,%xmm8 1269 pshufd \$0x4e,%xmm8,%xmm9 1270 por %xmm9,%xmm8 1271 movq %xmm8,%rdx 1272 1273 .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8 1274 adcx %rax, %rbx 1275 adox %r9, %r8 1276 1277 mulx 8($ap), %rax, %r9 1278 adcx %rax, %r8 1279 adox %r10, %r9 1280 1281 mulx 16($ap), %rax, %r10 1282 adcx %rax, %r9 1283 adox %r11, %r10 1284 1285 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 1286 adcx %rax, %r10 1287 adox %r12, %r11 1288 1289 mulx 32($ap), %rax, %r12 1290 adcx %rax, %r11 1291 adox %r13, %r12 1292 1293 mulx 40($ap), %rax, %r13 1294 adcx %rax, %r12 1295 adox %r14, %r13 1296 1297 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 1298 adcx %rax, %r13 1299 .byte 0x67 1300 adox %r15, %r14 1301 1302 mulx 56($ap), %rax, %r15 1303 mov %rbx, 64(%rsp,%rcx,8) 1304 adcx %rax, %r14 1305 adox %rdi, %r15 1306 mov %r8, %rbx 1307 adcx %rdi, %r15 # cf=0 1308 1309 inc %rcx # of=0 1310 jnz .Loop_mulx_gather 1311 1312 mov %r8, 64(%rsp) 1313 mov %r9, 64+8(%rsp) 1314 mov %r10, 64+16(%rsp) 1315 mov %r11, 64+24(%rsp) 1316 mov %r12, 64+32(%rsp) 1317 mov %r13, 64+40(%rsp) 1318 mov %r14, 64+48(%rsp) 1319 mov %r15, 64+56(%rsp) 1320 1321 mov 128(%rsp), %rdx # pull arguments 1322 mov 128+8(%rsp), $out 1323 mov 128+16(%rsp), %rbp 1324 1325 mov (%rsp), %r8 1326 mov 8(%rsp), %r9 1327 mov 16(%rsp), %r10 1328 mov 24(%rsp), %r11 1329 mov 32(%rsp), %r12 1330 mov 40(%rsp), %r13 1331 mov 48(%rsp), %r14 1332 mov 56(%rsp), %r15 1333 1334 call __rsaz_512_reducex 1335 1336.Lmul_gather_tail: 1337___ 1338$code.=<<___; 1339 addq 64(%rsp), %r8 1340 adcq 72(%rsp), %r9 1341 adcq 80(%rsp), %r10 1342 adcq 88(%rsp), %r11 1343 adcq 96(%rsp), %r12 1344 adcq 104(%rsp), %r13 1345 adcq 112(%rsp), %r14 1346 adcq 120(%rsp), %r15 1347 sbbq %rcx, %rcx 1348 1349 call __rsaz_512_subtract 1350 1351 leaq 128+24+48(%rsp), %rax 1352___ 1353$code.=<<___ if ($win64); 1354 movaps 0xa0-0xc8(%rax),%xmm6 1355 movaps 0xb0-0xc8(%rax),%xmm7 1356 movaps 0xc0-0xc8(%rax),%xmm8 1357 movaps 0xd0-0xc8(%rax),%xmm9 1358 movaps 0xe0-0xc8(%rax),%xmm10 1359 movaps 0xf0-0xc8(%rax),%xmm11 1360 movaps 0x100-0xc8(%rax),%xmm12 1361 movaps 0x110-0xc8(%rax),%xmm13 1362 movaps 0x120-0xc8(%rax),%xmm14 1363 movaps 0x130-0xc8(%rax),%xmm15 1364 lea 0xb0(%rax),%rax 1365___ 1366$code.=<<___; 1367.cfi_def_cfa %rax,8 1368 movq -48(%rax), %r15 1369.cfi_restore %r15 1370 movq -40(%rax), %r14 1371.cfi_restore %r14 1372 movq -32(%rax), %r13 1373.cfi_restore %r13 1374 movq -24(%rax), %r12 1375.cfi_restore %r12 1376 movq -16(%rax), %rbp 1377.cfi_restore %rbp 1378 movq -8(%rax), %rbx 1379.cfi_restore %rbx 1380 leaq (%rax), %rsp 1381.cfi_def_cfa_register %rsp 1382.Lmul_gather4_epilogue: 1383 ret 1384.cfi_endproc 1385.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 1386___ 1387} 1388{ 1389my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1390$code.=<<___; 1391.globl rsaz_512_mul_scatter4 1392.type rsaz_512_mul_scatter4,\@function,6 1393.align 32 1394rsaz_512_mul_scatter4: 1395.cfi_startproc 1396 push %rbx 1397.cfi_push %rbx 1398 push %rbp 1399.cfi_push %rbp 1400 push %r12 1401.cfi_push %r12 1402 push %r13 1403.cfi_push %r13 1404 push %r14 1405.cfi_push %r14 1406 push %r15 1407.cfi_push %r15 1408 1409 mov $pwr, $pwr 1410 subq \$128+24, %rsp 1411.cfi_adjust_cfa_offset 128+24 1412.Lmul_scatter4_body: 1413 leaq ($tbl,$pwr,8), $tbl 1414 movq $out, %xmm0 # off-load arguments 1415 movq $mod, %xmm1 1416 movq $tbl, %xmm2 1417 movq $n0, 128(%rsp) 1418 1419 movq $out, %rbp 1420___ 1421$code.=<<___ if ($addx); 1422 movl \$0x80100,%r11d 1423 andl OPENSSL_ia32cap_P+8(%rip),%r11d 1424 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 1425 je .Lmulx_scatter 1426___ 1427$code.=<<___; 1428 movq ($out),%rbx # pass b[0] 1429 call __rsaz_512_mul 1430 1431 movq %xmm0, $out 1432 movq %xmm1, %rbp 1433 1434 movq (%rsp), %r8 1435 movq 8(%rsp), %r9 1436 movq 16(%rsp), %r10 1437 movq 24(%rsp), %r11 1438 movq 32(%rsp), %r12 1439 movq 40(%rsp), %r13 1440 movq 48(%rsp), %r14 1441 movq 56(%rsp), %r15 1442 1443 call __rsaz_512_reduce 1444___ 1445$code.=<<___ if ($addx); 1446 jmp .Lmul_scatter_tail 1447 1448.align 32 1449.Lmulx_scatter: 1450 movq ($out), %rdx # pass b[0] 1451 call __rsaz_512_mulx 1452 1453 movq %xmm0, $out 1454 movq %xmm1, %rbp 1455 1456 movq 128(%rsp), %rdx # pull $n0 1457 movq (%rsp), %r8 1458 movq 8(%rsp), %r9 1459 movq 16(%rsp), %r10 1460 movq 24(%rsp), %r11 1461 movq 32(%rsp), %r12 1462 movq 40(%rsp), %r13 1463 movq 48(%rsp), %r14 1464 movq 56(%rsp), %r15 1465 1466 call __rsaz_512_reducex 1467 1468.Lmul_scatter_tail: 1469___ 1470$code.=<<___; 1471 addq 64(%rsp), %r8 1472 adcq 72(%rsp), %r9 1473 adcq 80(%rsp), %r10 1474 adcq 88(%rsp), %r11 1475 adcq 96(%rsp), %r12 1476 adcq 104(%rsp), %r13 1477 adcq 112(%rsp), %r14 1478 adcq 120(%rsp), %r15 1479 movq %xmm2, $inp 1480 sbbq %rcx, %rcx 1481 1482 call __rsaz_512_subtract 1483 1484 movq %r8, 128*0($inp) # scatter 1485 movq %r9, 128*1($inp) 1486 movq %r10, 128*2($inp) 1487 movq %r11, 128*3($inp) 1488 movq %r12, 128*4($inp) 1489 movq %r13, 128*5($inp) 1490 movq %r14, 128*6($inp) 1491 movq %r15, 128*7($inp) 1492 1493 leaq 128+24+48(%rsp), %rax 1494.cfi_def_cfa %rax,8 1495 movq -48(%rax), %r15 1496.cfi_restore %r15 1497 movq -40(%rax), %r14 1498.cfi_restore %r14 1499 movq -32(%rax), %r13 1500.cfi_restore %r13 1501 movq -24(%rax), %r12 1502.cfi_restore %r12 1503 movq -16(%rax), %rbp 1504.cfi_restore %rbp 1505 movq -8(%rax), %rbx 1506.cfi_restore %rbx 1507 leaq (%rax), %rsp 1508.cfi_def_cfa_register %rsp 1509.Lmul_scatter4_epilogue: 1510 ret 1511.cfi_endproc 1512.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 1513___ 1514} 1515{ 1516my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx"); 1517$code.=<<___; 1518.globl rsaz_512_mul_by_one 1519.type rsaz_512_mul_by_one,\@function,4 1520.align 32 1521rsaz_512_mul_by_one: 1522.cfi_startproc 1523 push %rbx 1524.cfi_push %rbx 1525 push %rbp 1526.cfi_push %rbp 1527 push %r12 1528.cfi_push %r12 1529 push %r13 1530.cfi_push %r13 1531 push %r14 1532.cfi_push %r14 1533 push %r15 1534.cfi_push %r15 1535 1536 subq \$128+24, %rsp 1537.cfi_adjust_cfa_offset 128+24 1538.Lmul_by_one_body: 1539___ 1540$code.=<<___ if ($addx); 1541 movl OPENSSL_ia32cap_P+8(%rip),%eax 1542___ 1543$code.=<<___; 1544 movq $mod, %rbp # reassign argument 1545 movq $n0, 128(%rsp) 1546 1547 movq ($inp), %r8 1548 pxor %xmm0, %xmm0 1549 movq 8($inp), %r9 1550 movq 16($inp), %r10 1551 movq 24($inp), %r11 1552 movq 32($inp), %r12 1553 movq 40($inp), %r13 1554 movq 48($inp), %r14 1555 movq 56($inp), %r15 1556 1557 movdqa %xmm0, (%rsp) 1558 movdqa %xmm0, 16(%rsp) 1559 movdqa %xmm0, 32(%rsp) 1560 movdqa %xmm0, 48(%rsp) 1561 movdqa %xmm0, 64(%rsp) 1562 movdqa %xmm0, 80(%rsp) 1563 movdqa %xmm0, 96(%rsp) 1564___ 1565$code.=<<___ if ($addx); 1566 andl \$0x80100,%eax 1567 cmpl \$0x80100,%eax # check for MULX and ADO/CX 1568 je .Lby_one_callx 1569___ 1570$code.=<<___; 1571 call __rsaz_512_reduce 1572___ 1573$code.=<<___ if ($addx); 1574 jmp .Lby_one_tail 1575.align 32 1576.Lby_one_callx: 1577 movq 128(%rsp), %rdx # pull $n0 1578 call __rsaz_512_reducex 1579.Lby_one_tail: 1580___ 1581$code.=<<___; 1582 movq %r8, ($out) 1583 movq %r9, 8($out) 1584 movq %r10, 16($out) 1585 movq %r11, 24($out) 1586 movq %r12, 32($out) 1587 movq %r13, 40($out) 1588 movq %r14, 48($out) 1589 movq %r15, 56($out) 1590 1591 leaq 128+24+48(%rsp), %rax 1592.cfi_def_cfa %rax,8 1593 movq -48(%rax), %r15 1594.cfi_restore %r15 1595 movq -40(%rax), %r14 1596.cfi_restore %r14 1597 movq -32(%rax), %r13 1598.cfi_restore %r13 1599 movq -24(%rax), %r12 1600.cfi_restore %r12 1601 movq -16(%rax), %rbp 1602.cfi_restore %rbp 1603 movq -8(%rax), %rbx 1604.cfi_restore %rbx 1605 leaq (%rax), %rsp 1606.cfi_def_cfa_register %rsp 1607.Lmul_by_one_epilogue: 1608 ret 1609.cfi_endproc 1610.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one 1611___ 1612} 1613{ # __rsaz_512_reduce 1614 # 1615 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 1616 # output: %r8-%r15 1617 # clobbers: everything except %rbp and %rdi 1618$code.=<<___; 1619.type __rsaz_512_reduce,\@abi-omnipotent 1620.align 32 1621__rsaz_512_reduce: 1622.cfi_startproc 1623 movq %r8, %rbx 1624 imulq 128+8(%rsp), %rbx 1625 movq 0(%rbp), %rax 1626 movl \$8, %ecx 1627 jmp .Lreduction_loop 1628 1629.align 32 1630.Lreduction_loop: 1631 mulq %rbx 1632 movq 8(%rbp), %rax 1633 negq %r8 1634 movq %rdx, %r8 1635 adcq \$0, %r8 1636 1637 mulq %rbx 1638 addq %rax, %r9 1639 movq 16(%rbp), %rax 1640 adcq \$0, %rdx 1641 addq %r9, %r8 1642 movq %rdx, %r9 1643 adcq \$0, %r9 1644 1645 mulq %rbx 1646 addq %rax, %r10 1647 movq 24(%rbp), %rax 1648 adcq \$0, %rdx 1649 addq %r10, %r9 1650 movq %rdx, %r10 1651 adcq \$0, %r10 1652 1653 mulq %rbx 1654 addq %rax, %r11 1655 movq 32(%rbp), %rax 1656 adcq \$0, %rdx 1657 addq %r11, %r10 1658 movq 128+8(%rsp), %rsi 1659 #movq %rdx, %r11 1660 #adcq \$0, %r11 1661 adcq \$0, %rdx 1662 movq %rdx, %r11 1663 1664 mulq %rbx 1665 addq %rax, %r12 1666 movq 40(%rbp), %rax 1667 adcq \$0, %rdx 1668 imulq %r8, %rsi 1669 addq %r12, %r11 1670 movq %rdx, %r12 1671 adcq \$0, %r12 1672 1673 mulq %rbx 1674 addq %rax, %r13 1675 movq 48(%rbp), %rax 1676 adcq \$0, %rdx 1677 addq %r13, %r12 1678 movq %rdx, %r13 1679 adcq \$0, %r13 1680 1681 mulq %rbx 1682 addq %rax, %r14 1683 movq 56(%rbp), %rax 1684 adcq \$0, %rdx 1685 addq %r14, %r13 1686 movq %rdx, %r14 1687 adcq \$0, %r14 1688 1689 mulq %rbx 1690 movq %rsi, %rbx 1691 addq %rax, %r15 1692 movq 0(%rbp), %rax 1693 adcq \$0, %rdx 1694 addq %r15, %r14 1695 movq %rdx, %r15 1696 adcq \$0, %r15 1697 1698 decl %ecx 1699 jne .Lreduction_loop 1700 1701 ret 1702.cfi_endproc 1703.size __rsaz_512_reduce,.-__rsaz_512_reduce 1704___ 1705} 1706if ($addx) { 1707 # __rsaz_512_reducex 1708 # 1709 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 1710 # output: %r8-%r15 1711 # clobbers: everything except %rbp and %rdi 1712$code.=<<___; 1713.type __rsaz_512_reducex,\@abi-omnipotent 1714.align 32 1715__rsaz_512_reducex: 1716.cfi_startproc 1717 #movq 128+8(%rsp), %rdx # pull $n0 1718 imulq %r8, %rdx 1719 xorq %rsi, %rsi # cf=0,of=0 1720 movl \$8, %ecx 1721 jmp .Lreduction_loopx 1722 1723.align 32 1724.Lreduction_loopx: 1725 mov %r8, %rbx 1726 mulx 0(%rbp), %rax, %r8 1727 adcx %rbx, %rax 1728 adox %r9, %r8 1729 1730 mulx 8(%rbp), %rax, %r9 1731 adcx %rax, %r8 1732 adox %r10, %r9 1733 1734 mulx 16(%rbp), %rbx, %r10 1735 adcx %rbx, %r9 1736 adox %r11, %r10 1737 1738 mulx 24(%rbp), %rbx, %r11 1739 adcx %rbx, %r10 1740 adox %r12, %r11 1741 1742 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12 1743 mov %rdx, %rax 1744 mov %r8, %rdx 1745 adcx %rbx, %r11 1746 adox %r13, %r12 1747 1748 mulx 128+8(%rsp), %rbx, %rdx 1749 mov %rax, %rdx 1750 1751 mulx 40(%rbp), %rax, %r13 1752 adcx %rax, %r12 1753 adox %r14, %r13 1754 1755 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14 1756 adcx %rax, %r13 1757 adox %r15, %r14 1758 1759 mulx 56(%rbp), %rax, %r15 1760 mov %rbx, %rdx 1761 adcx %rax, %r14 1762 adox %rsi, %r15 # %rsi is 0 1763 adcx %rsi, %r15 # cf=0 1764 1765 decl %ecx # of=0 1766 jne .Lreduction_loopx 1767 1768 ret 1769.cfi_endproc 1770.size __rsaz_512_reducex,.-__rsaz_512_reducex 1771___ 1772} 1773{ # __rsaz_512_subtract 1774 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask 1775 # output: 1776 # clobbers: everything but %rdi, %rsi and %rbp 1777$code.=<<___; 1778.type __rsaz_512_subtract,\@abi-omnipotent 1779.align 32 1780__rsaz_512_subtract: 1781.cfi_startproc 1782 movq %r8, ($out) 1783 movq %r9, 8($out) 1784 movq %r10, 16($out) 1785 movq %r11, 24($out) 1786 movq %r12, 32($out) 1787 movq %r13, 40($out) 1788 movq %r14, 48($out) 1789 movq %r15, 56($out) 1790 1791 movq 0($mod), %r8 1792 movq 8($mod), %r9 1793 negq %r8 1794 notq %r9 1795 andq %rcx, %r8 1796 movq 16($mod), %r10 1797 andq %rcx, %r9 1798 notq %r10 1799 movq 24($mod), %r11 1800 andq %rcx, %r10 1801 notq %r11 1802 movq 32($mod), %r12 1803 andq %rcx, %r11 1804 notq %r12 1805 movq 40($mod), %r13 1806 andq %rcx, %r12 1807 notq %r13 1808 movq 48($mod), %r14 1809 andq %rcx, %r13 1810 notq %r14 1811 movq 56($mod), %r15 1812 andq %rcx, %r14 1813 notq %r15 1814 andq %rcx, %r15 1815 1816 addq ($out), %r8 1817 adcq 8($out), %r9 1818 adcq 16($out), %r10 1819 adcq 24($out), %r11 1820 adcq 32($out), %r12 1821 adcq 40($out), %r13 1822 adcq 48($out), %r14 1823 adcq 56($out), %r15 1824 1825 movq %r8, ($out) 1826 movq %r9, 8($out) 1827 movq %r10, 16($out) 1828 movq %r11, 24($out) 1829 movq %r12, 32($out) 1830 movq %r13, 40($out) 1831 movq %r14, 48($out) 1832 movq %r15, 56($out) 1833 1834 ret 1835.cfi_endproc 1836.size __rsaz_512_subtract,.-__rsaz_512_subtract 1837___ 1838} 1839{ # __rsaz_512_mul 1840 # 1841 # input: %rsi - ap, %rbp - bp 1842 # output: 1843 # clobbers: everything 1844my ($ap,$bp) = ("%rsi","%rbp"); 1845$code.=<<___; 1846.type __rsaz_512_mul,\@abi-omnipotent 1847.align 32 1848__rsaz_512_mul: 1849.cfi_startproc 1850 leaq 8(%rsp), %rdi 1851 1852 movq ($ap), %rax 1853 mulq %rbx 1854 movq %rax, (%rdi) 1855 movq 8($ap), %rax 1856 movq %rdx, %r8 1857 1858 mulq %rbx 1859 addq %rax, %r8 1860 movq 16($ap), %rax 1861 movq %rdx, %r9 1862 adcq \$0, %r9 1863 1864 mulq %rbx 1865 addq %rax, %r9 1866 movq 24($ap), %rax 1867 movq %rdx, %r10 1868 adcq \$0, %r10 1869 1870 mulq %rbx 1871 addq %rax, %r10 1872 movq 32($ap), %rax 1873 movq %rdx, %r11 1874 adcq \$0, %r11 1875 1876 mulq %rbx 1877 addq %rax, %r11 1878 movq 40($ap), %rax 1879 movq %rdx, %r12 1880 adcq \$0, %r12 1881 1882 mulq %rbx 1883 addq %rax, %r12 1884 movq 48($ap), %rax 1885 movq %rdx, %r13 1886 adcq \$0, %r13 1887 1888 mulq %rbx 1889 addq %rax, %r13 1890 movq 56($ap), %rax 1891 movq %rdx, %r14 1892 adcq \$0, %r14 1893 1894 mulq %rbx 1895 addq %rax, %r14 1896 movq ($ap), %rax 1897 movq %rdx, %r15 1898 adcq \$0, %r15 1899 1900 leaq 8($bp), $bp 1901 leaq 8(%rdi), %rdi 1902 1903 movl \$7, %ecx 1904 jmp .Loop_mul 1905 1906.align 32 1907.Loop_mul: 1908 movq ($bp), %rbx 1909 mulq %rbx 1910 addq %rax, %r8 1911 movq 8($ap), %rax 1912 movq %r8, (%rdi) 1913 movq %rdx, %r8 1914 adcq \$0, %r8 1915 1916 mulq %rbx 1917 addq %rax, %r9 1918 movq 16($ap), %rax 1919 adcq \$0, %rdx 1920 addq %r9, %r8 1921 movq %rdx, %r9 1922 adcq \$0, %r9 1923 1924 mulq %rbx 1925 addq %rax, %r10 1926 movq 24($ap), %rax 1927 adcq \$0, %rdx 1928 addq %r10, %r9 1929 movq %rdx, %r10 1930 adcq \$0, %r10 1931 1932 mulq %rbx 1933 addq %rax, %r11 1934 movq 32($ap), %rax 1935 adcq \$0, %rdx 1936 addq %r11, %r10 1937 movq %rdx, %r11 1938 adcq \$0, %r11 1939 1940 mulq %rbx 1941 addq %rax, %r12 1942 movq 40($ap), %rax 1943 adcq \$0, %rdx 1944 addq %r12, %r11 1945 movq %rdx, %r12 1946 adcq \$0, %r12 1947 1948 mulq %rbx 1949 addq %rax, %r13 1950 movq 48($ap), %rax 1951 adcq \$0, %rdx 1952 addq %r13, %r12 1953 movq %rdx, %r13 1954 adcq \$0, %r13 1955 1956 mulq %rbx 1957 addq %rax, %r14 1958 movq 56($ap), %rax 1959 adcq \$0, %rdx 1960 addq %r14, %r13 1961 movq %rdx, %r14 1962 leaq 8($bp), $bp 1963 adcq \$0, %r14 1964 1965 mulq %rbx 1966 addq %rax, %r15 1967 movq ($ap), %rax 1968 adcq \$0, %rdx 1969 addq %r15, %r14 1970 movq %rdx, %r15 1971 adcq \$0, %r15 1972 1973 leaq 8(%rdi), %rdi 1974 1975 decl %ecx 1976 jnz .Loop_mul 1977 1978 movq %r8, (%rdi) 1979 movq %r9, 8(%rdi) 1980 movq %r10, 16(%rdi) 1981 movq %r11, 24(%rdi) 1982 movq %r12, 32(%rdi) 1983 movq %r13, 40(%rdi) 1984 movq %r14, 48(%rdi) 1985 movq %r15, 56(%rdi) 1986 1987 ret 1988.cfi_endproc 1989.size __rsaz_512_mul,.-__rsaz_512_mul 1990___ 1991} 1992if ($addx) { 1993 # __rsaz_512_mulx 1994 # 1995 # input: %rsi - ap, %rbp - bp 1996 # output: 1997 # clobbers: everything 1998my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi"); 1999$code.=<<___; 2000.type __rsaz_512_mulx,\@abi-omnipotent 2001.align 32 2002__rsaz_512_mulx: 2003.cfi_startproc 2004 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller 2005 mov \$-6, %rcx 2006 2007 mulx 8($ap), %rax, %r9 2008 movq %rbx, 8(%rsp) 2009 2010 mulx 16($ap), %rbx, %r10 2011 adc %rax, %r8 2012 2013 mulx 24($ap), %rax, %r11 2014 adc %rbx, %r9 2015 2016 mulx 32($ap), %rbx, %r12 2017 adc %rax, %r10 2018 2019 mulx 40($ap), %rax, %r13 2020 adc %rbx, %r11 2021 2022 mulx 48($ap), %rbx, %r14 2023 adc %rax, %r12 2024 2025 mulx 56($ap), %rax, %r15 2026 mov 8($bp), %rdx 2027 adc %rbx, %r13 2028 adc %rax, %r14 2029 adc \$0, %r15 2030 2031 xor $zero, $zero # cf=0,of=0 2032 jmp .Loop_mulx 2033 2034.align 32 2035.Loop_mulx: 2036 movq %r8, %rbx 2037 mulx ($ap), %rax, %r8 2038 adcx %rax, %rbx 2039 adox %r9, %r8 2040 2041 mulx 8($ap), %rax, %r9 2042 adcx %rax, %r8 2043 adox %r10, %r9 2044 2045 mulx 16($ap), %rax, %r10 2046 adcx %rax, %r9 2047 adox %r11, %r10 2048 2049 mulx 24($ap), %rax, %r11 2050 adcx %rax, %r10 2051 adox %r12, %r11 2052 2053 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12 2054 adcx %rax, %r11 2055 adox %r13, %r12 2056 2057 mulx 40($ap), %rax, %r13 2058 adcx %rax, %r12 2059 adox %r14, %r13 2060 2061 mulx 48($ap), %rax, %r14 2062 adcx %rax, %r13 2063 adox %r15, %r14 2064 2065 mulx 56($ap), %rax, %r15 2066 movq 64($bp,%rcx,8), %rdx 2067 movq %rbx, 8+64-8(%rsp,%rcx,8) 2068 adcx %rax, %r14 2069 adox $zero, %r15 2070 adcx $zero, %r15 # cf=0 2071 2072 inc %rcx # of=0 2073 jnz .Loop_mulx 2074 2075 movq %r8, %rbx 2076 mulx ($ap), %rax, %r8 2077 adcx %rax, %rbx 2078 adox %r9, %r8 2079 2080 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9 2081 adcx %rax, %r8 2082 adox %r10, %r9 2083 2084 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10 2085 adcx %rax, %r9 2086 adox %r11, %r10 2087 2088 mulx 24($ap), %rax, %r11 2089 adcx %rax, %r10 2090 adox %r12, %r11 2091 2092 mulx 32($ap), %rax, %r12 2093 adcx %rax, %r11 2094 adox %r13, %r12 2095 2096 mulx 40($ap), %rax, %r13 2097 adcx %rax, %r12 2098 adox %r14, %r13 2099 2100 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 2101 adcx %rax, %r13 2102 adox %r15, %r14 2103 2104 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15 2105 adcx %rax, %r14 2106 adox $zero, %r15 2107 adcx $zero, %r15 2108 2109 mov %rbx, 8+64-8(%rsp) 2110 mov %r8, 8+64(%rsp) 2111 mov %r9, 8+64+8(%rsp) 2112 mov %r10, 8+64+16(%rsp) 2113 mov %r11, 8+64+24(%rsp) 2114 mov %r12, 8+64+32(%rsp) 2115 mov %r13, 8+64+40(%rsp) 2116 mov %r14, 8+64+48(%rsp) 2117 mov %r15, 8+64+56(%rsp) 2118 2119 ret 2120.cfi_endproc 2121.size __rsaz_512_mulx,.-__rsaz_512_mulx 2122___ 2123} 2124{ 2125my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); 2126$code.=<<___; 2127.globl rsaz_512_scatter4 2128.type rsaz_512_scatter4,\@abi-omnipotent 2129.align 16 2130rsaz_512_scatter4: 2131.cfi_startproc 2132 leaq ($out,$power,8), $out 2133 movl \$8, %r9d 2134 jmp .Loop_scatter 2135.align 16 2136.Loop_scatter: 2137 movq ($inp), %rax 2138 leaq 8($inp), $inp 2139 movq %rax, ($out) 2140 leaq 128($out), $out 2141 decl %r9d 2142 jnz .Loop_scatter 2143 ret 2144.cfi_endproc 2145.size rsaz_512_scatter4,.-rsaz_512_scatter4 2146 2147.globl rsaz_512_gather4 2148.type rsaz_512_gather4,\@abi-omnipotent 2149.align 16 2150rsaz_512_gather4: 2151.cfi_startproc 2152___ 2153$code.=<<___ if ($win64); 2154.LSEH_begin_rsaz_512_gather4: 2155 .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp 2156 .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp) 2157 .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp) 2158 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp) 2159 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp) 2160 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp) 2161 .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp) 2162 .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp) 2163 .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp) 2164 .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp) 2165 .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp) 2166___ 2167$code.=<<___; 2168 movd $power,%xmm8 2169 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 2170 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 2171 2172 pshufd \$0,%xmm8,%xmm8 # broadcast $power 2173 movdqa %xmm1,%xmm7 2174 movdqa %xmm1,%xmm2 2175___ 2176######################################################################## 2177# calculate mask by comparing 0..15 to $power 2178# 2179for($i=0;$i<4;$i++) { 2180$code.=<<___; 2181 paddd %xmm`$i`,%xmm`$i+1` 2182 pcmpeqd %xmm8,%xmm`$i` 2183 movdqa %xmm7,%xmm`$i+3` 2184___ 2185} 2186for(;$i<7;$i++) { 2187$code.=<<___; 2188 paddd %xmm`$i`,%xmm`$i+1` 2189 pcmpeqd %xmm8,%xmm`$i` 2190___ 2191} 2192$code.=<<___; 2193 pcmpeqd %xmm8,%xmm7 2194 movl \$8, %r9d 2195 jmp .Loop_gather 2196.align 16 2197.Loop_gather: 2198 movdqa 16*0($inp),%xmm8 2199 movdqa 16*1($inp),%xmm9 2200 movdqa 16*2($inp),%xmm10 2201 movdqa 16*3($inp),%xmm11 2202 pand %xmm0,%xmm8 2203 movdqa 16*4($inp),%xmm12 2204 pand %xmm1,%xmm9 2205 movdqa 16*5($inp),%xmm13 2206 pand %xmm2,%xmm10 2207 movdqa 16*6($inp),%xmm14 2208 pand %xmm3,%xmm11 2209 movdqa 16*7($inp),%xmm15 2210 leaq 128($inp), $inp 2211 pand %xmm4,%xmm12 2212 pand %xmm5,%xmm13 2213 pand %xmm6,%xmm14 2214 pand %xmm7,%xmm15 2215 por %xmm10,%xmm8 2216 por %xmm11,%xmm9 2217 por %xmm12,%xmm8 2218 por %xmm13,%xmm9 2219 por %xmm14,%xmm8 2220 por %xmm15,%xmm9 2221 2222 por %xmm9,%xmm8 2223 pshufd \$0x4e,%xmm8,%xmm9 2224 por %xmm9,%xmm8 2225 movq %xmm8,($out) 2226 leaq 8($out), $out 2227 decl %r9d 2228 jnz .Loop_gather 2229___ 2230$code.=<<___ if ($win64); 2231 movaps 0x00(%rsp),%xmm6 2232 movaps 0x10(%rsp),%xmm7 2233 movaps 0x20(%rsp),%xmm8 2234 movaps 0x30(%rsp),%xmm9 2235 movaps 0x40(%rsp),%xmm10 2236 movaps 0x50(%rsp),%xmm11 2237 movaps 0x60(%rsp),%xmm12 2238 movaps 0x70(%rsp),%xmm13 2239 movaps 0x80(%rsp),%xmm14 2240 movaps 0x90(%rsp),%xmm15 2241 add \$0xa8,%rsp 2242___ 2243$code.=<<___; 2244 ret 2245.LSEH_end_rsaz_512_gather4: 2246.cfi_endproc 2247.size rsaz_512_gather4,.-rsaz_512_gather4 2248 2249.align 64 2250.Linc: 2251 .long 0,0, 1,1 2252 .long 2,2, 2,2 2253___ 2254} 2255 2256# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 2257# CONTEXT *context,DISPATCHER_CONTEXT *disp) 2258if ($win64) { 2259$rec="%rcx"; 2260$frame="%rdx"; 2261$context="%r8"; 2262$disp="%r9"; 2263 2264$code.=<<___; 2265.extern __imp_RtlVirtualUnwind 2266.type se_handler,\@abi-omnipotent 2267.align 16 2268se_handler: 2269 push %rsi 2270 push %rdi 2271 push %rbx 2272 push %rbp 2273 push %r12 2274 push %r13 2275 push %r14 2276 push %r15 2277 pushfq 2278 sub \$64,%rsp 2279 2280 mov 120($context),%rax # pull context->Rax 2281 mov 248($context),%rbx # pull context->Rip 2282 2283 mov 8($disp),%rsi # disp->ImageBase 2284 mov 56($disp),%r11 # disp->HandlerData 2285 2286 mov 0(%r11),%r10d # HandlerData[0] 2287 lea (%rsi,%r10),%r10 # end of prologue label 2288 cmp %r10,%rbx # context->Rip<end of prologue label 2289 jb .Lcommon_seh_tail 2290 2291 mov 152($context),%rax # pull context->Rsp 2292 2293 mov 4(%r11),%r10d # HandlerData[1] 2294 lea (%rsi,%r10),%r10 # epilogue label 2295 cmp %r10,%rbx # context->Rip>=epilogue label 2296 jae .Lcommon_seh_tail 2297 2298 lea 128+24+48(%rax),%rax 2299 2300 lea .Lmul_gather4_epilogue(%rip),%rbx 2301 cmp %r10,%rbx 2302 jne .Lse_not_in_mul_gather4 2303 2304 lea 0xb0(%rax),%rax 2305 2306 lea -48-0xa8(%rax),%rsi 2307 lea 512($context),%rdi 2308 mov \$20,%ecx 2309 .long 0xa548f3fc # cld; rep movsq 2310 2311.Lse_not_in_mul_gather4: 2312 mov -8(%rax),%rbx 2313 mov -16(%rax),%rbp 2314 mov -24(%rax),%r12 2315 mov -32(%rax),%r13 2316 mov -40(%rax),%r14 2317 mov -48(%rax),%r15 2318 mov %rbx,144($context) # restore context->Rbx 2319 mov %rbp,160($context) # restore context->Rbp 2320 mov %r12,216($context) # restore context->R12 2321 mov %r13,224($context) # restore context->R13 2322 mov %r14,232($context) # restore context->R14 2323 mov %r15,240($context) # restore context->R15 2324 2325.Lcommon_seh_tail: 2326 mov 8(%rax),%rdi 2327 mov 16(%rax),%rsi 2328 mov %rax,152($context) # restore context->Rsp 2329 mov %rsi,168($context) # restore context->Rsi 2330 mov %rdi,176($context) # restore context->Rdi 2331 2332 mov 40($disp),%rdi # disp->ContextRecord 2333 mov $context,%rsi # context 2334 mov \$154,%ecx # sizeof(CONTEXT) 2335 .long 0xa548f3fc # cld; rep movsq 2336 2337 mov $disp,%rsi 2338 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 2339 mov 8(%rsi),%rdx # arg2, disp->ImageBase 2340 mov 0(%rsi),%r8 # arg3, disp->ControlPc 2341 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 2342 mov 40(%rsi),%r10 # disp->ContextRecord 2343 lea 56(%rsi),%r11 # &disp->HandlerData 2344 lea 24(%rsi),%r12 # &disp->EstablisherFrame 2345 mov %r10,32(%rsp) # arg5 2346 mov %r11,40(%rsp) # arg6 2347 mov %r12,48(%rsp) # arg7 2348 mov %rcx,56(%rsp) # arg8, (NULL) 2349 call *__imp_RtlVirtualUnwind(%rip) 2350 2351 mov \$1,%eax # ExceptionContinueSearch 2352 add \$64,%rsp 2353 popfq 2354 pop %r15 2355 pop %r14 2356 pop %r13 2357 pop %r12 2358 pop %rbp 2359 pop %rbx 2360 pop %rdi 2361 pop %rsi 2362 ret 2363.size se_handler,.-se_handler 2364 2365.section .pdata 2366.align 4 2367 .rva .LSEH_begin_rsaz_512_sqr 2368 .rva .LSEH_end_rsaz_512_sqr 2369 .rva .LSEH_info_rsaz_512_sqr 2370 2371 .rva .LSEH_begin_rsaz_512_mul 2372 .rva .LSEH_end_rsaz_512_mul 2373 .rva .LSEH_info_rsaz_512_mul 2374 2375 .rva .LSEH_begin_rsaz_512_mul_gather4 2376 .rva .LSEH_end_rsaz_512_mul_gather4 2377 .rva .LSEH_info_rsaz_512_mul_gather4 2378 2379 .rva .LSEH_begin_rsaz_512_mul_scatter4 2380 .rva .LSEH_end_rsaz_512_mul_scatter4 2381 .rva .LSEH_info_rsaz_512_mul_scatter4 2382 2383 .rva .LSEH_begin_rsaz_512_mul_by_one 2384 .rva .LSEH_end_rsaz_512_mul_by_one 2385 .rva .LSEH_info_rsaz_512_mul_by_one 2386 2387 .rva .LSEH_begin_rsaz_512_gather4 2388 .rva .LSEH_end_rsaz_512_gather4 2389 .rva .LSEH_info_rsaz_512_gather4 2390 2391.section .xdata 2392.align 8 2393.LSEH_info_rsaz_512_sqr: 2394 .byte 9,0,0,0 2395 .rva se_handler 2396 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] 2397.LSEH_info_rsaz_512_mul: 2398 .byte 9,0,0,0 2399 .rva se_handler 2400 .rva .Lmul_body,.Lmul_epilogue # HandlerData[] 2401.LSEH_info_rsaz_512_mul_gather4: 2402 .byte 9,0,0,0 2403 .rva se_handler 2404 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[] 2405.LSEH_info_rsaz_512_mul_scatter4: 2406 .byte 9,0,0,0 2407 .rva se_handler 2408 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[] 2409.LSEH_info_rsaz_512_mul_by_one: 2410 .byte 9,0,0,0 2411 .rva se_handler 2412 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[] 2413.LSEH_info_rsaz_512_gather4: 2414 .byte 0x01,0x46,0x16,0x00 2415 .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 2416 .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 2417 .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 2418 .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 2419 .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 2420 .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 2421 .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 2422 .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 2423 .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 2424 .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 2425 .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8 2426___ 2427} 2428 2429$code =~ s/\`([^\`]*)\`/eval $1/gem; 2430print $code; 2431close STDOUT or die "error closing STDOUT: $!"; 2432