1#! /usr/bin/env perl 2# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16 17# August 2011. 18# 19# Companion to x86_64-mont.pl that optimizes cache-timing attack 20# countermeasures. The subroutines are produced by replacing bp[i] 21# references in their x86_64-mont.pl counterparts with cache-neutral 22# references to powers table computed in BN_mod_exp_mont_consttime. 23# In addition subroutine that scatters elements of the powers table 24# is implemented, so that scatter-/gathering can be tuned without 25# bn_exp.c modifications. 26 27# August 2013. 28# 29# Add MULX/AD*X code paths and additional interfaces to optimize for 30# branch prediction unit. For input lengths that are multiples of 8 31# the np argument is not just modulus value, but one interleaved 32# with 0. This is to optimize post-condition... 33 34$flavour = shift; 35$output = shift; 36if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 37 38$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 39 40$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 41( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 42( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or 43die "can't locate x86_64-xlate.pl"; 44 45open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 46*STDOUT=*OUT; 47 48# In upstream, this is controlled by shelling out to the compiler to check 49# versions, but BoringSSL is intended to be used with pre-generated perlasm 50# output, so this isn't useful anyway. 51$addx = 1; 52 53# int bn_mul_mont_gather5( 54$rp="%rdi"; # BN_ULONG *rp, 55$ap="%rsi"; # const BN_ULONG *ap, 56$bp="%rdx"; # const BN_ULONG *bp, 57$np="%rcx"; # const BN_ULONG *np, 58$n0="%r8"; # const BN_ULONG *n0, 59$num="%r9"; # int num, 60 # int idx); # 0 to 2^5-1, "index" in $bp holding 61 # pre-computed powers of a', interlaced 62 # in such manner that b[0] is $bp[idx], 63 # b[1] is [2^5+idx], etc. 
64$lo0="%r10"; 65$hi0="%r11"; 66$hi1="%r13"; 67$i="%r14"; 68$j="%r15"; 69$m0="%rbx"; 70$m1="%rbp"; 71 72$code=<<___; 73.text 74 75.extern OPENSSL_ia32cap_P 76 77.globl bn_mul_mont_gather5 78.type bn_mul_mont_gather5,\@function,6 79.align 64 80bn_mul_mont_gather5: 81.cfi_startproc 82 mov ${num}d,${num}d 83 mov %rsp,%rax 84.cfi_def_cfa_register %rax 85 test \$7,${num}d 86 jnz .Lmul_enter 87___ 88$code.=<<___ if ($addx); 89 leaq OPENSSL_ia32cap_P(%rip),%r11 90 mov 8(%r11),%r11d 91___ 92$code.=<<___; 93 jmp .Lmul4x_enter 94 95.align 16 96.Lmul_enter: 97 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument 98 push %rbx 99.cfi_push %rbx 100 push %rbp 101.cfi_push %rbp 102 push %r12 103.cfi_push %r12 104 push %r13 105.cfi_push %r13 106 push %r14 107.cfi_push %r14 108 push %r15 109.cfi_push %r15 110 111 neg $num 112 mov %rsp,%r11 113 lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) 114 neg $num # restore $num 115 and \$-1024,%r10 # minimize TLB usage 116 117 # An OS-agnostic version of __chkstk. 118 # 119 # Some OSes (Windows) insist on stack being "wired" to 120 # physical memory in strictly sequential manner, i.e. if stack 121 # allocation spans two pages, then reference to farmost one can 122 # be punishable by SEGV. But page walking can do good even on 123 # other OSes, because it guarantees that villain thread hits 124 # the guard page before it can make damage to innocent one... 125 sub %r10,%r11 126 and \$-4096,%r11 127 lea (%r10,%r11),%rsp 128 mov (%rsp),%r11 129 cmp %r10,%rsp 130 ja .Lmul_page_walk 131 jmp .Lmul_page_walk_done 132 133.Lmul_page_walk: 134 lea -4096(%rsp),%rsp 135 mov (%rsp),%r11 136 cmp %r10,%rsp 137 ja .Lmul_page_walk 138.Lmul_page_walk_done: 139 140 lea .Linc(%rip),%r10 141 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp 142.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 143.Lmul_body: 144 145 lea 128($bp),%r12 # reassign $bp (+size optimization) 146___ 147 $bp="%r12"; 148 $STRIDE=2**5*8; # 5 is "window size" 149 $N=$STRIDE/4; # should match cache line size 150$code.=<<___; 151 movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 152 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 153 lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) 154 and \$-16,%r10 155 156 pshufd \$0,%xmm5,%xmm5 # broadcast index 157 movdqa %xmm1,%xmm4 158 movdqa %xmm1,%xmm2 159___ 160######################################################################## 161# calculate mask by comparing 0..31 to index and save result to stack 162# 163$code.=<<___; 164 paddd %xmm0,%xmm1 165 pcmpeqd %xmm5,%xmm0 # compare to 1,0 166 .byte 0x67 167 movdqa %xmm4,%xmm3 168___ 169for($k=0;$k<$STRIDE/16-4;$k+=4) { 170$code.=<<___; 171 paddd %xmm1,%xmm2 172 pcmpeqd %xmm5,%xmm1 # compare to 3,2 173 movdqa %xmm0,`16*($k+0)+112`(%r10) 174 movdqa %xmm4,%xmm0 175 176 paddd %xmm2,%xmm3 177 pcmpeqd %xmm5,%xmm2 # compare to 5,4 178 movdqa %xmm1,`16*($k+1)+112`(%r10) 179 movdqa %xmm4,%xmm1 180 181 paddd %xmm3,%xmm0 182 pcmpeqd %xmm5,%xmm3 # compare to 7,6 183 movdqa %xmm2,`16*($k+2)+112`(%r10) 184 movdqa %xmm4,%xmm2 185 186 paddd %xmm0,%xmm1 187 pcmpeqd %xmm5,%xmm0 188 movdqa %xmm3,`16*($k+3)+112`(%r10) 189 movdqa %xmm4,%xmm3 190___ 191} 192$code.=<<___; # last iteration can be optimized 193 paddd %xmm1,%xmm2 194 pcmpeqd %xmm5,%xmm1 195 movdqa %xmm0,`16*($k+0)+112`(%r10) 196 197 paddd %xmm2,%xmm3 198 .byte 0x67 199 pcmpeqd %xmm5,%xmm2 200 movdqa %xmm1,`16*($k+1)+112`(%r10) 201 202 pcmpeqd %xmm5,%xmm3 203 movdqa %xmm2,`16*($k+2)+112`(%r10) 204 pand `16*($k+0)-128`($bp),%xmm0 # 
while it's still in register 205 206 pand `16*($k+1)-128`($bp),%xmm1 207 pand `16*($k+2)-128`($bp),%xmm2 208 movdqa %xmm3,`16*($k+3)+112`(%r10) 209 pand `16*($k+3)-128`($bp),%xmm3 210 por %xmm2,%xmm0 211 por %xmm3,%xmm1 212___ 213for($k=0;$k<$STRIDE/16-4;$k+=4) { 214$code.=<<___; 215 movdqa `16*($k+0)-128`($bp),%xmm4 216 movdqa `16*($k+1)-128`($bp),%xmm5 217 movdqa `16*($k+2)-128`($bp),%xmm2 218 pand `16*($k+0)+112`(%r10),%xmm4 219 movdqa `16*($k+3)-128`($bp),%xmm3 220 pand `16*($k+1)+112`(%r10),%xmm5 221 por %xmm4,%xmm0 222 pand `16*($k+2)+112`(%r10),%xmm2 223 por %xmm5,%xmm1 224 pand `16*($k+3)+112`(%r10),%xmm3 225 por %xmm2,%xmm0 226 por %xmm3,%xmm1 227___ 228} 229$code.=<<___; 230 por %xmm1,%xmm0 231 pshufd \$0x4e,%xmm0,%xmm1 232 por %xmm1,%xmm0 233 lea $STRIDE($bp),$bp 234 movq %xmm0,$m0 # m0=bp[0] 235 236 mov ($n0),$n0 # pull n0[0] value 237 mov ($ap),%rax 238 239 xor $i,$i # i=0 240 xor $j,$j # j=0 241 242 mov $n0,$m1 243 mulq $m0 # ap[0]*bp[0] 244 mov %rax,$lo0 245 mov ($np),%rax 246 247 imulq $lo0,$m1 # "tp[0]"*n0 248 mov %rdx,$hi0 249 250 mulq $m1 # np[0]*m1 251 add %rax,$lo0 # discarded 252 mov 8($ap),%rax 253 adc \$0,%rdx 254 mov %rdx,$hi1 255 256 lea 1($j),$j # j++ 257 jmp .L1st_enter 258 259.align 16 260.L1st: 261 add %rax,$hi1 262 mov ($ap,$j,8),%rax 263 adc \$0,%rdx 264 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 265 mov $lo0,$hi0 266 adc \$0,%rdx 267 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 268 mov %rdx,$hi1 269 270.L1st_enter: 271 mulq $m0 # ap[j]*bp[0] 272 add %rax,$hi0 273 mov ($np,$j,8),%rax 274 adc \$0,%rdx 275 lea 1($j),$j # j++ 276 mov %rdx,$lo0 277 278 mulq $m1 # np[j]*m1 279 cmp $num,$j 280 jne .L1st # note that upon exit $j==$num, so 281 # they can be used interchangeably 282 283 add %rax,$hi1 284 adc \$0,%rdx 285 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] 286 adc \$0,%rdx 287 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 288 mov %rdx,$hi1 289 mov $lo0,$hi0 290 291 xor %rdx,%rdx 292 add $hi0,$hi1 293 adc \$0,%rdx 294 mov $hi1,-8(%rsp,$num,8) 295 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 296 297 lea 1($i),$i # i++ 298 jmp .Louter 299.align 16 300.Louter: 301 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) 302 and \$-16,%rdx 303 pxor %xmm4,%xmm4 304 pxor %xmm5,%xmm5 305___ 306for($k=0;$k<$STRIDE/16;$k+=4) { 307$code.=<<___; 308 movdqa `16*($k+0)-128`($bp),%xmm0 309 movdqa `16*($k+1)-128`($bp),%xmm1 310 movdqa `16*($k+2)-128`($bp),%xmm2 311 movdqa `16*($k+3)-128`($bp),%xmm3 312 pand `16*($k+0)-128`(%rdx),%xmm0 313 pand `16*($k+1)-128`(%rdx),%xmm1 314 por %xmm0,%xmm4 315 pand `16*($k+2)-128`(%rdx),%xmm2 316 por %xmm1,%xmm5 317 pand `16*($k+3)-128`(%rdx),%xmm3 318 por %xmm2,%xmm4 319 por %xmm3,%xmm5 320___ 321} 322$code.=<<___; 323 por %xmm5,%xmm4 324 pshufd \$0x4e,%xmm4,%xmm0 325 por %xmm4,%xmm0 326 lea $STRIDE($bp),$bp 327 328 mov ($ap),%rax # ap[0] 329 movq %xmm0,$m0 # m0=bp[i] 330 331 xor $j,$j # j=0 332 mov $n0,$m1 333 mov (%rsp),$lo0 334 335 mulq $m0 # ap[0]*bp[i] 336 add %rax,$lo0 # ap[0]*bp[i]+tp[0] 337 mov ($np),%rax 338 adc \$0,%rdx 339 340 imulq $lo0,$m1 # tp[0]*n0 341 mov %rdx,$hi0 342 343 mulq $m1 # np[0]*m1 344 add %rax,$lo0 # discarded 345 mov 8($ap),%rax 346 adc \$0,%rdx 347 mov 8(%rsp),$lo0 # tp[1] 348 mov %rdx,$hi1 349 350 lea 1($j),$j # j++ 351 jmp .Linner_enter 352 353.align 16 354.Linner: 355 add %rax,$hi1 356 mov ($ap,$j,8),%rax 357 adc \$0,%rdx 358 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 359 mov (%rsp,$j,8),$lo0 360 adc \$0,%rdx 361 mov $hi1,-16(%rsp,$j,8) # tp[j-1] 362 mov %rdx,$hi1 363 364.Linner_enter: 365 mulq $m0 # ap[j]*bp[i] 
366 add %rax,$hi0 367 mov ($np,$j,8),%rax 368 adc \$0,%rdx 369 add $hi0,$lo0 # ap[j]*bp[i]+tp[j] 370 mov %rdx,$hi0 371 adc \$0,$hi0 372 lea 1($j),$j # j++ 373 374 mulq $m1 # np[j]*m1 375 cmp $num,$j 376 jne .Linner # note that upon exit $j==$num, so 377 # they can be used interchangeably 378 add %rax,$hi1 379 adc \$0,%rdx 380 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] 381 mov (%rsp,$num,8),$lo0 382 adc \$0,%rdx 383 mov $hi1,-16(%rsp,$num,8) # tp[num-1] 384 mov %rdx,$hi1 385 386 xor %rdx,%rdx 387 add $hi0,$hi1 388 adc \$0,%rdx 389 add $lo0,$hi1 # pull upmost overflow bit 390 adc \$0,%rdx 391 mov $hi1,-8(%rsp,$num,8) 392 mov %rdx,(%rsp,$num,8) # store upmost overflow bit 393 394 lea 1($i),$i # i++ 395 cmp $num,$i 396 jb .Louter 397 398 xor $i,$i # i=0 and clear CF! 399 mov (%rsp),%rax # tp[0] 400 lea (%rsp),$ap # borrow ap for tp 401 mov $num,$j # j=num 402 jmp .Lsub 403.align 16 404.Lsub: sbb ($np,$i,8),%rax 405 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] 406 mov 8($ap,$i,8),%rax # tp[i+1] 407 lea 1($i),$i # i++ 408 dec $j # doesn't affect CF! 409 jnz .Lsub 410 411 sbb \$0,%rax # handle upmost overflow bit 412 mov \$-1,%rbx 413 xor %rax,%rbx 414 xor $i,$i 415 mov $num,$j # j=num 416 417.Lcopy: # conditional copy 418 mov ($rp,$i,8),%rcx 419 mov (%rsp,$i,8),%rdx 420 and %rbx,%rcx 421 and %rax,%rdx 422 mov $i,(%rsp,$i,8) # zap temporary vector 423 or %rcx,%rdx 424 mov %rdx,($rp,$i,8) # rp[i]=tp[i] 425 lea 1($i),$i 426 sub \$1,$j 427 jnz .Lcopy 428 429 mov 8(%rsp,$num,8),%rsi # restore %rsp 430.cfi_def_cfa %rsi,8 431 mov \$1,%rax 432 433 mov -48(%rsi),%r15 434.cfi_restore %r15 435 mov -40(%rsi),%r14 436.cfi_restore %r14 437 mov -32(%rsi),%r13 438.cfi_restore %r13 439 mov -24(%rsi),%r12 440.cfi_restore %r12 441 mov -16(%rsi),%rbp 442.cfi_restore %rbp 443 mov -8(%rsi),%rbx 444.cfi_restore %rbx 445 lea (%rsi),%rsp 446.cfi_def_cfa_register %rsp 447.Lmul_epilogue: 448 ret 449.cfi_endproc 450.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 451___ 452{{{ 453my @A=("%r10","%r11"); 454my @N=("%r13","%rdi"); 455$code.=<<___; 456.type bn_mul4x_mont_gather5,\@function,6 457.align 32 458bn_mul4x_mont_gather5: 459.cfi_startproc 460 .byte 0x67 461 mov %rsp,%rax 462.cfi_def_cfa_register %rax 463.Lmul4x_enter: 464___ 465$code.=<<___ if ($addx); 466 and \$0x80108,%r11d 467 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 468 je .Lmulx4x_enter 469___ 470$code.=<<___; 471 push %rbx 472.cfi_push %rbx 473 push %rbp 474.cfi_push %rbp 475 push %r12 476.cfi_push %r12 477 push %r13 478.cfi_push %r13 479 push %r14 480.cfi_push %r14 481 push %r15 482.cfi_push %r15 483.Lmul4x_prologue: 484 485 .byte 0x67 486 shl \$3,${num}d # convert $num to bytes 487 lea ($num,$num,2),%r10 # 3*$num in bytes 488 neg $num # -$num 489 490 ############################################################## 491 # Ensure that stack frame doesn't alias with $rptr+3*$num 492 # modulo 4096, which covers ret[num], am[num] and n[num] 493 # (see bn_exp.c). This is done to allow memory disambiguation 494 # logic do its magic. [Extra [num] is allocated in order 495 # to align with bn_power5's frame, which is cleansed after 496 # completing exponentiation. Extra 256 bytes is for power mask 497 # calculated from 7th argument, the index.] 
498 # 499 lea -320(%rsp,$num,2),%r11 500 mov %rsp,%rbp 501 sub $rp,%r11 502 and \$4095,%r11 503 cmp %r11,%r10 504 jb .Lmul4xsp_alt 505 sub %r11,%rbp # align with $rp 506 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 507 jmp .Lmul4xsp_done 508 509.align 32 510.Lmul4xsp_alt: 511 lea 4096-320(,$num,2),%r10 512 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 513 sub %r10,%r11 514 mov \$0,%r10 515 cmovc %r10,%r11 516 sub %r11,%rbp 517.Lmul4xsp_done: 518 and \$-64,%rbp 519 mov %rsp,%r11 520 sub %rbp,%r11 521 and \$-4096,%r11 522 lea (%rbp,%r11),%rsp 523 mov (%rsp),%r10 524 cmp %rbp,%rsp 525 ja .Lmul4x_page_walk 526 jmp .Lmul4x_page_walk_done 527 528.Lmul4x_page_walk: 529 lea -4096(%rsp),%rsp 530 mov (%rsp),%r10 531 cmp %rbp,%rsp 532 ja .Lmul4x_page_walk 533.Lmul4x_page_walk_done: 534 535 neg $num 536 537 mov %rax,40(%rsp) 538.cfi_cfa_expression %rsp+40,deref,+8 539.Lmul4x_body: 540 541 call mul4x_internal 542 543 mov 40(%rsp),%rsi # restore %rsp 544.cfi_def_cfa %rsi,8 545 mov \$1,%rax 546 547 mov -48(%rsi),%r15 548.cfi_restore %r15 549 mov -40(%rsi),%r14 550.cfi_restore %r14 551 mov -32(%rsi),%r13 552.cfi_restore %r13 553 mov -24(%rsi),%r12 554.cfi_restore %r12 555 mov -16(%rsi),%rbp 556.cfi_restore %rbp 557 mov -8(%rsi),%rbx 558.cfi_restore %rbx 559 lea (%rsi),%rsp 560.cfi_def_cfa_register %rsp 561.Lmul4x_epilogue: 562 ret 563.cfi_endproc 564.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 565 566.type mul4x_internal,\@abi-omnipotent 567.align 32 568mul4x_internal: 569.cfi_startproc 570 shl \$5,$num # $num was in bytes 571 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index 572 lea .Linc(%rip),%rax 573 lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) 574 shr \$5,$num # restore $num 575___ 576 $bp="%r12"; 577 $STRIDE=2**5*8; # 5 is "window size" 578 $N=$STRIDE/4; # should match cache line size 579 $tp=$i; 580$code.=<<___; 581 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 582 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 583 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) 584 lea 128(%rdx),$bp # size optimization 585 586 pshufd \$0,%xmm5,%xmm5 # broadcast index 587 movdqa %xmm1,%xmm4 588 .byte 0x67,0x67 589 movdqa %xmm1,%xmm2 590___ 591######################################################################## 592# calculate mask by comparing 0..31 to index and save result to stack 593# 594$code.=<<___; 595 paddd %xmm0,%xmm1 596 pcmpeqd %xmm5,%xmm0 # compare to 1,0 597 .byte 0x67 598 movdqa %xmm4,%xmm3 599___ 600for($i=0;$i<$STRIDE/16-4;$i+=4) { 601$code.=<<___; 602 paddd %xmm1,%xmm2 603 pcmpeqd %xmm5,%xmm1 # compare to 3,2 604 movdqa %xmm0,`16*($i+0)+112`(%r10) 605 movdqa %xmm4,%xmm0 606 607 paddd %xmm2,%xmm3 608 pcmpeqd %xmm5,%xmm2 # compare to 5,4 609 movdqa %xmm1,`16*($i+1)+112`(%r10) 610 movdqa %xmm4,%xmm1 611 612 paddd %xmm3,%xmm0 613 pcmpeqd %xmm5,%xmm3 # compare to 7,6 614 movdqa %xmm2,`16*($i+2)+112`(%r10) 615 movdqa %xmm4,%xmm2 616 617 paddd %xmm0,%xmm1 618 pcmpeqd %xmm5,%xmm0 619 movdqa %xmm3,`16*($i+3)+112`(%r10) 620 movdqa %xmm4,%xmm3 621___ 622} 623$code.=<<___; # last iteration can be optimized 624 paddd %xmm1,%xmm2 625 pcmpeqd %xmm5,%xmm1 626 movdqa %xmm0,`16*($i+0)+112`(%r10) 627 628 paddd %xmm2,%xmm3 629 .byte 0x67 630 pcmpeqd %xmm5,%xmm2 631 movdqa %xmm1,`16*($i+1)+112`(%r10) 632 633 pcmpeqd %xmm5,%xmm3 634 movdqa %xmm2,`16*($i+2)+112`(%r10) 635 pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register 636 637 pand `16*($i+1)-128`($bp),%xmm1 638 pand 
`16*($i+2)-128`($bp),%xmm2 639 movdqa %xmm3,`16*($i+3)+112`(%r10) 640 pand `16*($i+3)-128`($bp),%xmm3 641 por %xmm2,%xmm0 642 por %xmm3,%xmm1 643___ 644for($i=0;$i<$STRIDE/16-4;$i+=4) { 645$code.=<<___; 646 movdqa `16*($i+0)-128`($bp),%xmm4 647 movdqa `16*($i+1)-128`($bp),%xmm5 648 movdqa `16*($i+2)-128`($bp),%xmm2 649 pand `16*($i+0)+112`(%r10),%xmm4 650 movdqa `16*($i+3)-128`($bp),%xmm3 651 pand `16*($i+1)+112`(%r10),%xmm5 652 por %xmm4,%xmm0 653 pand `16*($i+2)+112`(%r10),%xmm2 654 por %xmm5,%xmm1 655 pand `16*($i+3)+112`(%r10),%xmm3 656 por %xmm2,%xmm0 657 por %xmm3,%xmm1 658___ 659} 660$code.=<<___; 661 por %xmm1,%xmm0 662 pshufd \$0x4e,%xmm0,%xmm1 663 por %xmm1,%xmm0 664 lea $STRIDE($bp),$bp 665 movq %xmm0,$m0 # m0=bp[0] 666 667 mov %r13,16+8(%rsp) # save end of b[num] 668 mov $rp, 56+8(%rsp) # save $rp 669 670 mov ($n0),$n0 # pull n0[0] value 671 mov ($ap),%rax 672 lea ($ap,$num),$ap # end of a[num] 673 neg $num 674 675 mov $n0,$m1 676 mulq $m0 # ap[0]*bp[0] 677 mov %rax,$A[0] 678 mov ($np),%rax 679 680 imulq $A[0],$m1 # "tp[0]"*n0 681 lea 64+8(%rsp),$tp 682 mov %rdx,$A[1] 683 684 mulq $m1 # np[0]*m1 685 add %rax,$A[0] # discarded 686 mov 8($ap,$num),%rax 687 adc \$0,%rdx 688 mov %rdx,$N[1] 689 690 mulq $m0 691 add %rax,$A[1] 692 mov 8*1($np),%rax 693 adc \$0,%rdx 694 mov %rdx,$A[0] 695 696 mulq $m1 697 add %rax,$N[1] 698 mov 16($ap,$num),%rax 699 adc \$0,%rdx 700 add $A[1],$N[1] 701 lea 4*8($num),$j # j=4 702 lea 8*4($np),$np 703 adc \$0,%rdx 704 mov $N[1],($tp) 705 mov %rdx,$N[0] 706 jmp .L1st4x 707 708.align 32 709.L1st4x: 710 mulq $m0 # ap[j]*bp[0] 711 add %rax,$A[0] 712 mov -8*2($np),%rax 713 lea 32($tp),$tp 714 adc \$0,%rdx 715 mov %rdx,$A[1] 716 717 mulq $m1 # np[j]*m1 718 add %rax,$N[0] 719 mov -8($ap,$j),%rax 720 adc \$0,%rdx 721 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 722 adc \$0,%rdx 723 mov $N[0],-24($tp) # tp[j-1] 724 mov %rdx,$N[1] 725 726 mulq $m0 # ap[j]*bp[0] 727 add %rax,$A[1] 728 mov -8*1($np),%rax 729 adc \$0,%rdx 730 mov %rdx,$A[0] 731 732 mulq $m1 # np[j]*m1 733 add %rax,$N[1] 734 mov ($ap,$j),%rax 735 adc \$0,%rdx 736 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 737 adc \$0,%rdx 738 mov $N[1],-16($tp) # tp[j-1] 739 mov %rdx,$N[0] 740 741 mulq $m0 # ap[j]*bp[0] 742 add %rax,$A[0] 743 mov 8*0($np),%rax 744 adc \$0,%rdx 745 mov %rdx,$A[1] 746 747 mulq $m1 # np[j]*m1 748 add %rax,$N[0] 749 mov 8($ap,$j),%rax 750 adc \$0,%rdx 751 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 752 adc \$0,%rdx 753 mov $N[0],-8($tp) # tp[j-1] 754 mov %rdx,$N[1] 755 756 mulq $m0 # ap[j]*bp[0] 757 add %rax,$A[1] 758 mov 8*1($np),%rax 759 adc \$0,%rdx 760 mov %rdx,$A[0] 761 762 mulq $m1 # np[j]*m1 763 add %rax,$N[1] 764 mov 16($ap,$j),%rax 765 adc \$0,%rdx 766 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 767 lea 8*4($np),$np 768 adc \$0,%rdx 769 mov $N[1],($tp) # tp[j-1] 770 mov %rdx,$N[0] 771 772 add \$32,$j # j+=4 773 jnz .L1st4x 774 775 mulq $m0 # ap[j]*bp[0] 776 add %rax,$A[0] 777 mov -8*2($np),%rax 778 lea 32($tp),$tp 779 adc \$0,%rdx 780 mov %rdx,$A[1] 781 782 mulq $m1 # np[j]*m1 783 add %rax,$N[0] 784 mov -8($ap),%rax 785 adc \$0,%rdx 786 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] 787 adc \$0,%rdx 788 mov $N[0],-24($tp) # tp[j-1] 789 mov %rdx,$N[1] 790 791 mulq $m0 # ap[j]*bp[0] 792 add %rax,$A[1] 793 mov -8*1($np),%rax 794 adc \$0,%rdx 795 mov %rdx,$A[0] 796 797 mulq $m1 # np[j]*m1 798 add %rax,$N[1] 799 mov ($ap,$num),%rax # ap[0] 800 adc \$0,%rdx 801 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] 802 adc \$0,%rdx 803 mov $N[1],-16($tp) # tp[j-1] 804 mov %rdx,$N[0] 805 806 lea ($np,$num),$np # rewind $np 807 
808 xor $N[1],$N[1] 809 add $A[0],$N[0] 810 adc \$0,$N[1] 811 mov $N[0],-8($tp) 812 813 jmp .Louter4x 814 815.align 32 816.Louter4x: 817 lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) 818 pxor %xmm4,%xmm4 819 pxor %xmm5,%xmm5 820___ 821for($i=0;$i<$STRIDE/16;$i+=4) { 822$code.=<<___; 823 movdqa `16*($i+0)-128`($bp),%xmm0 824 movdqa `16*($i+1)-128`($bp),%xmm1 825 movdqa `16*($i+2)-128`($bp),%xmm2 826 movdqa `16*($i+3)-128`($bp),%xmm3 827 pand `16*($i+0)-128`(%rdx),%xmm0 828 pand `16*($i+1)-128`(%rdx),%xmm1 829 por %xmm0,%xmm4 830 pand `16*($i+2)-128`(%rdx),%xmm2 831 por %xmm1,%xmm5 832 pand `16*($i+3)-128`(%rdx),%xmm3 833 por %xmm2,%xmm4 834 por %xmm3,%xmm5 835___ 836} 837$code.=<<___; 838 por %xmm5,%xmm4 839 pshufd \$0x4e,%xmm4,%xmm0 840 por %xmm4,%xmm0 841 lea $STRIDE($bp),$bp 842 movq %xmm0,$m0 # m0=bp[i] 843 844 mov ($tp,$num),$A[0] 845 mov $n0,$m1 846 mulq $m0 # ap[0]*bp[i] 847 add %rax,$A[0] # ap[0]*bp[i]+tp[0] 848 mov ($np),%rax 849 adc \$0,%rdx 850 851 imulq $A[0],$m1 # tp[0]*n0 852 mov %rdx,$A[1] 853 mov $N[1],($tp) # store upmost overflow bit 854 855 lea ($tp,$num),$tp # rewind $tp 856 857 mulq $m1 # np[0]*m1 858 add %rax,$A[0] # "$N[0]", discarded 859 mov 8($ap,$num),%rax 860 adc \$0,%rdx 861 mov %rdx,$N[1] 862 863 mulq $m0 # ap[j]*bp[i] 864 add %rax,$A[1] 865 mov 8*1($np),%rax 866 adc \$0,%rdx 867 add 8($tp),$A[1] # +tp[1] 868 adc \$0,%rdx 869 mov %rdx,$A[0] 870 871 mulq $m1 # np[j]*m1 872 add %rax,$N[1] 873 mov 16($ap,$num),%rax 874 adc \$0,%rdx 875 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] 876 lea 4*8($num),$j # j=4 877 lea 8*4($np),$np 878 adc \$0,%rdx 879 mov %rdx,$N[0] 880 jmp .Linner4x 881 882.align 32 883.Linner4x: 884 mulq $m0 # ap[j]*bp[i] 885 add %rax,$A[0] 886 mov -8*2($np),%rax 887 adc \$0,%rdx 888 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 889 lea 32($tp),$tp 890 adc \$0,%rdx 891 mov %rdx,$A[1] 892 893 mulq $m1 # np[j]*m1 894 add %rax,$N[0] 895 mov -8($ap,$j),%rax 896 adc \$0,%rdx 897 add $A[0],$N[0] 898 adc \$0,%rdx 899 mov $N[1],-32($tp) # tp[j-1] 900 mov %rdx,$N[1] 901 902 mulq $m0 # ap[j]*bp[i] 903 add %rax,$A[1] 904 mov -8*1($np),%rax 905 adc \$0,%rdx 906 add -8($tp),$A[1] 907 adc \$0,%rdx 908 mov %rdx,$A[0] 909 910 mulq $m1 # np[j]*m1 911 add %rax,$N[1] 912 mov ($ap,$j),%rax 913 adc \$0,%rdx 914 add $A[1],$N[1] 915 adc \$0,%rdx 916 mov $N[0],-24($tp) # tp[j-1] 917 mov %rdx,$N[0] 918 919 mulq $m0 # ap[j]*bp[i] 920 add %rax,$A[0] 921 mov 8*0($np),%rax 922 adc \$0,%rdx 923 add ($tp),$A[0] # ap[j]*bp[i]+tp[j] 924 adc \$0,%rdx 925 mov %rdx,$A[1] 926 927 mulq $m1 # np[j]*m1 928 add %rax,$N[0] 929 mov 8($ap,$j),%rax 930 adc \$0,%rdx 931 add $A[0],$N[0] 932 adc \$0,%rdx 933 mov $N[1],-16($tp) # tp[j-1] 934 mov %rdx,$N[1] 935 936 mulq $m0 # ap[j]*bp[i] 937 add %rax,$A[1] 938 mov 8*1($np),%rax 939 adc \$0,%rdx 940 add 8($tp),$A[1] 941 adc \$0,%rdx 942 mov %rdx,$A[0] 943 944 mulq $m1 # np[j]*m1 945 add %rax,$N[1] 946 mov 16($ap,$j),%rax 947 adc \$0,%rdx 948 add $A[1],$N[1] 949 lea 8*4($np),$np 950 adc \$0,%rdx 951 mov $N[0],-8($tp) # tp[j-1] 952 mov %rdx,$N[0] 953 954 add \$32,$j # j+=4 955 jnz .Linner4x 956 957 mulq $m0 # ap[j]*bp[i] 958 add %rax,$A[0] 959 mov -8*2($np),%rax 960 adc \$0,%rdx 961 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] 962 lea 32($tp),$tp 963 adc \$0,%rdx 964 mov %rdx,$A[1] 965 966 mulq $m1 # np[j]*m1 967 add %rax,$N[0] 968 mov -8($ap),%rax 969 adc \$0,%rdx 970 add $A[0],$N[0] 971 adc \$0,%rdx 972 mov $N[1],-32($tp) # tp[j-1] 973 mov %rdx,$N[1] 974 975 mulq $m0 # ap[j]*bp[i] 976 add %rax,$A[1] 977 mov $m1,%rax 978 mov -8*1($np),$m1 979 
adc \$0,%rdx 980 add -8($tp),$A[1] 981 adc \$0,%rdx 982 mov %rdx,$A[0] 983 984 mulq $m1 # np[j]*m1 985 add %rax,$N[1] 986 mov ($ap,$num),%rax # ap[0] 987 adc \$0,%rdx 988 add $A[1],$N[1] 989 adc \$0,%rdx 990 mov $N[0],-24($tp) # tp[j-1] 991 mov %rdx,$N[0] 992 993 mov $N[1],-16($tp) # tp[j-1] 994 lea ($np,$num),$np # rewind $np 995 996 xor $N[1],$N[1] 997 add $A[0],$N[0] 998 adc \$0,$N[1] 999 add ($tp),$N[0] # pull upmost overflow bit 1000 adc \$0,$N[1] # upmost overflow bit 1001 mov $N[0],-8($tp) 1002 1003 cmp 16+8(%rsp),$bp 1004 jb .Louter4x 1005___ 1006if (1) { 1007$code.=<<___; 1008 xor %rax,%rax 1009 sub $N[0],$m1 # compare top-most words 1010 adc $j,$j # $j is zero 1011 or $j,$N[1] 1012 sub $N[1],%rax # %rax=-$N[1] 1013 lea ($tp,$num),%rbx # tptr in .sqr4x_sub 1014 mov ($np),%r12 1015 lea ($np),%rbp # nptr in .sqr4x_sub 1016 mov %r9,%rcx 1017 sar \$3+2,%rcx 1018 mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub 1019 dec %r12 # so that after 'not' we get -n[0] 1020 xor %r10,%r10 1021 mov 8*1(%rbp),%r13 1022 mov 8*2(%rbp),%r14 1023 mov 8*3(%rbp),%r15 1024 jmp .Lsqr4x_sub_entry 1025___ 1026} else { 1027my @ri=("%rax",$bp,$m0,$m1); 1028my $rp="%rdx"; 1029$code.=<<___ 1030 xor \$1,$N[1] 1031 lea ($tp,$num),$tp # rewind $tp 1032 sar \$5,$num # cf=0 1033 lea ($np,$N[1],8),$np 1034 mov 56+8(%rsp),$rp # restore $rp 1035 jmp .Lsub4x 1036 1037.align 32 1038.Lsub4x: 1039 .byte 0x66 1040 mov 8*0($tp),@ri[0] 1041 mov 8*1($tp),@ri[1] 1042 .byte 0x66 1043 sbb 16*0($np),@ri[0] 1044 mov 8*2($tp),@ri[2] 1045 sbb 16*1($np),@ri[1] 1046 mov 3*8($tp),@ri[3] 1047 lea 4*8($tp),$tp 1048 sbb 16*2($np),@ri[2] 1049 mov @ri[0],8*0($rp) 1050 sbb 16*3($np),@ri[3] 1051 lea 16*4($np),$np 1052 mov @ri[1],8*1($rp) 1053 mov @ri[2],8*2($rp) 1054 mov @ri[3],8*3($rp) 1055 lea 8*4($rp),$rp 1056 1057 inc $num 1058 jnz .Lsub4x 1059 1060 ret 1061___ 1062} 1063$code.=<<___; 1064.cfi_endproc 1065.size mul4x_internal,.-mul4x_internal 1066___ 1067}}} 1068{{{ 1069###################################################################### 1070# void bn_power5( 1071my $rptr="%rdi"; # BN_ULONG *rptr, 1072my $aptr="%rsi"; # const BN_ULONG *aptr, 1073my $bptr="%rdx"; # const BN_ULONG *table, 1074my $nptr="%rcx"; # const BN_ULONG *nptr, 1075my $n0 ="%r8"; # const BN_ULONG *n0); 1076my $num ="%r9"; # int num, has to be divisible by 8 1077 # int pwr 1078 1079my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 1080my @A0=("%r10","%r11"); 1081my @A1=("%r12","%r13"); 1082my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 1083 1084$code.=<<___; 1085.globl bn_power5 1086.type bn_power5,\@function,6 1087.align 32 1088bn_power5: 1089.cfi_startproc 1090 mov %rsp,%rax 1091.cfi_def_cfa_register %rax 1092___ 1093$code.=<<___ if ($addx); 1094 leaq OPENSSL_ia32cap_P(%rip),%r11 1095 mov 8(%r11),%r11d 1096 and \$0x80108,%r11d 1097 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 1098 je .Lpowerx5_enter 1099___ 1100$code.=<<___; 1101 push %rbx 1102.cfi_push %rbx 1103 push %rbp 1104.cfi_push %rbp 1105 push %r12 1106.cfi_push %r12 1107 push %r13 1108.cfi_push %r13 1109 push %r14 1110.cfi_push %r14 1111 push %r15 1112.cfi_push %r15 1113.Lpower5_prologue: 1114 1115 shl \$3,${num}d # convert $num to bytes 1116 lea ($num,$num,2),%r10d # 3*$num 1117 neg $num 1118 mov ($n0),$n0 # *n0 1119 1120 ############################################################## 1121 # Ensure that stack frame doesn't alias with $rptr+3*$num 1122 # modulo 4096, which covers ret[num], am[num] and n[num] 1123 # (see bn_exp.c). This is done to allow memory disambiguation 1124 # logic do its magic. 
[Extra 256 bytes is for power mask 1125 # calculated from 7th argument, the index.] 1126 # 1127 lea -320(%rsp,$num,2),%r11 1128 mov %rsp,%rbp 1129 sub $rptr,%r11 1130 and \$4095,%r11 1131 cmp %r11,%r10 1132 jb .Lpwr_sp_alt 1133 sub %r11,%rbp # align with $aptr 1134 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1135 jmp .Lpwr_sp_done 1136 1137.align 32 1138.Lpwr_sp_alt: 1139 lea 4096-320(,$num,2),%r10 1140 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) 1141 sub %r10,%r11 1142 mov \$0,%r10 1143 cmovc %r10,%r11 1144 sub %r11,%rbp 1145.Lpwr_sp_done: 1146 and \$-64,%rbp 1147 mov %rsp,%r11 1148 sub %rbp,%r11 1149 and \$-4096,%r11 1150 lea (%rbp,%r11),%rsp 1151 mov (%rsp),%r10 1152 cmp %rbp,%rsp 1153 ja .Lpwr_page_walk 1154 jmp .Lpwr_page_walk_done 1155 1156.Lpwr_page_walk: 1157 lea -4096(%rsp),%rsp 1158 mov (%rsp),%r10 1159 cmp %rbp,%rsp 1160 ja .Lpwr_page_walk 1161.Lpwr_page_walk_done: 1162 1163 mov $num,%r10 1164 neg $num 1165 1166 ############################################################## 1167 # Stack layout 1168 # 1169 # +0 saved $num, used in reduction section 1170 # +8 &t[2*$num], used in reduction section 1171 # +32 saved *n0 1172 # +40 saved %rsp 1173 # +48 t[2*$num] 1174 # 1175 mov $n0, 32(%rsp) 1176 mov %rax, 40(%rsp) # save original %rsp 1177.cfi_cfa_expression %rsp+40,deref,+8 1178.Lpower5_body: 1179 movq $rptr,%xmm1 # save $rptr, used in sqr8x 1180 movq $nptr,%xmm2 # save $nptr 1181 movq %r10, %xmm3 # -$num, used in sqr8x 1182 movq $bptr,%xmm4 1183 1184 call __bn_sqr8x_internal 1185 call __bn_post4x_internal 1186 call __bn_sqr8x_internal 1187 call __bn_post4x_internal 1188 call __bn_sqr8x_internal 1189 call __bn_post4x_internal 1190 call __bn_sqr8x_internal 1191 call __bn_post4x_internal 1192 call __bn_sqr8x_internal 1193 call __bn_post4x_internal 1194 1195 movq %xmm2,$nptr 1196 movq %xmm4,$bptr 1197 mov $aptr,$rptr 1198 mov 40(%rsp),%rax 1199 lea 32(%rsp),$n0 1200 1201 call mul4x_internal 1202 1203 mov 40(%rsp),%rsi # restore %rsp 1204.cfi_def_cfa %rsi,8 1205 mov \$1,%rax 1206 mov -48(%rsi),%r15 1207.cfi_restore %r15 1208 mov -40(%rsi),%r14 1209.cfi_restore %r14 1210 mov -32(%rsi),%r13 1211.cfi_restore %r13 1212 mov -24(%rsi),%r12 1213.cfi_restore %r12 1214 mov -16(%rsi),%rbp 1215.cfi_restore %rbp 1216 mov -8(%rsi),%rbx 1217.cfi_restore %rbx 1218 lea (%rsi),%rsp 1219.cfi_def_cfa_register %rsp 1220.Lpower5_epilogue: 1221 ret 1222.cfi_endproc 1223.size bn_power5,.-bn_power5 1224 1225.globl bn_sqr8x_internal 1226.hidden bn_sqr8x_internal 1227.type bn_sqr8x_internal,\@abi-omnipotent 1228.align 32 1229bn_sqr8x_internal: 1230__bn_sqr8x_internal: 1231.cfi_startproc 1232 ############################################################## 1233 # Squaring part: 1234 # 1235 # a) multiply-n-add everything but a[i]*a[i]; 1236 # b) shift result of a) by 1 to the left and accumulate 1237 # a[i]*a[i] products; 1238 # 1239 ############################################################## 1240 # a[1]a[0] 1241 # a[2]a[0] 1242 # a[3]a[0] 1243 # a[2]a[1] 1244 # a[4]a[0] 1245 # a[3]a[1] 1246 # a[5]a[0] 1247 # a[4]a[1] 1248 # a[3]a[2] 1249 # a[6]a[0] 1250 # a[5]a[1] 1251 # a[4]a[2] 1252 # a[7]a[0] 1253 # a[6]a[1] 1254 # a[5]a[2] 1255 # a[4]a[3] 1256 # a[7]a[1] 1257 # a[6]a[2] 1258 # a[5]a[3] 1259 # a[7]a[2] 1260 # a[6]a[3] 1261 # a[5]a[4] 1262 # a[7]a[3] 1263 # a[6]a[4] 1264 # a[7]a[4] 1265 # a[6]a[5] 1266 # a[7]a[5] 1267 # a[7]a[6] 1268 # a[1]a[0] 1269 # a[2]a[0] 1270 # a[3]a[0] 1271 # a[4]a[0] 1272 # a[5]a[0] 1273 # a[6]a[0] 1274 # a[7]a[0] 1275 # a[2]a[1] 1276 # a[3]a[1] 
1277 # a[4]a[1] 1278 # a[5]a[1] 1279 # a[6]a[1] 1280 # a[7]a[1] 1281 # a[3]a[2] 1282 # a[4]a[2] 1283 # a[5]a[2] 1284 # a[6]a[2] 1285 # a[7]a[2] 1286 # a[4]a[3] 1287 # a[5]a[3] 1288 # a[6]a[3] 1289 # a[7]a[3] 1290 # a[5]a[4] 1291 # a[6]a[4] 1292 # a[7]a[4] 1293 # a[6]a[5] 1294 # a[7]a[5] 1295 # a[7]a[6] 1296 # a[0]a[0] 1297 # a[1]a[1] 1298 # a[2]a[2] 1299 # a[3]a[3] 1300 # a[4]a[4] 1301 # a[5]a[5] 1302 # a[6]a[6] 1303 # a[7]a[7] 1304 1305 lea 32(%r10),$i # $i=-($num-32) 1306 lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] 1307 1308 mov $num,$j # $j=$num 1309 1310 # comments apply to $num==8 case 1311 mov -32($aptr,$i),$a0 # a[0] 1312 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1313 mov -24($aptr,$i),%rax # a[1] 1314 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1315 mov -16($aptr,$i),$ai # a[2] 1316 mov %rax,$a1 1317 1318 mul $a0 # a[1]*a[0] 1319 mov %rax,$A0[0] # a[1]*a[0] 1320 mov $ai,%rax # a[2] 1321 mov %rdx,$A0[1] 1322 mov $A0[0],-24($tptr,$i) # t[1] 1323 1324 mul $a0 # a[2]*a[0] 1325 add %rax,$A0[1] 1326 mov $ai,%rax 1327 adc \$0,%rdx 1328 mov $A0[1],-16($tptr,$i) # t[2] 1329 mov %rdx,$A0[0] 1330 1331 1332 mov -8($aptr,$i),$ai # a[3] 1333 mul $a1 # a[2]*a[1] 1334 mov %rax,$A1[0] # a[2]*a[1]+t[3] 1335 mov $ai,%rax 1336 mov %rdx,$A1[1] 1337 1338 lea ($i),$j 1339 mul $a0 # a[3]*a[0] 1340 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1341 mov $ai,%rax 1342 mov %rdx,$A0[1] 1343 adc \$0,$A0[1] 1344 add $A1[0],$A0[0] 1345 adc \$0,$A0[1] 1346 mov $A0[0],-8($tptr,$j) # t[3] 1347 jmp .Lsqr4x_1st 1348 1349.align 32 1350.Lsqr4x_1st: 1351 mov ($aptr,$j),$ai # a[4] 1352 mul $a1 # a[3]*a[1] 1353 add %rax,$A1[1] # a[3]*a[1]+t[4] 1354 mov $ai,%rax 1355 mov %rdx,$A1[0] 1356 adc \$0,$A1[0] 1357 1358 mul $a0 # a[4]*a[0] 1359 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1360 mov $ai,%rax # a[3] 1361 mov 8($aptr,$j),$ai # a[5] 1362 mov %rdx,$A0[0] 1363 adc \$0,$A0[0] 1364 add $A1[1],$A0[1] 1365 adc \$0,$A0[0] 1366 1367 1368 mul $a1 # a[4]*a[3] 1369 add %rax,$A1[0] # a[4]*a[3]+t[5] 1370 mov $ai,%rax 1371 mov $A0[1],($tptr,$j) # t[4] 1372 mov %rdx,$A1[1] 1373 adc \$0,$A1[1] 1374 1375 mul $a0 # a[5]*a[2] 1376 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1377 mov $ai,%rax 1378 mov 16($aptr,$j),$ai # a[6] 1379 mov %rdx,$A0[1] 1380 adc \$0,$A0[1] 1381 add $A1[0],$A0[0] 1382 adc \$0,$A0[1] 1383 1384 mul $a1 # a[5]*a[3] 1385 add %rax,$A1[1] # a[5]*a[3]+t[6] 1386 mov $ai,%rax 1387 mov $A0[0],8($tptr,$j) # t[5] 1388 mov %rdx,$A1[0] 1389 adc \$0,$A1[0] 1390 1391 mul $a0 # a[6]*a[2] 1392 add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] 1393 mov $ai,%rax # a[3] 1394 mov 24($aptr,$j),$ai # a[7] 1395 mov %rdx,$A0[0] 1396 adc \$0,$A0[0] 1397 add $A1[1],$A0[1] 1398 adc \$0,$A0[0] 1399 1400 1401 mul $a1 # a[6]*a[5] 1402 add %rax,$A1[0] # a[6]*a[5]+t[7] 1403 mov $ai,%rax 1404 mov $A0[1],16($tptr,$j) # t[6] 1405 mov %rdx,$A1[1] 1406 adc \$0,$A1[1] 1407 lea 32($j),$j 1408 1409 mul $a0 # a[7]*a[4] 1410 add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] 1411 mov $ai,%rax 1412 mov %rdx,$A0[1] 1413 adc \$0,$A0[1] 1414 add $A1[0],$A0[0] 1415 adc \$0,$A0[1] 1416 mov $A0[0],-8($tptr,$j) # t[7] 1417 1418 cmp \$0,$j 1419 jne .Lsqr4x_1st 1420 1421 mul $a1 # a[7]*a[5] 1422 add %rax,$A1[1] 1423 lea 16($i),$i 1424 adc \$0,%rdx 1425 add $A0[1],$A1[1] 1426 adc \$0,%rdx 1427 1428 mov $A1[1],($tptr) # t[8] 1429 mov %rdx,$A1[0] 1430 mov %rdx,8($tptr) # t[9] 1431 jmp .Lsqr4x_outer 1432 1433.align 32 1434.Lsqr4x_outer: # comments apply to $num==6 case 1435 mov -32($aptr,$i),$a0 # a[0] 1436 lea 48+8(%rsp,$num,2),$tptr # 
end of tp[] buffer, &tp[2*$num] 1437 mov -24($aptr,$i),%rax # a[1] 1438 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1439 mov -16($aptr,$i),$ai # a[2] 1440 mov %rax,$a1 1441 1442 mul $a0 # a[1]*a[0] 1443 mov -24($tptr,$i),$A0[0] # t[1] 1444 add %rax,$A0[0] # a[1]*a[0]+t[1] 1445 mov $ai,%rax # a[2] 1446 adc \$0,%rdx 1447 mov $A0[0],-24($tptr,$i) # t[1] 1448 mov %rdx,$A0[1] 1449 1450 mul $a0 # a[2]*a[0] 1451 add %rax,$A0[1] 1452 mov $ai,%rax 1453 adc \$0,%rdx 1454 add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] 1455 mov %rdx,$A0[0] 1456 adc \$0,$A0[0] 1457 mov $A0[1],-16($tptr,$i) # t[2] 1458 1459 xor $A1[0],$A1[0] 1460 1461 mov -8($aptr,$i),$ai # a[3] 1462 mul $a1 # a[2]*a[1] 1463 add %rax,$A1[0] # a[2]*a[1]+t[3] 1464 mov $ai,%rax 1465 adc \$0,%rdx 1466 add -8($tptr,$i),$A1[0] 1467 mov %rdx,$A1[1] 1468 adc \$0,$A1[1] 1469 1470 mul $a0 # a[3]*a[0] 1471 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1472 mov $ai,%rax 1473 adc \$0,%rdx 1474 add $A1[0],$A0[0] 1475 mov %rdx,$A0[1] 1476 adc \$0,$A0[1] 1477 mov $A0[0],-8($tptr,$i) # t[3] 1478 1479 lea ($i),$j 1480 jmp .Lsqr4x_inner 1481 1482.align 32 1483.Lsqr4x_inner: 1484 mov ($aptr,$j),$ai # a[4] 1485 mul $a1 # a[3]*a[1] 1486 add %rax,$A1[1] # a[3]*a[1]+t[4] 1487 mov $ai,%rax 1488 mov %rdx,$A1[0] 1489 adc \$0,$A1[0] 1490 add ($tptr,$j),$A1[1] 1491 adc \$0,$A1[0] 1492 1493 .byte 0x67 1494 mul $a0 # a[4]*a[0] 1495 add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] 1496 mov $ai,%rax # a[3] 1497 mov 8($aptr,$j),$ai # a[5] 1498 mov %rdx,$A0[0] 1499 adc \$0,$A0[0] 1500 add $A1[1],$A0[1] 1501 adc \$0,$A0[0] 1502 1503 mul $a1 # a[4]*a[3] 1504 add %rax,$A1[0] # a[4]*a[3]+t[5] 1505 mov $A0[1],($tptr,$j) # t[4] 1506 mov $ai,%rax 1507 mov %rdx,$A1[1] 1508 adc \$0,$A1[1] 1509 add 8($tptr,$j),$A1[0] 1510 lea 16($j),$j # j++ 1511 adc \$0,$A1[1] 1512 1513 mul $a0 # a[5]*a[2] 1514 add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] 1515 mov $ai,%rax 1516 adc \$0,%rdx 1517 add $A1[0],$A0[0] 1518 mov %rdx,$A0[1] 1519 adc \$0,$A0[1] 1520 mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below 1521 1522 cmp \$0,$j 1523 jne .Lsqr4x_inner 1524 1525 .byte 0x67 1526 mul $a1 # a[5]*a[3] 1527 add %rax,$A1[1] 1528 adc \$0,%rdx 1529 add $A0[1],$A1[1] 1530 adc \$0,%rdx 1531 1532 mov $A1[1],($tptr) # t[6], "preloaded t[2]" below 1533 mov %rdx,$A1[0] 1534 mov %rdx,8($tptr) # t[7], "preloaded t[3]" below 1535 1536 add \$16,$i 1537 jnz .Lsqr4x_outer 1538 1539 # comments apply to $num==4 case 1540 mov -32($aptr),$a0 # a[0] 1541 lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] 1542 mov -24($aptr),%rax # a[1] 1543 lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] 1544 mov -16($aptr),$ai # a[2] 1545 mov %rax,$a1 1546 1547 mul $a0 # a[1]*a[0] 1548 add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] 1549 mov $ai,%rax # a[2] 1550 mov %rdx,$A0[1] 1551 adc \$0,$A0[1] 1552 1553 mul $a0 # a[2]*a[0] 1554 add %rax,$A0[1] 1555 mov $ai,%rax 1556 mov $A0[0],-24($tptr) # t[1] 1557 mov %rdx,$A0[0] 1558 adc \$0,$A0[0] 1559 add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] 1560 mov -8($aptr),$ai # a[3] 1561 adc \$0,$A0[0] 1562 1563 mul $a1 # a[2]*a[1] 1564 add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] 1565 mov $ai,%rax 1566 mov $A0[1],-16($tptr) # t[2] 1567 mov %rdx,$A1[1] 1568 adc \$0,$A1[1] 1569 1570 mul $a0 # a[3]*a[0] 1571 add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] 1572 mov $ai,%rax 1573 mov %rdx,$A0[1] 1574 adc \$0,$A0[1] 1575 add $A1[0],$A0[0] 1576 adc \$0,$A0[1] 1577 mov $A0[0],-8($tptr) # t[3] 1578 1579 mul $a1 # a[3]*a[1] 1580 add %rax,$A1[1] 1581 mov -16($aptr),%rax # 
a[2] 1582 adc \$0,%rdx 1583 add $A0[1],$A1[1] 1584 adc \$0,%rdx 1585 1586 mov $A1[1],($tptr) # t[4] 1587 mov %rdx,$A1[0] 1588 mov %rdx,8($tptr) # t[5] 1589 1590 mul $ai # a[2]*a[3] 1591___ 1592{ 1593my ($shift,$carry)=($a0,$a1); 1594my @S=(@A1,$ai,$n0); 1595$code.=<<___; 1596 add \$16,$i 1597 xor $shift,$shift 1598 sub $num,$i # $i=16-$num 1599 xor $carry,$carry 1600 1601 add $A1[0],%rax # t[5] 1602 adc \$0,%rdx 1603 mov %rax,8($tptr) # t[5] 1604 mov %rdx,16($tptr) # t[6] 1605 mov $carry,24($tptr) # t[7] 1606 1607 mov -16($aptr,$i),%rax # a[0] 1608 lea 48+8(%rsp),$tptr 1609 xor $A0[0],$A0[0] # t[0] 1610 mov 8($tptr),$A0[1] # t[1] 1611 1612 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1613 shr \$63,$A0[0] 1614 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1615 shr \$63,$A0[1] 1616 or $A0[0],$S[1] # | t[2*i]>>63 1617 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1618 mov $A0[1],$shift # shift=t[2*i+1]>>63 1619 mul %rax # a[i]*a[i] 1620 neg $carry # mov $carry,cf 1621 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1622 adc %rax,$S[0] 1623 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1624 mov $S[0],($tptr) 1625 adc %rdx,$S[1] 1626 1627 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1628 mov $S[1],8($tptr) 1629 sbb $carry,$carry # mov cf,$carry 1630 shr \$63,$A0[0] 1631 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1632 shr \$63,$A0[1] 1633 or $A0[0],$S[3] # | t[2*i]>>63 1634 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1635 mov $A0[1],$shift # shift=t[2*i+1]>>63 1636 mul %rax # a[i]*a[i] 1637 neg $carry # mov $carry,cf 1638 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1639 adc %rax,$S[2] 1640 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1641 mov $S[2],16($tptr) 1642 adc %rdx,$S[3] 1643 lea 16($i),$i 1644 mov $S[3],24($tptr) 1645 sbb $carry,$carry # mov cf,$carry 1646 lea 64($tptr),$tptr 1647 jmp .Lsqr4x_shift_n_add 1648 1649.align 32 1650.Lsqr4x_shift_n_add: 1651 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1652 shr \$63,$A0[0] 1653 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1654 shr \$63,$A0[1] 1655 or $A0[0],$S[1] # | t[2*i]>>63 1656 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1657 mov $A0[1],$shift # shift=t[2*i+1]>>63 1658 mul %rax # a[i]*a[i] 1659 neg $carry # mov $carry,cf 1660 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1661 adc %rax,$S[0] 1662 mov -8($aptr,$i),%rax # a[i+1] # prefetch 1663 mov $S[0],-32($tptr) 1664 adc %rdx,$S[1] 1665 1666 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1667 mov $S[1],-24($tptr) 1668 sbb $carry,$carry # mov cf,$carry 1669 shr \$63,$A0[0] 1670 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1671 shr \$63,$A0[1] 1672 or $A0[0],$S[3] # | t[2*i]>>63 1673 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch 1674 mov $A0[1],$shift # shift=t[2*i+1]>>63 1675 mul %rax # a[i]*a[i] 1676 neg $carry # mov $carry,cf 1677 mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1678 adc %rax,$S[2] 1679 mov 0($aptr,$i),%rax # a[i+1] # prefetch 1680 mov $S[2],-16($tptr) 1681 adc %rdx,$S[3] 1682 1683 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1684 mov $S[3],-8($tptr) 1685 sbb $carry,$carry # mov cf,$carry 1686 shr \$63,$A0[0] 1687 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1688 shr \$63,$A0[1] 1689 or $A0[0],$S[1] # | t[2*i]>>63 1690 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch 1691 mov $A0[1],$shift # shift=t[2*i+1]>>63 1692 mul %rax # a[i]*a[i] 1693 neg $carry # mov $carry,cf 1694 mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch 1695 adc %rax,$S[0] 1696 mov 8($aptr,$i),%rax # a[i+1] # prefetch 1697 mov $S[0],0($tptr) 1698 adc %rdx,$S[1] 1699 1700 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift 1701 mov $S[1],8($tptr) 1702 
sbb $carry,$carry # mov cf,$carry 1703 shr \$63,$A0[0] 1704 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1705 shr \$63,$A0[1] 1706 or $A0[0],$S[3] # | t[2*i]>>63 1707 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch 1708 mov $A0[1],$shift # shift=t[2*i+1]>>63 1709 mul %rax # a[i]*a[i] 1710 neg $carry # mov $carry,cf 1711 mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch 1712 adc %rax,$S[2] 1713 mov 16($aptr,$i),%rax # a[i+1] # prefetch 1714 mov $S[2],16($tptr) 1715 adc %rdx,$S[3] 1716 mov $S[3],24($tptr) 1717 sbb $carry,$carry # mov cf,$carry 1718 lea 64($tptr),$tptr 1719 add \$32,$i 1720 jnz .Lsqr4x_shift_n_add 1721 1722 lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift 1723 .byte 0x67 1724 shr \$63,$A0[0] 1725 lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | 1726 shr \$63,$A0[1] 1727 or $A0[0],$S[1] # | t[2*i]>>63 1728 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch 1729 mov $A0[1],$shift # shift=t[2*i+1]>>63 1730 mul %rax # a[i]*a[i] 1731 neg $carry # mov $carry,cf 1732 mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch 1733 adc %rax,$S[0] 1734 mov -8($aptr),%rax # a[i+1] # prefetch 1735 mov $S[0],-32($tptr) 1736 adc %rdx,$S[1] 1737 1738 lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift 1739 mov $S[1],-24($tptr) 1740 sbb $carry,$carry # mov cf,$carry 1741 shr \$63,$A0[0] 1742 lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | 1743 shr \$63,$A0[1] 1744 or $A0[0],$S[3] # | t[2*i]>>63 1745 mul %rax # a[i]*a[i] 1746 neg $carry # mov $carry,cf 1747 adc %rax,$S[2] 1748 adc %rdx,$S[3] 1749 mov $S[2],-16($tptr) 1750 mov $S[3],-8($tptr) 1751___ 1752} 1753###################################################################### 1754# Montgomery reduction part, "word-by-word" algorithm. 1755# 1756# This new path is inspired by multiple submissions from Intel, by 1757# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 1758# Vinodh Gopal... 
1759{ 1760my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); 1761 1762$code.=<<___; 1763 movq %xmm2,$nptr 1764__bn_sqr8x_reduction: 1765 xor %rax,%rax 1766 lea ($nptr,$num),%rcx # end of n[] 1767 lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer 1768 mov %rcx,0+8(%rsp) 1769 lea 48+8(%rsp,$num),$tptr # end of initial t[] window 1770 mov %rdx,8+8(%rsp) 1771 neg $num 1772 jmp .L8x_reduction_loop 1773 1774.align 32 1775.L8x_reduction_loop: 1776 lea ($tptr,$num),$tptr # start of current t[] window 1777 .byte 0x66 1778 mov 8*0($tptr),$m0 1779 mov 8*1($tptr),%r9 1780 mov 8*2($tptr),%r10 1781 mov 8*3($tptr),%r11 1782 mov 8*4($tptr),%r12 1783 mov 8*5($tptr),%r13 1784 mov 8*6($tptr),%r14 1785 mov 8*7($tptr),%r15 1786 mov %rax,(%rdx) # store top-most carry bit 1787 lea 8*8($tptr),$tptr 1788 1789 .byte 0x67 1790 mov $m0,%r8 1791 imulq 32+8(%rsp),$m0 # n0*a[0] 1792 mov 8*0($nptr),%rax # n[0] 1793 mov \$8,%ecx 1794 jmp .L8x_reduce 1795 1796.align 32 1797.L8x_reduce: 1798 mulq $m0 1799 mov 8*1($nptr),%rax # n[1] 1800 neg %r8 1801 mov %rdx,%r8 1802 adc \$0,%r8 1803 1804 mulq $m0 1805 add %rax,%r9 1806 mov 8*2($nptr),%rax 1807 adc \$0,%rdx 1808 add %r9,%r8 1809 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] 1810 mov %rdx,%r9 1811 adc \$0,%r9 1812 1813 mulq $m0 1814 add %rax,%r10 1815 mov 8*3($nptr),%rax 1816 adc \$0,%rdx 1817 add %r10,%r9 1818 mov 32+8(%rsp),$carry # pull n0, borrow $carry 1819 mov %rdx,%r10 1820 adc \$0,%r10 1821 1822 mulq $m0 1823 add %rax,%r11 1824 mov 8*4($nptr),%rax 1825 adc \$0,%rdx 1826 imulq %r8,$carry # modulo-scheduled 1827 add %r11,%r10 1828 mov %rdx,%r11 1829 adc \$0,%r11 1830 1831 mulq $m0 1832 add %rax,%r12 1833 mov 8*5($nptr),%rax 1834 adc \$0,%rdx 1835 add %r12,%r11 1836 mov %rdx,%r12 1837 adc \$0,%r12 1838 1839 mulq $m0 1840 add %rax,%r13 1841 mov 8*6($nptr),%rax 1842 adc \$0,%rdx 1843 add %r13,%r12 1844 mov %rdx,%r13 1845 adc \$0,%r13 1846 1847 mulq $m0 1848 add %rax,%r14 1849 mov 8*7($nptr),%rax 1850 adc \$0,%rdx 1851 add %r14,%r13 1852 mov %rdx,%r14 1853 adc \$0,%r14 1854 1855 mulq $m0 1856 mov $carry,$m0 # n0*a[i] 1857 add %rax,%r15 1858 mov 8*0($nptr),%rax # n[0] 1859 adc \$0,%rdx 1860 add %r15,%r14 1861 mov %rdx,%r15 1862 adc \$0,%r15 1863 1864 dec %ecx 1865 jnz .L8x_reduce 1866 1867 lea 8*8($nptr),$nptr 1868 xor %rax,%rax 1869 mov 8+8(%rsp),%rdx # pull end of t[] 1870 cmp 0+8(%rsp),$nptr # end of n[]? 
1871 jae .L8x_no_tail 1872 1873 .byte 0x66 1874 add 8*0($tptr),%r8 1875 adc 8*1($tptr),%r9 1876 adc 8*2($tptr),%r10 1877 adc 8*3($tptr),%r11 1878 adc 8*4($tptr),%r12 1879 adc 8*5($tptr),%r13 1880 adc 8*6($tptr),%r14 1881 adc 8*7($tptr),%r15 1882 sbb $carry,$carry # top carry 1883 1884 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1885 mov \$8,%ecx 1886 mov 8*0($nptr),%rax 1887 jmp .L8x_tail 1888 1889.align 32 1890.L8x_tail: 1891 mulq $m0 1892 add %rax,%r8 1893 mov 8*1($nptr),%rax 1894 mov %r8,($tptr) # save result 1895 mov %rdx,%r8 1896 adc \$0,%r8 1897 1898 mulq $m0 1899 add %rax,%r9 1900 mov 8*2($nptr),%rax 1901 adc \$0,%rdx 1902 add %r9,%r8 1903 lea 8($tptr),$tptr # $tptr++ 1904 mov %rdx,%r9 1905 adc \$0,%r9 1906 1907 mulq $m0 1908 add %rax,%r10 1909 mov 8*3($nptr),%rax 1910 adc \$0,%rdx 1911 add %r10,%r9 1912 mov %rdx,%r10 1913 adc \$0,%r10 1914 1915 mulq $m0 1916 add %rax,%r11 1917 mov 8*4($nptr),%rax 1918 adc \$0,%rdx 1919 add %r11,%r10 1920 mov %rdx,%r11 1921 adc \$0,%r11 1922 1923 mulq $m0 1924 add %rax,%r12 1925 mov 8*5($nptr),%rax 1926 adc \$0,%rdx 1927 add %r12,%r11 1928 mov %rdx,%r12 1929 adc \$0,%r12 1930 1931 mulq $m0 1932 add %rax,%r13 1933 mov 8*6($nptr),%rax 1934 adc \$0,%rdx 1935 add %r13,%r12 1936 mov %rdx,%r13 1937 adc \$0,%r13 1938 1939 mulq $m0 1940 add %rax,%r14 1941 mov 8*7($nptr),%rax 1942 adc \$0,%rdx 1943 add %r14,%r13 1944 mov %rdx,%r14 1945 adc \$0,%r14 1946 1947 mulq $m0 1948 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] 1949 add %rax,%r15 1950 adc \$0,%rdx 1951 add %r15,%r14 1952 mov 8*0($nptr),%rax # pull n[0] 1953 mov %rdx,%r15 1954 adc \$0,%r15 1955 1956 dec %ecx 1957 jnz .L8x_tail 1958 1959 lea 8*8($nptr),$nptr 1960 mov 8+8(%rsp),%rdx # pull end of t[] 1961 cmp 0+8(%rsp),$nptr # end of n[]? 1962 jae .L8x_tail_done # break out of loop 1963 1964 mov 48+56+8(%rsp),$m0 # pull n0*a[0] 1965 neg $carry 1966 mov 8*0($nptr),%rax # pull n[0] 1967 adc 8*0($tptr),%r8 1968 adc 8*1($tptr),%r9 1969 adc 8*2($tptr),%r10 1970 adc 8*3($tptr),%r11 1971 adc 8*4($tptr),%r12 1972 adc 8*5($tptr),%r13 1973 adc 8*6($tptr),%r14 1974 adc 8*7($tptr),%r15 1975 sbb $carry,$carry # top carry 1976 1977 mov \$8,%ecx 1978 jmp .L8x_tail 1979 1980.align 32 1981.L8x_tail_done: 1982 xor %rax,%rax 1983 add (%rdx),%r8 # can this overflow? 1984 adc \$0,%r9 1985 adc \$0,%r10 1986 adc \$0,%r11 1987 adc \$0,%r12 1988 adc \$0,%r13 1989 adc \$0,%r14 1990 adc \$0,%r15 1991 adc \$0,%rax 1992 1993 neg $carry 1994.L8x_no_tail: 1995 adc 8*0($tptr),%r8 1996 adc 8*1($tptr),%r9 1997 adc 8*2($tptr),%r10 1998 adc 8*3($tptr),%r11 1999 adc 8*4($tptr),%r12 2000 adc 8*5($tptr),%r13 2001 adc 8*6($tptr),%r14 2002 adc 8*7($tptr),%r15 2003 adc \$0,%rax # top-most carry 2004 mov -8($nptr),%rcx # np[num-1] 2005 xor $carry,$carry 2006 2007 movq %xmm2,$nptr # restore $nptr 2008 2009 mov %r8,8*0($tptr) # store top 512 bits 2010 mov %r9,8*1($tptr) 2011 movq %xmm3,$num # $num is %r9, can't be moved upwards 2012 mov %r10,8*2($tptr) 2013 mov %r11,8*3($tptr) 2014 mov %r12,8*4($tptr) 2015 mov %r13,8*5($tptr) 2016 mov %r14,8*6($tptr) 2017 mov %r15,8*7($tptr) 2018 lea 8*8($tptr),$tptr 2019 2020 cmp %rdx,$tptr # end of t[]? 
2021 jb .L8x_reduction_loop 2022 ret 2023.cfi_endproc 2024.size bn_sqr8x_internal,.-bn_sqr8x_internal 2025___ 2026} 2027############################################################## 2028# Post-condition, 4x unrolled 2029# 2030{ 2031my ($tptr,$nptr)=("%rbx","%rbp"); 2032$code.=<<___; 2033.type __bn_post4x_internal,\@abi-omnipotent 2034.align 32 2035__bn_post4x_internal: 2036.cfi_startproc 2037 mov 8*0($nptr),%r12 2038 lea (%rdi,$num),$tptr # %rdi was $tptr above 2039 mov $num,%rcx 2040 movq %xmm1,$rptr # restore $rptr 2041 neg %rax 2042 movq %xmm1,$aptr # prepare for back-to-back call 2043 sar \$3+2,%rcx 2044 dec %r12 # so that after 'not' we get -n[0] 2045 xor %r10,%r10 2046 mov 8*1($nptr),%r13 2047 mov 8*2($nptr),%r14 2048 mov 8*3($nptr),%r15 2049 jmp .Lsqr4x_sub_entry 2050 2051.align 16 2052.Lsqr4x_sub: 2053 mov 8*0($nptr),%r12 2054 mov 8*1($nptr),%r13 2055 mov 8*2($nptr),%r14 2056 mov 8*3($nptr),%r15 2057.Lsqr4x_sub_entry: 2058 lea 8*4($nptr),$nptr 2059 not %r12 2060 not %r13 2061 not %r14 2062 not %r15 2063 and %rax,%r12 2064 and %rax,%r13 2065 and %rax,%r14 2066 and %rax,%r15 2067 2068 neg %r10 # mov %r10,%cf 2069 adc 8*0($tptr),%r12 2070 adc 8*1($tptr),%r13 2071 adc 8*2($tptr),%r14 2072 adc 8*3($tptr),%r15 2073 mov %r12,8*0($rptr) 2074 lea 8*4($tptr),$tptr 2075 mov %r13,8*1($rptr) 2076 sbb %r10,%r10 # mov %cf,%r10 2077 mov %r14,8*2($rptr) 2078 mov %r15,8*3($rptr) 2079 lea 8*4($rptr),$rptr 2080 2081 inc %rcx # pass %cf 2082 jnz .Lsqr4x_sub 2083 2084 mov $num,%r10 # prepare for back-to-back call 2085 neg $num # restore $num 2086 ret 2087.cfi_endproc 2088.size __bn_post4x_internal,.-__bn_post4x_internal 2089___ 2090} 2091}}} 2092 2093if ($addx) {{{ 2094my $bp="%rdx"; # restore original value 2095 2096$code.=<<___; 2097.type bn_mulx4x_mont_gather5,\@function,6 2098.align 32 2099bn_mulx4x_mont_gather5: 2100.cfi_startproc 2101 mov %rsp,%rax 2102.cfi_def_cfa_register %rax 2103.Lmulx4x_enter: 2104 push %rbx 2105.cfi_push %rbx 2106 push %rbp 2107.cfi_push %rbp 2108 push %r12 2109.cfi_push %r12 2110 push %r13 2111.cfi_push %r13 2112 push %r14 2113.cfi_push %r14 2114 push %r15 2115.cfi_push %r15 2116.Lmulx4x_prologue: 2117 2118 shl \$3,${num}d # convert $num to bytes 2119 lea ($num,$num,2),%r10 # 3*$num in bytes 2120 neg $num # -$num 2121 mov ($n0),$n0 # *n0 2122 2123 ############################################################## 2124 # Ensure that stack frame doesn't alias with $rptr+3*$num 2125 # modulo 4096, which covers ret[num], am[num] and n[num] 2126 # (see bn_exp.c). This is done to allow memory disambiguation 2127 # logic do its magic. [Extra [num] is allocated in order 2128 # to align with bn_power5's frame, which is cleansed after 2129 # completing exponentiation. Extra 256 bytes is for power mask 2130 # calculated from 7th argument, the index.] 
2131 # 2132 lea -320(%rsp,$num,2),%r11 2133 mov %rsp,%rbp 2134 sub $rp,%r11 2135 and \$4095,%r11 2136 cmp %r11,%r10 2137 jb .Lmulx4xsp_alt 2138 sub %r11,%rbp # align with $aptr 2139 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2140 jmp .Lmulx4xsp_done 2141 2142.Lmulx4xsp_alt: 2143 lea 4096-320(,$num,2),%r10 2144 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2145 sub %r10,%r11 2146 mov \$0,%r10 2147 cmovc %r10,%r11 2148 sub %r11,%rbp 2149.Lmulx4xsp_done: 2150 and \$-64,%rbp # ensure alignment 2151 mov %rsp,%r11 2152 sub %rbp,%r11 2153 and \$-4096,%r11 2154 lea (%rbp,%r11),%rsp 2155 mov (%rsp),%r10 2156 cmp %rbp,%rsp 2157 ja .Lmulx4x_page_walk 2158 jmp .Lmulx4x_page_walk_done 2159 2160.Lmulx4x_page_walk: 2161 lea -4096(%rsp),%rsp 2162 mov (%rsp),%r10 2163 cmp %rbp,%rsp 2164 ja .Lmulx4x_page_walk 2165.Lmulx4x_page_walk_done: 2166 2167 ############################################################## 2168 # Stack layout 2169 # +0 -num 2170 # +8 off-loaded &b[i] 2171 # +16 end of b[num] 2172 # +24 inner counter 2173 # +32 saved n0 2174 # +40 saved %rsp 2175 # +48 2176 # +56 saved rp 2177 # +64 tmp[num+1] 2178 # 2179 mov $n0, 32(%rsp) # save *n0 2180 mov %rax,40(%rsp) # save original %rsp 2181.cfi_cfa_expression %rsp+40,deref,+8 2182.Lmulx4x_body: 2183 call mulx4x_internal 2184 2185 mov 40(%rsp),%rsi # restore %rsp 2186.cfi_def_cfa %rsi,8 2187 mov \$1,%rax 2188 2189 mov -48(%rsi),%r15 2190.cfi_restore %r15 2191 mov -40(%rsi),%r14 2192.cfi_restore %r14 2193 mov -32(%rsi),%r13 2194.cfi_restore %r13 2195 mov -24(%rsi),%r12 2196.cfi_restore %r12 2197 mov -16(%rsi),%rbp 2198.cfi_restore %rbp 2199 mov -8(%rsi),%rbx 2200.cfi_restore %rbx 2201 lea (%rsi),%rsp 2202.cfi_def_cfa_register %rsp 2203.Lmulx4x_epilogue: 2204 ret 2205.cfi_endproc 2206.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2207 2208.type mulx4x_internal,\@abi-omnipotent 2209.align 32 2210mulx4x_internal: 2211.cfi_startproc 2212 mov $num,8(%rsp) # save -$num (it was in bytes) 2213 mov $num,%r10 2214 neg $num # restore $num 2215 shl \$5,$num 2216 neg %r10 # restore $num 2217 lea 128($bp,$num),%r13 # end of powers table (+size optimization) 2218 shr \$5+5,$num 2219 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument 2220 sub \$1,$num 2221 lea .Linc(%rip),%rax 2222 mov %r13,16+8(%rsp) # end of b[num] 2223 mov $num,24+8(%rsp) # inner counter 2224 mov $rp, 56+8(%rsp) # save $rp 2225___ 2226my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= 2227 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); 2228my $rptr=$bptr; 2229my $STRIDE=2**5*8; # 5 is "window size" 2230my $N=$STRIDE/4; # should match cache line size 2231$code.=<<___; 2232 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 2233 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 2234 lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization) 2235 lea 128($bp),$bptr # size optimization 2236 2237 pshufd \$0,%xmm5,%xmm5 # broadcast index 2238 movdqa %xmm1,%xmm4 2239 .byte 0x67 2240 movdqa %xmm1,%xmm2 2241___ 2242######################################################################## 2243# calculate mask by comparing 0..31 to index and save result to stack 2244# 2245$code.=<<___; 2246 .byte 0x67 2247 paddd %xmm0,%xmm1 2248 pcmpeqd %xmm5,%xmm0 # compare to 1,0 2249 movdqa %xmm4,%xmm3 2250___ 2251for($i=0;$i<$STRIDE/16-4;$i+=4) { 2252$code.=<<___; 2253 paddd %xmm1,%xmm2 2254 pcmpeqd %xmm5,%xmm1 # compare to 3,2 2255 movdqa %xmm0,`16*($i+0)+112`(%r10) 2256 movdqa %xmm4,%xmm0 2257 2258 paddd %xmm2,%xmm3 2259 
pcmpeqd %xmm5,%xmm2 # compare to 5,4 2260 movdqa %xmm1,`16*($i+1)+112`(%r10) 2261 movdqa %xmm4,%xmm1 2262 2263 paddd %xmm3,%xmm0 2264 pcmpeqd %xmm5,%xmm3 # compare to 7,6 2265 movdqa %xmm2,`16*($i+2)+112`(%r10) 2266 movdqa %xmm4,%xmm2 2267 2268 paddd %xmm0,%xmm1 2269 pcmpeqd %xmm5,%xmm0 2270 movdqa %xmm3,`16*($i+3)+112`(%r10) 2271 movdqa %xmm4,%xmm3 2272___ 2273} 2274$code.=<<___; # last iteration can be optimized 2275 .byte 0x67 2276 paddd %xmm1,%xmm2 2277 pcmpeqd %xmm5,%xmm1 2278 movdqa %xmm0,`16*($i+0)+112`(%r10) 2279 2280 paddd %xmm2,%xmm3 2281 pcmpeqd %xmm5,%xmm2 2282 movdqa %xmm1,`16*($i+1)+112`(%r10) 2283 2284 pcmpeqd %xmm5,%xmm3 2285 movdqa %xmm2,`16*($i+2)+112`(%r10) 2286 2287 pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register 2288 pand `16*($i+1)-128`($bptr),%xmm1 2289 pand `16*($i+2)-128`($bptr),%xmm2 2290 movdqa %xmm3,`16*($i+3)+112`(%r10) 2291 pand `16*($i+3)-128`($bptr),%xmm3 2292 por %xmm2,%xmm0 2293 por %xmm3,%xmm1 2294___ 2295for($i=0;$i<$STRIDE/16-4;$i+=4) { 2296$code.=<<___; 2297 movdqa `16*($i+0)-128`($bptr),%xmm4 2298 movdqa `16*($i+1)-128`($bptr),%xmm5 2299 movdqa `16*($i+2)-128`($bptr),%xmm2 2300 pand `16*($i+0)+112`(%r10),%xmm4 2301 movdqa `16*($i+3)-128`($bptr),%xmm3 2302 pand `16*($i+1)+112`(%r10),%xmm5 2303 por %xmm4,%xmm0 2304 pand `16*($i+2)+112`(%r10),%xmm2 2305 por %xmm5,%xmm1 2306 pand `16*($i+3)+112`(%r10),%xmm3 2307 por %xmm2,%xmm0 2308 por %xmm3,%xmm1 2309___ 2310} 2311$code.=<<___; 2312 pxor %xmm1,%xmm0 2313 pshufd \$0x4e,%xmm0,%xmm1 2314 por %xmm1,%xmm0 2315 lea $STRIDE($bptr),$bptr 2316 movq %xmm0,%rdx # bp[0] 2317 lea 64+8*4+8(%rsp),$tptr 2318 2319 mov %rdx,$bi 2320 mulx 0*8($aptr),$mi,%rax # a[0]*b[0] 2321 mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] 2322 add %rax,%r11 2323 mulx 2*8($aptr),%rax,%r13 # ... 2324 adc %rax,%r12 2325 adc \$0,%r13 2326 mulx 3*8($aptr),%rax,%r14 2327 2328 mov $mi,%r15 2329 imulq 32+8(%rsp),$mi # "t[0]"*n0 2330 xor $zero,$zero # cf=0, of=0 2331 mov $mi,%rdx 2332 2333 mov $bptr,8+8(%rsp) # off-load &b[i] 2334 2335 lea 4*8($aptr),$aptr 2336 adcx %rax,%r13 2337 adcx $zero,%r14 # cf=0 2338 2339 mulx 0*8($nptr),%rax,%r10 2340 adcx %rax,%r15 # discarded 2341 adox %r11,%r10 2342 mulx 1*8($nptr),%rax,%r11 2343 adcx %rax,%r10 2344 adox %r12,%r11 2345 mulx 2*8($nptr),%rax,%r12 2346 mov 24+8(%rsp),$bptr # counter value 2347 mov %r10,-8*4($tptr) 2348 adcx %rax,%r11 2349 adox %r13,%r12 2350 mulx 3*8($nptr),%rax,%r15 2351 mov $bi,%rdx 2352 mov %r11,-8*3($tptr) 2353 adcx %rax,%r12 2354 adox $zero,%r15 # of=0 2355 lea 4*8($nptr),$nptr 2356 mov %r12,-8*2($tptr) 2357 jmp .Lmulx4x_1st 2358 2359.align 32 2360.Lmulx4x_1st: 2361 adcx $zero,%r15 # cf=0, modulo-scheduled 2362 mulx 0*8($aptr),%r10,%rax # a[4]*b[0] 2363 adcx %r14,%r10 2364 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] 2365 adcx %rax,%r11 2366 mulx 2*8($aptr),%r12,%rax # ... 
2367 adcx %r14,%r12 2368 mulx 3*8($aptr),%r13,%r14 2369 .byte 0x67,0x67 2370 mov $mi,%rdx 2371 adcx %rax,%r13 2372 adcx $zero,%r14 # cf=0 2373 lea 4*8($aptr),$aptr 2374 lea 4*8($tptr),$tptr 2375 2376 adox %r15,%r10 2377 mulx 0*8($nptr),%rax,%r15 2378 adcx %rax,%r10 2379 adox %r15,%r11 2380 mulx 1*8($nptr),%rax,%r15 2381 adcx %rax,%r11 2382 adox %r15,%r12 2383 mulx 2*8($nptr),%rax,%r15 2384 mov %r10,-5*8($tptr) 2385 adcx %rax,%r12 2386 mov %r11,-4*8($tptr) 2387 adox %r15,%r13 2388 mulx 3*8($nptr),%rax,%r15 2389 mov $bi,%rdx 2390 mov %r12,-3*8($tptr) 2391 adcx %rax,%r13 2392 adox $zero,%r15 2393 lea 4*8($nptr),$nptr 2394 mov %r13,-2*8($tptr) 2395 2396 dec $bptr # of=0, pass cf 2397 jnz .Lmulx4x_1st 2398 2399 mov 8(%rsp),$num # load -num 2400 adc $zero,%r15 # modulo-scheduled 2401 lea ($aptr,$num),$aptr # rewind $aptr 2402 add %r15,%r14 2403 mov 8+8(%rsp),$bptr # re-load &b[i] 2404 adc $zero,$zero # top-most carry 2405 mov %r14,-1*8($tptr) 2406 jmp .Lmulx4x_outer 2407 2408.align 32 2409.Lmulx4x_outer: 2410 lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) 2411 pxor %xmm4,%xmm4 2412 .byte 0x67,0x67 2413 pxor %xmm5,%xmm5 2414___ 2415for($i=0;$i<$STRIDE/16;$i+=4) { 2416$code.=<<___; 2417 movdqa `16*($i+0)-128`($bptr),%xmm0 2418 movdqa `16*($i+1)-128`($bptr),%xmm1 2419 movdqa `16*($i+2)-128`($bptr),%xmm2 2420 pand `16*($i+0)+256`(%r10),%xmm0 2421 movdqa `16*($i+3)-128`($bptr),%xmm3 2422 pand `16*($i+1)+256`(%r10),%xmm1 2423 por %xmm0,%xmm4 2424 pand `16*($i+2)+256`(%r10),%xmm2 2425 por %xmm1,%xmm5 2426 pand `16*($i+3)+256`(%r10),%xmm3 2427 por %xmm2,%xmm4 2428 por %xmm3,%xmm5 2429___ 2430} 2431$code.=<<___; 2432 por %xmm5,%xmm4 2433 pshufd \$0x4e,%xmm4,%xmm0 2434 por %xmm4,%xmm0 2435 lea $STRIDE($bptr),$bptr 2436 movq %xmm0,%rdx # m0=bp[i] 2437 2438 mov $zero,($tptr) # save top-most carry 2439 lea 4*8($tptr,$num),$tptr # rewind $tptr 2440 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] 2441 xor $zero,$zero # cf=0, of=0 2442 mov %rdx,$bi 2443 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] 2444 adox -4*8($tptr),$mi # +t[0] 2445 adcx %r14,%r11 2446 mulx 2*8($aptr),%r15,%r13 # ... 2447 adox -3*8($tptr),%r11 2448 adcx %r15,%r12 2449 mulx 3*8($aptr),%rdx,%r14 2450 adox -2*8($tptr),%r12 2451 adcx %rdx,%r13 2452 lea ($nptr,$num),$nptr # rewind $nptr 2453 lea 4*8($aptr),$aptr 2454 adox -1*8($tptr),%r13 2455 adcx $zero,%r14 2456 adox $zero,%r14 2457 2458 mov $mi,%r15 2459 imulq 32+8(%rsp),$mi # "t[0]"*n0 2460 2461 mov $mi,%rdx 2462 xor $zero,$zero # cf=0, of=0 2463 mov $bptr,8+8(%rsp) # off-load &b[i] 2464 2465 mulx 0*8($nptr),%rax,%r10 2466 adcx %rax,%r15 # discarded 2467 adox %r11,%r10 2468 mulx 1*8($nptr),%rax,%r11 2469 adcx %rax,%r10 2470 adox %r12,%r11 2471 mulx 2*8($nptr),%rax,%r12 2472 adcx %rax,%r11 2473 adox %r13,%r12 2474 mulx 3*8($nptr),%rax,%r15 2475 mov $bi,%rdx 2476 mov 24+8(%rsp),$bptr # counter value 2477 mov %r10,-8*4($tptr) 2478 adcx %rax,%r12 2479 mov %r11,-8*3($tptr) 2480 adox $zero,%r15 # of=0 2481 mov %r12,-8*2($tptr) 2482 lea 4*8($nptr),$nptr 2483 jmp .Lmulx4x_inner 2484 2485.align 32 2486.Lmulx4x_inner: 2487 mulx 0*8($aptr),%r10,%rax # a[4]*b[i] 2488 adcx $zero,%r15 # cf=0, modulo-scheduled 2489 adox %r14,%r10 2490 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] 2491 adcx 0*8($tptr),%r10 2492 adox %rax,%r11 2493 mulx 2*8($aptr),%r12,%rax # ... 
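################################################################
# Shape of the outer loop around this point, in rough C (sketch
# only; tbl holds the powers interleaved so that word i of power j
# lives at tbl[32*i + j], cf. bn_scatter5 near the end of this file):
#
#	for (i = 1; i < num; i++) {
#		b_i = gather(&tbl[32*i], idx);	/* reads all 32 slots    */
#		t  += a * b_i;			/* inner loop, 4 words/pass */
#		m   = t[0] * n0;		/* mod 2^64              */
#		t   = (t + m * n) >> 64;	/* Montgomery step, drops */
#						/* the now-zero low word  */
#	}
################################################################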
2494 adcx 1*8($tptr),%r11 2495 adox %r14,%r12 2496 mulx 3*8($aptr),%r13,%r14 2497 mov $mi,%rdx 2498 adcx 2*8($tptr),%r12 2499 adox %rax,%r13 2500 adcx 3*8($tptr),%r13 2501 adox $zero,%r14 # of=0 2502 lea 4*8($aptr),$aptr 2503 lea 4*8($tptr),$tptr 2504 adcx $zero,%r14 # cf=0 2505 2506 adox %r15,%r10 2507 mulx 0*8($nptr),%rax,%r15 2508 adcx %rax,%r10 2509 adox %r15,%r11 2510 mulx 1*8($nptr),%rax,%r15 2511 adcx %rax,%r11 2512 adox %r15,%r12 2513 mulx 2*8($nptr),%rax,%r15 2514 mov %r10,-5*8($tptr) 2515 adcx %rax,%r12 2516 adox %r15,%r13 2517 mov %r11,-4*8($tptr) 2518 mulx 3*8($nptr),%rax,%r15 2519 mov $bi,%rdx 2520 lea 4*8($nptr),$nptr 2521 mov %r12,-3*8($tptr) 2522 adcx %rax,%r13 2523 adox $zero,%r15 2524 mov %r13,-2*8($tptr) 2525 2526 dec $bptr # of=0, pass cf 2527 jnz .Lmulx4x_inner 2528 2529 mov 0+8(%rsp),$num # load -num 2530 adc $zero,%r15 # modulo-scheduled 2531 sub 0*8($tptr),$bptr # pull top-most carry to %cf 2532 mov 8+8(%rsp),$bptr # re-load &b[i] 2533 mov 16+8(%rsp),%r10 2534 adc %r15,%r14 2535 lea ($aptr,$num),$aptr # rewind $aptr 2536 adc $zero,$zero # top-most carry 2537 mov %r14,-1*8($tptr) 2538 2539 cmp %r10,$bptr 2540 jb .Lmulx4x_outer 2541 2542 mov -8($nptr),%r10 2543 mov $zero,%r8 2544 mov ($nptr,$num),%r12 2545 lea ($nptr,$num),%rbp # rewind $nptr 2546 mov $num,%rcx 2547 lea ($tptr,$num),%rdi # rewind $tptr 2548 xor %eax,%eax 2549 xor %r15,%r15 2550 sub %r14,%r10 # compare top-most words 2551 adc %r15,%r15 2552 or %r15,%r8 2553 sar \$3+2,%rcx 2554 sub %r8,%rax # %rax=-%r8 2555 mov 56+8(%rsp),%rdx # restore rp 2556 dec %r12 # so that after 'not' we get -n[0] 2557 mov 8*1(%rbp),%r13 2558 xor %r8,%r8 2559 mov 8*2(%rbp),%r14 2560 mov 8*3(%rbp),%r15 2561 jmp .Lsqrx4x_sub_entry # common post-condition 2562.cfi_endproc 2563.size mulx4x_internal,.-mulx4x_internal 2564___ 2565}{ 2566###################################################################### 2567# void bn_power5( 2568my $rptr="%rdi"; # BN_ULONG *rptr, 2569my $aptr="%rsi"; # const BN_ULONG *aptr, 2570my $bptr="%rdx"; # const BN_ULONG *table, 2571my $nptr="%rcx"; # const BN_ULONG *nptr, 2572my $n0 ="%r8"; # const BN_ULONG *n0); 2573my $num ="%r9"; # int num, has to be divisible by 8 2574 # int pwr); 2575 2576my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); 2577my @A0=("%r10","%r11"); 2578my @A1=("%r12","%r13"); 2579my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); 2580 2581$code.=<<___; 2582.type bn_powerx5,\@function,6 2583.align 32 2584bn_powerx5: 2585.cfi_startproc 2586 mov %rsp,%rax 2587.cfi_def_cfa_register %rax 2588.Lpowerx5_enter: 2589 push %rbx 2590.cfi_push %rbx 2591 push %rbp 2592.cfi_push %rbp 2593 push %r12 2594.cfi_push %r12 2595 push %r13 2596.cfi_push %r13 2597 push %r14 2598.cfi_push %r14 2599 push %r15 2600.cfi_push %r15 2601.Lpowerx5_prologue: 2602 2603 shl \$3,${num}d # convert $num to bytes 2604 lea ($num,$num,2),%r10 # 3*$num in bytes 2605 neg $num 2606 mov ($n0),$n0 # *n0 2607 2608 ############################################################## 2609 # Ensure that stack frame doesn't alias with $rptr+3*$num 2610 # modulo 4096, which covers ret[num], am[num] and n[num] 2611 # (see bn_exp.c). This is done to allow memory disambiguation 2612 # logic do its magic. [Extra 256 bytes is for power mask 2613 # calculated from 7th argument, the index.] 
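################################################################
# Intent-level sketch of the pointer juggling above (not the exact
# branch structure): the frame bottom is moved by an amount derived
# from
#
#	delta = (frame_bottom - rp) & 4095;
#
# so that the on-stack t[] area and the ret[]/am[]/n[] block behind
# $rptr end up at non-conflicting offsets within a 4KB page, and the
# CPU's load/store disambiguation does not see false 4K aliases.
################################################################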
2614 # 2615 lea -320(%rsp,$num,2),%r11 2616 mov %rsp,%rbp 2617 sub $rptr,%r11 2618 and \$4095,%r11 2619 cmp %r11,%r10 2620 jb .Lpwrx_sp_alt 2621 sub %r11,%rbp # align with $aptr 2622 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) 2623 jmp .Lpwrx_sp_done 2624 2625.align 32 2626.Lpwrx_sp_alt: 2627 lea 4096-320(,$num,2),%r10 2628 lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) 2629 sub %r10,%r11 2630 mov \$0,%r10 2631 cmovc %r10,%r11 2632 sub %r11,%rbp 2633.Lpwrx_sp_done: 2634 and \$-64,%rbp 2635 mov %rsp,%r11 2636 sub %rbp,%r11 2637 and \$-4096,%r11 2638 lea (%rbp,%r11),%rsp 2639 mov (%rsp),%r10 2640 cmp %rbp,%rsp 2641 ja .Lpwrx_page_walk 2642 jmp .Lpwrx_page_walk_done 2643 2644.Lpwrx_page_walk: 2645 lea -4096(%rsp),%rsp 2646 mov (%rsp),%r10 2647 cmp %rbp,%rsp 2648 ja .Lpwrx_page_walk 2649.Lpwrx_page_walk_done: 2650 2651 mov $num,%r10 2652 neg $num 2653 2654 ############################################################## 2655 # Stack layout 2656 # 2657 # +0 saved $num, used in reduction section 2658 # +8 &t[2*$num], used in reduction section 2659 # +16 intermediate carry bit 2660 # +24 top-most carry bit, used in reduction section 2661 # +32 saved *n0 2662 # +40 saved %rsp 2663 # +48 t[2*$num] 2664 # 2665 pxor %xmm0,%xmm0 2666 movq $rptr,%xmm1 # save $rptr 2667 movq $nptr,%xmm2 # save $nptr 2668 movq %r10, %xmm3 # -$num 2669 movq $bptr,%xmm4 2670 mov $n0, 32(%rsp) 2671 mov %rax, 40(%rsp) # save original %rsp 2672.cfi_cfa_expression %rsp+40,deref,+8 2673.Lpowerx5_body: 2674 2675 call __bn_sqrx8x_internal 2676 call __bn_postx4x_internal 2677 call __bn_sqrx8x_internal 2678 call __bn_postx4x_internal 2679 call __bn_sqrx8x_internal 2680 call __bn_postx4x_internal 2681 call __bn_sqrx8x_internal 2682 call __bn_postx4x_internal 2683 call __bn_sqrx8x_internal 2684 call __bn_postx4x_internal 2685 2686 mov %r10,$num # -num 2687 mov $aptr,$rptr 2688 movq %xmm2,$nptr 2689 movq %xmm4,$bptr 2690 mov 40(%rsp),%rax 2691 2692 call mulx4x_internal 2693 2694 mov 40(%rsp),%rsi # restore %rsp 2695.cfi_def_cfa %rsi,8 2696 mov \$1,%rax 2697 2698 mov -48(%rsi),%r15 2699.cfi_restore %r15 2700 mov -40(%rsi),%r14 2701.cfi_restore %r14 2702 mov -32(%rsi),%r13 2703.cfi_restore %r13 2704 mov -24(%rsi),%r12 2705.cfi_restore %r12 2706 mov -16(%rsi),%rbp 2707.cfi_restore %rbp 2708 mov -8(%rsi),%rbx 2709.cfi_restore %rbx 2710 lea (%rsi),%rsp 2711.cfi_def_cfa_register %rsp 2712.Lpowerx5_epilogue: 2713 ret 2714.cfi_endproc 2715.size bn_powerx5,.-bn_powerx5 2716 2717.globl bn_sqrx8x_internal 2718.hidden bn_sqrx8x_internal 2719.type bn_sqrx8x_internal,\@abi-omnipotent 2720.align 32 2721bn_sqrx8x_internal: 2722__bn_sqrx8x_internal: 2723.cfi_startproc 2724 ################################################################## 2725 # Squaring part: 2726 # 2727 # a) multiply-n-add everything but a[i]*a[i]; 2728 # b) shift result of a) by 1 to the left and accumulate 2729 # a[i]*a[i] products; 2730 # 2731 ################################################################## 2732 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2733 # a[1]a[0] 2734 # a[2]a[0] 2735 # a[3]a[0] 2736 # a[2]a[1] 2737 # a[3]a[1] 2738 # a[3]a[2] 2739 # 2740 # a[4]a[0] 2741 # a[5]a[0] 2742 # a[6]a[0] 2743 # a[7]a[0] 2744 # a[4]a[1] 2745 # a[5]a[1] 2746 # a[6]a[1] 2747 # a[7]a[1] 2748 # a[4]a[2] 2749 # a[5]a[2] 2750 # a[6]a[2] 2751 # a[7]a[2] 2752 # a[4]a[3] 2753 # a[5]a[3] 2754 # a[6]a[3] 2755 # a[7]a[3] 2756 # 2757 # a[5]a[4] 2758 # a[6]a[4] 2759 # a[7]a[4] 2760 # a[6]a[5] 2761 # a[7]a[5] 2762 # a[7]a[6] 2763 # 
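################################################################
# What the powerx5 call sequence above amounts to, in pseudo-C
# (sketch; everything stays in Montgomery form, names illustrative):
#
#	tmp = a;
#	for (i = 0; i < 5; i++)
#		tmp = mont_sqr(tmp, n, n0);	/* __bn_sqrx8x_internal    */
#						/* + __bn_postx4x_internal */
#	r = mont_mul(tmp, tbl[pwr], n, n0);	/* mulx4x_internal, with   */
#						/* tbl[pwr] gathered in    */
#						/* constant time           */
#
# i.e. roughly one 5-bit window step of BN_mod_exp_mont_consttime:
# r = a^(2^5) * tbl[pwr] mod n.
################################################################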
a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] 2764___ 2765{ 2766my ($zero,$carry)=("%rbp","%rcx"); 2767my $aaptr=$zero; 2768$code.=<<___; 2769 lea 48+8(%rsp),$tptr 2770 lea ($aptr,$num),$aaptr 2771 mov $num,0+8(%rsp) # save $num 2772 mov $aaptr,8+8(%rsp) # save end of $aptr 2773 jmp .Lsqr8x_zero_start 2774 2775.align 32 2776.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2777.Lsqrx8x_zero: 2778 .byte 0x3e 2779 movdqa %xmm0,0*8($tptr) 2780 movdqa %xmm0,2*8($tptr) 2781 movdqa %xmm0,4*8($tptr) 2782 movdqa %xmm0,6*8($tptr) 2783.Lsqr8x_zero_start: # aligned at 32 2784 movdqa %xmm0,8*8($tptr) 2785 movdqa %xmm0,10*8($tptr) 2786 movdqa %xmm0,12*8($tptr) 2787 movdqa %xmm0,14*8($tptr) 2788 lea 16*8($tptr),$tptr 2789 sub \$64,$num 2790 jnz .Lsqrx8x_zero 2791 2792 mov 0*8($aptr),%rdx # a[0], modulo-scheduled 2793 #xor %r9,%r9 # t[1], ex-$num, zero already 2794 xor %r10,%r10 2795 xor %r11,%r11 2796 xor %r12,%r12 2797 xor %r13,%r13 2798 xor %r14,%r14 2799 xor %r15,%r15 2800 lea 48+8(%rsp),$tptr 2801 xor $zero,$zero # cf=0, cf=0 2802 jmp .Lsqrx8x_outer_loop 2803 2804.align 32 2805.Lsqrx8x_outer_loop: 2806 mulx 1*8($aptr),%r8,%rax # a[1]*a[0] 2807 adcx %r9,%r8 # a[1]*a[0]+=t[1] 2808 adox %rax,%r10 2809 mulx 2*8($aptr),%r9,%rax # a[2]*a[0] 2810 adcx %r10,%r9 2811 adox %rax,%r11 2812 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... 2813 adcx %r11,%r10 2814 adox %rax,%r12 2815 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax 2816 adcx %r12,%r11 2817 adox %rax,%r13 2818 mulx 5*8($aptr),%r12,%rax 2819 adcx %r13,%r12 2820 adox %rax,%r14 2821 mulx 6*8($aptr),%r13,%rax 2822 adcx %r14,%r13 2823 adox %r15,%rax 2824 mulx 7*8($aptr),%r14,%r15 2825 mov 1*8($aptr),%rdx # a[1] 2826 adcx %rax,%r14 2827 adox $zero,%r15 2828 adc 8*8($tptr),%r15 2829 mov %r8,1*8($tptr) # t[1] 2830 mov %r9,2*8($tptr) # t[2] 2831 sbb $carry,$carry # mov %cf,$carry 2832 xor $zero,$zero # cf=0, of=0 2833 2834 2835 mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] 2836 mulx 3*8($aptr),%r9,%rax # a[3]*a[1] 2837 adcx %r10,%r8 2838 adox %rbx,%r9 2839 mulx 4*8($aptr),%r10,%rbx # ... 2840 adcx %r11,%r9 2841 adox %rax,%r10 2842 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax 2843 adcx %r12,%r10 2844 adox %rbx,%r11 2845 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx 2846 adcx %r13,%r11 2847 adox %r14,%r12 2848 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 2849 mov 2*8($aptr),%rdx # a[2] 2850 adcx %rax,%r12 2851 adox %rbx,%r13 2852 adcx %r15,%r13 2853 adox $zero,%r14 # of=0 2854 adcx $zero,%r14 # cf=0 2855 2856 mov %r8,3*8($tptr) # t[3] 2857 mov %r9,4*8($tptr) # t[4] 2858 2859 mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] 2860 mulx 4*8($aptr),%r9,%rax # a[4]*a[2] 2861 adcx %r10,%r8 2862 adox %rbx,%r9 2863 mulx 5*8($aptr),%r10,%rbx # ... 2864 adcx %r11,%r9 2865 adox %rax,%r10 2866 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax 2867 adcx %r12,%r10 2868 adox %r13,%r11 2869 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 2870 .byte 0x3e 2871 mov 3*8($aptr),%rdx # a[3] 2872 adcx %rbx,%r11 2873 adox %rax,%r12 2874 adcx %r14,%r12 2875 mov %r8,5*8($tptr) # t[5] 2876 mov %r9,6*8($tptr) # t[6] 2877 mulx 4*8($aptr),%r8,%rax # a[4]*a[3] 2878 adox $zero,%r13 # of=0 2879 adcx $zero,%r13 # cf=0 2880 2881 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] 2882 adcx %r10,%r8 2883 adox %rax,%r9 2884 mulx 6*8($aptr),%r10,%rax # ... 
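################################################################
# Scalar reference for the squaring strategy spelled out above
# (illustrative C only, carries omitted):
#
#	/* pass 1: each off-diagonal product a[i]*a[j], i < j, once */
#	for (i = 0; i < num; i++)
#		for (j = i + 1; j < num; j++)
#			t[i + j] += a[i] * a[j];
#	/* pass 2: t <<= 1 (one-bit shift of the whole 2*num words), */
#	/* then fold in the diagonal squares                         */
#	for (i = 0; i < num; i++)
#		t[2*i] += a[i] * a[i];
#
# which is the usual ~2x saving over a full num x num multiply.
################################################################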
2885 adcx %r11,%r9 2886 adox %r12,%r10 2887 mulx 7*8($aptr),%r11,%r12 2888 mov 4*8($aptr),%rdx # a[4] 2889 mov 5*8($aptr),%r14 # a[5] 2890 adcx %rbx,%r10 2891 adox %rax,%r11 2892 mov 6*8($aptr),%r15 # a[6] 2893 adcx %r13,%r11 2894 adox $zero,%r12 # of=0 2895 adcx $zero,%r12 # cf=0 2896 2897 mov %r8,7*8($tptr) # t[7] 2898 mov %r9,8*8($tptr) # t[8] 2899 2900 mulx %r14,%r9,%rax # a[5]*a[4] 2901 mov 7*8($aptr),%r8 # a[7] 2902 adcx %r10,%r9 2903 mulx %r15,%r10,%rbx # a[6]*a[4] 2904 adox %rax,%r10 2905 adcx %r11,%r10 2906 mulx %r8,%r11,%rax # a[7]*a[4] 2907 mov %r14,%rdx # a[5] 2908 adox %rbx,%r11 2909 adcx %r12,%r11 2910 #adox $zero,%rax # of=0 2911 adcx $zero,%rax # cf=0 2912 2913 mulx %r15,%r14,%rbx # a[6]*a[5] 2914 mulx %r8,%r12,%r13 # a[7]*a[5] 2915 mov %r15,%rdx # a[6] 2916 lea 8*8($aptr),$aptr 2917 adcx %r14,%r11 2918 adox %rbx,%r12 2919 adcx %rax,%r12 2920 adox $zero,%r13 2921 2922 .byte 0x67,0x67 2923 mulx %r8,%r8,%r14 # a[7]*a[6] 2924 adcx %r8,%r13 2925 adcx $zero,%r14 2926 2927 cmp 8+8(%rsp),$aptr 2928 je .Lsqrx8x_outer_break 2929 2930 neg $carry # mov $carry,%cf 2931 mov \$-8,%rcx 2932 mov $zero,%r15 2933 mov 8*8($tptr),%r8 2934 adcx 9*8($tptr),%r9 # +=t[9] 2935 adcx 10*8($tptr),%r10 # ... 2936 adcx 11*8($tptr),%r11 2937 adc 12*8($tptr),%r12 2938 adc 13*8($tptr),%r13 2939 adc 14*8($tptr),%r14 2940 adc 15*8($tptr),%r15 2941 lea ($aptr),$aaptr 2942 lea 2*64($tptr),$tptr 2943 sbb %rax,%rax # mov %cf,$carry 2944 2945 mov -64($aptr),%rdx # a[0] 2946 mov %rax,16+8(%rsp) # offload $carry 2947 mov $tptr,24+8(%rsp) 2948 2949 #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above 2950 xor %eax,%eax # cf=0, of=0 2951 jmp .Lsqrx8x_loop 2952 2953.align 32 2954.Lsqrx8x_loop: 2955 mov %r8,%rbx 2956 mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] 2957 adcx %rax,%rbx # +=t[8] 2958 adox %r9,%r8 2959 2960 mulx 1*8($aaptr),%rax,%r9 # ... 2961 adcx %rax,%r8 2962 adox %r10,%r9 2963 2964 mulx 2*8($aaptr),%rax,%r10 2965 adcx %rax,%r9 2966 adox %r11,%r10 2967 2968 mulx 3*8($aaptr),%rax,%r11 2969 adcx %rax,%r10 2970 adox %r12,%r11 2971 2972 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 2973 adcx %rax,%r11 2974 adox %r13,%r12 2975 2976 mulx 5*8($aaptr),%rax,%r13 2977 adcx %rax,%r12 2978 adox %r14,%r13 2979 2980 mulx 6*8($aaptr),%rax,%r14 2981 mov %rbx,($tptr,%rcx,8) # store t[8+i] 2982 mov \$0,%ebx 2983 adcx %rax,%r13 2984 adox %r15,%r14 2985 2986 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 2987 mov 8($aptr,%rcx,8),%rdx # a[i] 2988 adcx %rax,%r14 2989 adox %rbx,%r15 # %rbx is 0, of=0 2990 adcx %rbx,%r15 # cf=0 2991 2992 .byte 0x67 2993 inc %rcx # of=0 2994 jnz .Lsqrx8x_loop 2995 2996 lea 8*8($aaptr),$aaptr 2997 mov \$-8,%rcx 2998 cmp 8+8(%rsp),$aaptr # done? 
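################################################################
# Note on the carry handling in this loop: CF cannot survive the
# surrounding pointer and counter arithmetic, so it is parked in a
# register and re-armed later.  This is the idiom behind the
# existing "mov %cf,..." / "mov ...,%cf" comments:
#
#	sbb	%rax,%rax	# %rax = CF ? -1 : 0
#	...			# CF-clobbering work
#	neg	%rax		# CF = (%rax != 0), i.e. the old carry
#				# (or: sub against the stored copy)
################################################################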
2999 je .Lsqrx8x_break 3000 3001 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 3002 .byte 0x66 3003 mov -64($aptr),%rdx 3004 adcx 0*8($tptr),%r8 3005 adcx 1*8($tptr),%r9 3006 adc 2*8($tptr),%r10 3007 adc 3*8($tptr),%r11 3008 adc 4*8($tptr),%r12 3009 adc 5*8($tptr),%r13 3010 adc 6*8($tptr),%r14 3011 adc 7*8($tptr),%r15 3012 lea 8*8($tptr),$tptr 3013 .byte 0x67 3014 sbb %rax,%rax # mov %cf,%rax 3015 xor %ebx,%ebx # cf=0, of=0 3016 mov %rax,16+8(%rsp) # offload carry 3017 jmp .Lsqrx8x_loop 3018 3019.align 32 3020.Lsqrx8x_break: 3021 xor $zero,$zero 3022 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf 3023 adcx $zero,%r8 3024 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry 3025 adcx $zero,%r9 3026 mov 0*8($aptr),%rdx # a[8], modulo-scheduled 3027 adc \$0,%r10 3028 mov %r8,0*8($tptr) 3029 adc \$0,%r11 3030 adc \$0,%r12 3031 adc \$0,%r13 3032 adc \$0,%r14 3033 adc \$0,%r15 3034 cmp $carry,$tptr # cf=0, of=0 3035 je .Lsqrx8x_outer_loop 3036 3037 mov %r9,1*8($tptr) 3038 mov 1*8($carry),%r9 3039 mov %r10,2*8($tptr) 3040 mov 2*8($carry),%r10 3041 mov %r11,3*8($tptr) 3042 mov 3*8($carry),%r11 3043 mov %r12,4*8($tptr) 3044 mov 4*8($carry),%r12 3045 mov %r13,5*8($tptr) 3046 mov 5*8($carry),%r13 3047 mov %r14,6*8($tptr) 3048 mov 6*8($carry),%r14 3049 mov %r15,7*8($tptr) 3050 mov 7*8($carry),%r15 3051 mov $carry,$tptr 3052 jmp .Lsqrx8x_outer_loop 3053 3054.align 32 3055.Lsqrx8x_outer_break: 3056 mov %r9,9*8($tptr) # t[9] 3057 movq %xmm3,%rcx # -$num 3058 mov %r10,10*8($tptr) # ... 3059 mov %r11,11*8($tptr) 3060 mov %r12,12*8($tptr) 3061 mov %r13,13*8($tptr) 3062 mov %r14,14*8($tptr) 3063___ 3064}{ 3065my $i="%rcx"; 3066$code.=<<___; 3067 lea 48+8(%rsp),$tptr 3068 mov ($aptr,$i),%rdx # a[0] 3069 3070 mov 8($tptr),$A0[1] # t[1] 3071 xor $A0[0],$A0[0] # t[0], of=0, cf=0 3072 mov 0+8(%rsp),$num # restore $num 3073 adox $A0[1],$A0[1] 3074 mov 16($tptr),$A1[0] # t[2] # prefetch 3075 mov 24($tptr),$A1[1] # t[3] # prefetch 3076 #jmp .Lsqrx4x_shift_n_add # happens to be aligned 3077 3078.align 32 3079.Lsqrx4x_shift_n_add: 3080 mulx %rdx,%rax,%rbx 3081 adox $A1[0],$A1[0] 3082 adcx $A0[0],%rax 3083 .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch 3084 .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch 3085 adox $A1[1],$A1[1] 3086 adcx $A0[1],%rbx 3087 mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch 3088 mov %rax,0($tptr) 3089 mov %rbx,8($tptr) 3090 3091 mulx %rdx,%rax,%rbx 3092 adox $A0[0],$A0[0] 3093 adcx $A1[0],%rax 3094 mov 16($aptr,$i),%rdx # a[i+2] # prefetch 3095 mov 48($tptr),$A1[0] # t[2*i+6] # prefetch 3096 adox $A0[1],$A0[1] 3097 adcx $A1[1],%rbx 3098 mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch 3099 mov %rax,16($tptr) 3100 mov %rbx,24($tptr) 3101 3102 mulx %rdx,%rax,%rbx 3103 adox $A1[0],$A1[0] 3104 adcx $A0[0],%rax 3105 mov 24($aptr,$i),%rdx # a[i+3] # prefetch 3106 lea 32($i),$i 3107 mov 64($tptr),$A0[0] # t[2*i+8] # prefetch 3108 adox $A1[1],$A1[1] 3109 adcx $A0[1],%rbx 3110 mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch 3111 mov %rax,32($tptr) 3112 mov %rbx,40($tptr) 3113 3114 mulx %rdx,%rax,%rbx 3115 adox $A0[0],$A0[0] 3116 adcx $A1[0],%rax 3117 jrcxz .Lsqrx4x_shift_n_add_break 3118 .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch 3119 adox $A0[1],$A0[1] 3120 adcx $A1[1],%rbx 3121 mov 80($tptr),$A1[0] # t[2*i+10] # prefetch 3122 mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch 3123 mov %rax,48($tptr) 3124 mov %rbx,56($tptr) 3125 lea 64($tptr),$tptr 3126 nop 3127 jmp .Lsqrx4x_shift_n_add 3128 3129.align 32 
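################################################################
# The shift_n_add loop above is pass 2 of the squaring: each pair
# of result words is doubled and the diagonal square folded in.
# Per iteration, in rough C (sketch only):
#
#	unsigned __int128 sq = (unsigned __int128)a[i] * a[i];
#	t[2*i]     = 2*t[2*i]     + (uint64_t)sq;		/* + carry */
#	t[2*i + 1] = 2*t[2*i + 1] + (uint64_t)(sq >> 64);	/* + carry */
#
# The doubling rides the OF chain (adox x,x) while the squares ride
# the CF chain (mulx %rdx / adcx), so the two never compete for a
# single carry flag.
################################################################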
3130.Lsqrx4x_shift_n_add_break: 3131 adcx $A1[1],%rbx 3132 mov %rax,48($tptr) 3133 mov %rbx,56($tptr) 3134 lea 64($tptr),$tptr # end of t[] buffer 3135___ 3136} 3137###################################################################### 3138# Montgomery reduction part, "word-by-word" algorithm. 3139# 3140# This new path is inspired by multiple submissions from Intel, by 3141# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, 3142# Vinodh Gopal... 3143{ 3144my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); 3145 3146$code.=<<___; 3147 movq %xmm2,$nptr 3148__bn_sqrx8x_reduction: 3149 xor %eax,%eax # initial top-most carry bit 3150 mov 32+8(%rsp),%rbx # n0 3151 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) 3152 lea -8*8($nptr,$num),%rcx # end of n[] 3153 #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer 3154 mov %rcx, 0+8(%rsp) # save end of n[] 3155 mov $tptr,8+8(%rsp) # save end of t[] 3156 3157 lea 48+8(%rsp),$tptr # initial t[] window 3158 jmp .Lsqrx8x_reduction_loop 3159 3160.align 32 3161.Lsqrx8x_reduction_loop: 3162 mov 8*1($tptr),%r9 3163 mov 8*2($tptr),%r10 3164 mov 8*3($tptr),%r11 3165 mov 8*4($tptr),%r12 3166 mov %rdx,%r8 3167 imulq %rbx,%rdx # n0*a[i] 3168 mov 8*5($tptr),%r13 3169 mov 8*6($tptr),%r14 3170 mov 8*7($tptr),%r15 3171 mov %rax,24+8(%rsp) # store top-most carry bit 3172 3173 lea 8*8($tptr),$tptr 3174 xor $carry,$carry # cf=0,of=0 3175 mov \$-8,%rcx 3176 jmp .Lsqrx8x_reduce 3177 3178.align 32 3179.Lsqrx8x_reduce: 3180 mov %r8, %rbx 3181 mulx 8*0($nptr),%rax,%r8 # n[0] 3182 adcx %rbx,%rax # discarded 3183 adox %r9,%r8 3184 3185 mulx 8*1($nptr),%rbx,%r9 # n[1] 3186 adcx %rbx,%r8 3187 adox %r10,%r9 3188 3189 mulx 8*2($nptr),%rbx,%r10 3190 adcx %rbx,%r9 3191 adox %r11,%r10 3192 3193 mulx 8*3($nptr),%rbx,%r11 3194 adcx %rbx,%r10 3195 adox %r12,%r11 3196 3197 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 3198 mov %rdx,%rax 3199 mov %r8,%rdx 3200 adcx %rbx,%r11 3201 adox %r13,%r12 3202 3203 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded 3204 mov %rax,%rdx 3205 mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] 3206 3207 mulx 8*5($nptr),%rax,%r13 3208 adcx %rax,%r12 3209 adox %r14,%r13 3210 3211 mulx 8*6($nptr),%rax,%r14 3212 adcx %rax,%r13 3213 adox %r15,%r14 3214 3215 mulx 8*7($nptr),%rax,%r15 3216 mov %rbx,%rdx 3217 adcx %rax,%r14 3218 adox $carry,%r15 # $carry is 0 3219 adcx $carry,%r15 # cf=0 3220 3221 .byte 0x67,0x67,0x67 3222 inc %rcx # of=0 3223 jnz .Lsqrx8x_reduce 3224 3225 mov $carry,%rax # xor %rax,%rax 3226 cmp 0+8(%rsp),$nptr # end of n[]? 
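################################################################
# Reference sketch of the word-by-word Montgomery reduction being
# performed here (plain C, illustrative; t[] holds the 2*num-word
# square, n0 = -n^-1 mod 2^64):
#
#	for (i = 0; i < num; i++) {
#		uint64_t m = t[i] * n0;			/* mod 2^64 */
#		uint64_t c = 0;
#		for (j = 0; j < num; j++) {
#			unsigned __int128 p =
#			    (unsigned __int128)m * n[j] + t[i + j] + c;
#			t[i + j] = (uint64_t)p;		/* t[i] becomes 0 */
#			c = (uint64_t)(p >> 64);
#		}
#		/* c is folded into t[i+num], t[i+num+1], ... */
#	}
#	/* result = t[num .. 2*num-1], minus at most one final n */
################################################################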
3227 jae .Lsqrx8x_no_tail 3228 3229 mov 48+8(%rsp),%rdx # pull n0*a[0] 3230 add 8*0($tptr),%r8 3231 lea 8*8($nptr),$nptr 3232 mov \$-8,%rcx 3233 adcx 8*1($tptr),%r9 3234 adcx 8*2($tptr),%r10 3235 adc 8*3($tptr),%r11 3236 adc 8*4($tptr),%r12 3237 adc 8*5($tptr),%r13 3238 adc 8*6($tptr),%r14 3239 adc 8*7($tptr),%r15 3240 lea 8*8($tptr),$tptr 3241 sbb %rax,%rax # top carry 3242 3243 xor $carry,$carry # of=0, cf=0 3244 mov %rax,16+8(%rsp) 3245 jmp .Lsqrx8x_tail 3246 3247.align 32 3248.Lsqrx8x_tail: 3249 mov %r8,%rbx 3250 mulx 8*0($nptr),%rax,%r8 3251 adcx %rax,%rbx 3252 adox %r9,%r8 3253 3254 mulx 8*1($nptr),%rax,%r9 3255 adcx %rax,%r8 3256 adox %r10,%r9 3257 3258 mulx 8*2($nptr),%rax,%r10 3259 adcx %rax,%r9 3260 adox %r11,%r10 3261 3262 mulx 8*3($nptr),%rax,%r11 3263 adcx %rax,%r10 3264 adox %r12,%r11 3265 3266 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 3267 adcx %rax,%r11 3268 adox %r13,%r12 3269 3270 mulx 8*5($nptr),%rax,%r13 3271 adcx %rax,%r12 3272 adox %r14,%r13 3273 3274 mulx 8*6($nptr),%rax,%r14 3275 adcx %rax,%r13 3276 adox %r15,%r14 3277 3278 mulx 8*7($nptr),%rax,%r15 3279 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] 3280 adcx %rax,%r14 3281 adox $carry,%r15 3282 mov %rbx,($tptr,%rcx,8) # save result 3283 mov %r8,%rbx 3284 adcx $carry,%r15 # cf=0 3285 3286 inc %rcx # of=0 3287 jnz .Lsqrx8x_tail 3288 3289 cmp 0+8(%rsp),$nptr # end of n[]? 3290 jae .Lsqrx8x_tail_done # break out of loop 3291 3292 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3293 mov 48+8(%rsp),%rdx # pull n0*a[0] 3294 lea 8*8($nptr),$nptr 3295 adc 8*0($tptr),%r8 3296 adc 8*1($tptr),%r9 3297 adc 8*2($tptr),%r10 3298 adc 8*3($tptr),%r11 3299 adc 8*4($tptr),%r12 3300 adc 8*5($tptr),%r13 3301 adc 8*6($tptr),%r14 3302 adc 8*7($tptr),%r15 3303 lea 8*8($tptr),$tptr 3304 sbb %rax,%rax 3305 sub \$8,%rcx # mov \$-8,%rcx 3306 3307 xor $carry,$carry # of=0, cf=0 3308 mov %rax,16+8(%rsp) 3309 jmp .Lsqrx8x_tail 3310 3311.align 32 3312.Lsqrx8x_tail_done: 3313 xor %rax,%rax 3314 add 24+8(%rsp),%r8 # can this overflow? 3315 adc \$0,%r9 3316 adc \$0,%r10 3317 adc \$0,%r11 3318 adc \$0,%r12 3319 adc \$0,%r13 3320 adc \$0,%r14 3321 adc \$0,%r15 3322 adc \$0,%rax 3323 3324 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf 3325.Lsqrx8x_no_tail: # %cf is 0 if jumped here 3326 adc 8*0($tptr),%r8 3327 movq %xmm3,%rcx 3328 adc 8*1($tptr),%r9 3329 mov 8*7($nptr),$carry 3330 movq %xmm2,$nptr # restore $nptr 3331 adc 8*2($tptr),%r10 3332 adc 8*3($tptr),%r11 3333 adc 8*4($tptr),%r12 3334 adc 8*5($tptr),%r13 3335 adc 8*6($tptr),%r14 3336 adc 8*7($tptr),%r15 3337 adc \$0,%rax # top-most carry 3338 3339 mov 32+8(%rsp),%rbx # n0 3340 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" 3341 3342 mov %r8,8*0($tptr) # store top 512 bits 3343 lea 8*8($tptr),%r8 # borrow %r8 3344 mov %r9,8*1($tptr) 3345 mov %r10,8*2($tptr) 3346 mov %r11,8*3($tptr) 3347 mov %r12,8*4($tptr) 3348 mov %r13,8*5($tptr) 3349 mov %r14,8*6($tptr) 3350 mov %r15,8*7($tptr) 3351 3352 lea 8*8($tptr,%rcx),$tptr # start of current t[] window 3353 cmp 8+8(%rsp),%r8 # end of t[]? 
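################################################################
# Bookkeeping note: the reduction walks t[] in 8-word windows.  The
# carry out of each window is parked with sbb at 16+8(%rsp) and
# pulled back in at the top of the next window; whatever spills past
# the very last window accumulates via 24+8(%rsp)/%rax as the
# "top-most carry", which the constant-time post-condition below
# folds into its decision whether to subtract n once more.
################################################################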
3354 jb .Lsqrx8x_reduction_loop 3355 ret 3356.cfi_endproc 3357.size bn_sqrx8x_internal,.-bn_sqrx8x_internal 3358___ 3359} 3360############################################################## 3361# Post-condition, 4x unrolled 3362# 3363{ 3364my ($rptr,$nptr)=("%rdx","%rbp"); 3365$code.=<<___; 3366.align 32 3367.type __bn_postx4x_internal,\@abi-omnipotent 3368__bn_postx4x_internal: 3369.cfi_startproc 3370 mov 8*0($nptr),%r12 3371 mov %rcx,%r10 # -$num 3372 mov %rcx,%r9 # -$num 3373 neg %rax 3374 sar \$3+2,%rcx 3375 #lea 48+8(%rsp,%r9),$tptr 3376 movq %xmm1,$rptr # restore $rptr 3377 movq %xmm1,$aptr # prepare for back-to-back call 3378 dec %r12 # so that after 'not' we get -n[0] 3379 mov 8*1($nptr),%r13 3380 xor %r8,%r8 3381 mov 8*2($nptr),%r14 3382 mov 8*3($nptr),%r15 3383 jmp .Lsqrx4x_sub_entry 3384 3385.align 16 3386.Lsqrx4x_sub: 3387 mov 8*0($nptr),%r12 3388 mov 8*1($nptr),%r13 3389 mov 8*2($nptr),%r14 3390 mov 8*3($nptr),%r15 3391.Lsqrx4x_sub_entry: 3392 andn %rax,%r12,%r12 3393 lea 8*4($nptr),$nptr 3394 andn %rax,%r13,%r13 3395 andn %rax,%r14,%r14 3396 andn %rax,%r15,%r15 3397 3398 neg %r8 # mov %r8,%cf 3399 adc 8*0($tptr),%r12 3400 adc 8*1($tptr),%r13 3401 adc 8*2($tptr),%r14 3402 adc 8*3($tptr),%r15 3403 mov %r12,8*0($rptr) 3404 lea 8*4($tptr),$tptr 3405 mov %r13,8*1($rptr) 3406 sbb %r8,%r8 # mov %cf,%r8 3407 mov %r14,8*2($rptr) 3408 mov %r15,8*3($rptr) 3409 lea 8*4($rptr),$rptr 3410 3411 inc %rcx 3412 jnz .Lsqrx4x_sub 3413 3414 neg %r9 # restore $num 3415 3416 ret 3417.cfi_endproc 3418.size __bn_postx4x_internal,.-__bn_postx4x_internal 3419___ 3420} 3421}}} 3422{ 3423my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order 3424 ("%rdi","%esi","%rdx","%ecx"); # Unix order 3425my $out=$inp; 3426my $STRIDE=2**5*8; 3427my $N=$STRIDE/4; 3428 3429$code.=<<___; 3430.globl bn_scatter5 3431.type bn_scatter5,\@abi-omnipotent 3432.align 16 3433bn_scatter5: 3434.cfi_startproc 3435 cmp \$0, $num 3436 jz .Lscatter_epilogue 3437 lea ($tbl,$idx,8),$tbl 3438.Lscatter: 3439 mov ($inp),%rax 3440 lea 8($inp),$inp 3441 mov %rax,($tbl) 3442 lea 32*8($tbl),$tbl 3443 sub \$1,$num 3444 jnz .Lscatter 3445.Lscatter_epilogue: 3446 ret 3447.cfi_endproc 3448.size bn_scatter5,.-bn_scatter5 3449 3450.globl bn_gather5 3451.type bn_gather5,\@abi-omnipotent 3452.align 32 3453bn_gather5: 3454.cfi_startproc 3455.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases 3456 # I can't trust assembler to use specific encoding:-( 3457 .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 3458.cfi_def_cfa_register %r10 3459 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp 3460 lea .Linc(%rip),%rax 3461 and \$-16,%rsp # shouldn't be formally required 3462 3463 movd $idx,%xmm5 3464 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 3465 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 3466 lea 128($tbl),%r11 # size optimization 3467 lea 128(%rsp),%rax # size optimization 3468 3469 pshufd \$0,%xmm5,%xmm5 # broadcast $idx 3470 movdqa %xmm1,%xmm4 3471 movdqa %xmm1,%xmm2 3472___ 3473######################################################################## 3474# calculate mask by comparing 0..31 to $idx and save result to stack 3475# 3476for($i=0;$i<$STRIDE/16;$i+=4) { 3477$code.=<<___; 3478 paddd %xmm0,%xmm1 3479 pcmpeqd %xmm5,%xmm0 # compare to 1,0 3480___ 3481$code.=<<___ if ($i); 3482 movdqa %xmm3,`16*($i-1)-128`(%rax) 3483___ 3484$code.=<<___; 3485 movdqa %xmm4,%xmm3 3486 3487 paddd %xmm1,%xmm2 3488 pcmpeqd %xmm5,%xmm1 # compare to 3,2 3489 movdqa %xmm0,`16*($i+0)-128`(%rax) 3490 movdqa 
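################################################################
# Layout reminder for the scatter/gather pair in this file (sketch):
# bn_scatter5 stores word i of the selected power at tbl[32*i + idx],
# interleaving all 2^5 powers word by word; bn_gather5 (and the
# inlined gathers earlier in this file) read one power back by
# touching every slot and masking, roughly:
#
#	for (i = 0; i < num; i++) {
#		uint64_t w = 0;
#		for (j = 0; j < 32; j++)
#			w |= tbl[32*i + j] & (0 - (uint64_t)(j == idx));
#		out[i] = w;
#	}
#
# The post-condition above (.Lsqrx4x_sub) is the matching constant-
# time clean-up: in effect r[] = t[] - (need_subtract ? n[] : 0),
# with the mask built from the top-most carry/borrow rather than a
# branch.
################################################################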
%xmm4,%xmm0 3491 3492 paddd %xmm2,%xmm3 3493 pcmpeqd %xmm5,%xmm2 # compare to 5,4 3494 movdqa %xmm1,`16*($i+1)-128`(%rax) 3495 movdqa %xmm4,%xmm1 3496 3497 paddd %xmm3,%xmm0 3498 pcmpeqd %xmm5,%xmm3 # compare to 7,6 3499 movdqa %xmm2,`16*($i+2)-128`(%rax) 3500 movdqa %xmm4,%xmm2 3501___ 3502} 3503$code.=<<___; 3504 movdqa %xmm3,`16*($i-1)-128`(%rax) 3505 jmp .Lgather 3506 3507.align 32 3508.Lgather: 3509 pxor %xmm4,%xmm4 3510 pxor %xmm5,%xmm5 3511___ 3512for($i=0;$i<$STRIDE/16;$i+=4) { 3513$code.=<<___; 3514 movdqa `16*($i+0)-128`(%r11),%xmm0 3515 movdqa `16*($i+1)-128`(%r11),%xmm1 3516 movdqa `16*($i+2)-128`(%r11),%xmm2 3517 pand `16*($i+0)-128`(%rax),%xmm0 3518 movdqa `16*($i+3)-128`(%r11),%xmm3 3519 pand `16*($i+1)-128`(%rax),%xmm1 3520 por %xmm0,%xmm4 3521 pand `16*($i+2)-128`(%rax),%xmm2 3522 por %xmm1,%xmm5 3523 pand `16*($i+3)-128`(%rax),%xmm3 3524 por %xmm2,%xmm4 3525 por %xmm3,%xmm5 3526___ 3527} 3528$code.=<<___; 3529 por %xmm5,%xmm4 3530 lea $STRIDE(%r11),%r11 3531 pshufd \$0x4e,%xmm4,%xmm0 3532 por %xmm4,%xmm0 3533 movq %xmm0,($out) # m0=bp[0] 3534 lea 8($out),$out 3535 sub \$1,$num 3536 jnz .Lgather 3537 3538 lea (%r10),%rsp 3539.cfi_def_cfa_register %rsp 3540 ret 3541.LSEH_end_bn_gather5: 3542.cfi_endproc 3543.size bn_gather5,.-bn_gather5 3544___ 3545} 3546$code.=<<___; 3547.align 64 3548.Linc: 3549 .long 0,0, 1,1 3550 .long 2,2, 2,2 3551.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 3552___ 3553 3554# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3555# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3556if ($win64) { 3557$rec="%rcx"; 3558$frame="%rdx"; 3559$context="%r8"; 3560$disp="%r9"; 3561 3562$code.=<<___; 3563.extern __imp_RtlVirtualUnwind 3564.type mul_handler,\@abi-omnipotent 3565.align 16 3566mul_handler: 3567 push %rsi 3568 push %rdi 3569 push %rbx 3570 push %rbp 3571 push %r12 3572 push %r13 3573 push %r14 3574 push %r15 3575 pushfq 3576 sub \$64,%rsp 3577 3578 mov 120($context),%rax # pull context->Rax 3579 mov 248($context),%rbx # pull context->Rip 3580 3581 mov 8($disp),%rsi # disp->ImageBase 3582 mov 56($disp),%r11 # disp->HandlerData 3583 3584 mov 0(%r11),%r10d # HandlerData[0] 3585 lea (%rsi,%r10),%r10 # end of prologue label 3586 cmp %r10,%rbx # context->Rip<end of prologue label 3587 jb .Lcommon_seh_tail 3588 3589 mov 4(%r11),%r10d # HandlerData[1] 3590 lea (%rsi,%r10),%r10 # beginning of body label 3591 cmp %r10,%rbx # context->Rip<body label 3592 jb .Lcommon_pop_regs 3593 3594 mov 152($context),%rax # pull context->Rsp 3595 3596 mov 8(%r11),%r10d # HandlerData[2] 3597 lea (%rsi,%r10),%r10 # epilogue label 3598 cmp %r10,%rbx # context->Rip>=epilogue label 3599 jae .Lcommon_seh_tail 3600 3601 lea .Lmul_epilogue(%rip),%r10 3602 cmp %r10,%rbx 3603 ja .Lbody_40 3604 3605 mov 192($context),%r10 # pull $num 3606 mov 8(%rax,%r10,8),%rax # pull saved stack pointer 3607 3608 jmp .Lcommon_pop_regs 3609 3610.Lbody_40: 3611 mov 40(%rax),%rax # pull saved stack pointer 3612.Lcommon_pop_regs: 3613 mov -8(%rax),%rbx 3614 mov -16(%rax),%rbp 3615 mov -24(%rax),%r12 3616 mov -32(%rax),%r13 3617 mov -40(%rax),%r14 3618 mov -48(%rax),%r15 3619 mov %rbx,144($context) # restore context->Rbx 3620 mov %rbp,160($context) # restore context->Rbp 3621 mov %r12,216($context) # restore context->R12 3622 mov %r13,224($context) # restore context->R13 3623 mov %r14,232($context) # restore context->R14 3624 mov %r15,240($context) # restore context->R15 3625 3626.Lcommon_seh_tail: 3627 mov 8(%rax),%rdi 3628 mov 
16(%rax),%rsi 3629 mov %rax,152($context) # restore context->Rsp 3630 mov %rsi,168($context) # restore context->Rsi 3631 mov %rdi,176($context) # restore context->Rdi 3632 3633 mov 40($disp),%rdi # disp->ContextRecord 3634 mov $context,%rsi # context 3635 mov \$154,%ecx # sizeof(CONTEXT) 3636 .long 0xa548f3fc # cld; rep movsq 3637 3638 mov $disp,%rsi 3639 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3640 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3641 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3642 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3643 mov 40(%rsi),%r10 # disp->ContextRecord 3644 lea 56(%rsi),%r11 # &disp->HandlerData 3645 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3646 mov %r10,32(%rsp) # arg5 3647 mov %r11,40(%rsp) # arg6 3648 mov %r12,48(%rsp) # arg7 3649 mov %rcx,56(%rsp) # arg8, (NULL) 3650 call *__imp_RtlVirtualUnwind(%rip) 3651 3652 mov \$1,%eax # ExceptionContinueSearch 3653 add \$64,%rsp 3654 popfq 3655 pop %r15 3656 pop %r14 3657 pop %r13 3658 pop %r12 3659 pop %rbp 3660 pop %rbx 3661 pop %rdi 3662 pop %rsi 3663 ret 3664.size mul_handler,.-mul_handler 3665 3666.section .pdata 3667.align 4 3668 .rva .LSEH_begin_bn_mul_mont_gather5 3669 .rva .LSEH_end_bn_mul_mont_gather5 3670 .rva .LSEH_info_bn_mul_mont_gather5 3671 3672 .rva .LSEH_begin_bn_mul4x_mont_gather5 3673 .rva .LSEH_end_bn_mul4x_mont_gather5 3674 .rva .LSEH_info_bn_mul4x_mont_gather5 3675 3676 .rva .LSEH_begin_bn_power5 3677 .rva .LSEH_end_bn_power5 3678 .rva .LSEH_info_bn_power5 3679___ 3680$code.=<<___ if ($addx); 3681 .rva .LSEH_begin_bn_mulx4x_mont_gather5 3682 .rva .LSEH_end_bn_mulx4x_mont_gather5 3683 .rva .LSEH_info_bn_mulx4x_mont_gather5 3684 3685 .rva .LSEH_begin_bn_powerx5 3686 .rva .LSEH_end_bn_powerx5 3687 .rva .LSEH_info_bn_powerx5 3688___ 3689$code.=<<___; 3690 .rva .LSEH_begin_bn_gather5 3691 .rva .LSEH_end_bn_gather5 3692 .rva .LSEH_info_bn_gather5 3693 3694.section .xdata 3695.align 8 3696.LSEH_info_bn_mul_mont_gather5: 3697 .byte 9,0,0,0 3698 .rva mul_handler 3699 .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[] 3700.align 8 3701.LSEH_info_bn_mul4x_mont_gather5: 3702 .byte 9,0,0,0 3703 .rva mul_handler 3704 .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] 3705.align 8 3706.LSEH_info_bn_power5: 3707 .byte 9,0,0,0 3708 .rva mul_handler 3709 .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] 3710___ 3711$code.=<<___ if ($addx); 3712.align 8 3713.LSEH_info_bn_mulx4x_mont_gather5: 3714 .byte 9,0,0,0 3715 .rva mul_handler 3716 .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] 3717.align 8 3718.LSEH_info_bn_powerx5: 3719 .byte 9,0,0,0 3720 .rva mul_handler 3721 .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] 3722___ 3723$code.=<<___; 3724.align 8 3725.LSEH_info_bn_gather5: 3726 .byte 0x01,0x0b,0x03,0x0a 3727 .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 3728 .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp) 3729.align 8 3730___ 3731} 3732 3733$code =~ s/\`([^\`]*)\`/eval($1)/gem; 3734 3735print $code; 3736close STDOUT or die "error closing STDOUT: $!"; 3737
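################################################################
# Recap for the Win64 unwinder (descriptive only): each HandlerData
# triple above names a function's prologue, body and epilogue labels.
# mul_handler compares the faulting RIP against them to decide how
# much state to undo: before the prologue label the non-volatile
# registers still hold their original values, so nothing is restored;
# before the body label only the pushed registers are recovered from
# just below the entry %rsp; once inside the body the original %rsp
# is first pulled from 40(%rax) (the 4x/power frames) or from
# 8(%rax,$num,8) (the 1x bn_mul_mont_gather5 frame), and then
# %rbx/%rbp/%r12-%r15 are reloaded from just below it.
################################################################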