1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9 10# December 2007 11 12# The reason for undertaken effort is basically following. Even though 13# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI 14# performance was observed to be less than impressive, essentially as 15# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope. 16# Well, it's not surprising that IBM had to make some sacrifices to 17# boost the clock frequency that much, but no overall improvement? 18# Having observed how much difference did switching to FPU make on 19# UltraSPARC, playing same stunt on Power 6 appeared appropriate... 20# Unfortunately the resulting performance improvement is not as 21# impressive, ~30%, and in absolute terms is still very far from what 22# one would expect from 4.7GHz CPU. There is a chance that I'm doing 23# something wrong, but in the lack of assembler level micro-profiling 24# data or at least decent platform guide I can't tell... Or better 25# results might be achieved with VMX... Anyway, this module provides 26# *worse* performance on other PowerPC implementations, ~40-15% slower 27# on PPC970 depending on key length and ~40% slower on Power 5 for all 28# key lengths. As it's obviously inappropriate as "best all-round" 29# alternative, it has to be complemented with run-time CPU family 30# detection. Oh! It should also be noted that unlike other PowerPC 31# implementation IALU ppc-mont.pl module performs *suboptimaly* on 32# >=1024-bit key lengths on Power 6. It should also be noted that 33# *everything* said so far applies to 64-bit builds! As far as 32-bit 34# application executed on 64-bit CPU goes, this module is likely to 35# become preferred choice, because it's easy to adapt it for such 36# case and *is* faster than 32-bit ppc-mont.pl on *all* processors. 37 38# February 2008 39 40# Micro-profiling assisted optimization results in ~15% improvement 41# over original ppc64-mont.pl version, or overall ~50% improvement 42# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same 43# Power 6 CPU, this module is 5-150% faster depending on key length, 44# [hereafter] more for longer keys. But if compared to ppc-mont.pl 45# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive 46# in absolute terms, but it's apparently the way Power 6 is... 47 48# December 2009 49 50# Adapted for 32-bit build this module delivers 25-120%, yes, more 51# than *twice* for longer keys, performance improvement over 32-bit 52# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes 53# even 64-bit integer operations and the trouble is that most PPC 54# operating systems don't preserve upper halves of general purpose 55# registers upon 32-bit signal delivery. They do preserve them upon 56# context switch, but not signalling:-( This means that asynchronous 57# signals have to be blocked upon entry to this subroutine. Signal 58# masking (and of course complementary unmasking) has quite an impact 59# on performance, naturally larger for shorter keys. It's so severe 60# that 512-bit key performance can be as low as 1/3 of expected one. 61# This is why this routine can be engaged for longer key operations 62# only on these OSes, see crypto/ppccap.c for further details. MacOS X 63# is an exception from this and doesn't require signal masking, and 64# that's where above improvement coefficients were collected. For 65# others alternative would be to break dependence on upper halves of 66# GPRs by sticking to 32-bit integer operations... 67 68$flavour = shift; 69 70if ($flavour =~ /32/) { 71 $SIZE_T=4; 72 $RZONE= 224; 73 $fname= "bn_mul_mont_fpu64"; 74 75 $STUX= "stwux"; # store indexed and update 76 $PUSH= "stw"; 77 $POP= "lwz"; 78} elsif ($flavour =~ /64/) { 79 $SIZE_T=8; 80 $RZONE= 288; 81 $fname= "bn_mul_mont_fpu64"; 82 83 # same as above, but 64-bit mnemonics... 84 $STUX= "stdux"; # store indexed and update 85 $PUSH= "std"; 86 $POP= "ld"; 87} else { die "nonsense $flavour"; } 88 89$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 90( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 91( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 92die "can't locate ppc-xlate.pl"; 93 94open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; 95 96$FRAME=64; # padded frame header 97$TRANSFER=16*8; 98 99$carry="r0"; 100$sp="r1"; 101$toc="r2"; 102$rp="r3"; $ovf="r3"; 103$ap="r4"; 104$bp="r5"; 105$np="r6"; 106$n0="r7"; 107$num="r8"; 108$rp="r9"; # $rp is reassigned 109$tp="r10"; 110$j="r11"; 111$i="r12"; 112# non-volatile registers 113$nap_d="r22"; # interleaved ap and np in double format 114$a0="r23"; # ap[0] 115$t0="r24"; # temporary registers 116$t1="r25"; 117$t2="r26"; 118$t3="r27"; 119$t4="r28"; 120$t5="r29"; 121$t6="r30"; 122$t7="r31"; 123 124# PPC offers enough register bank capacity to unroll inner loops twice 125# 126# ..A3A2A1A0 127# dcba 128# ----------- 129# A0a 130# A0b 131# A0c 132# A0d 133# A1a 134# A1b 135# A1c 136# A1d 137# A2a 138# A2b 139# A2c 140# A2d 141# A3a 142# A3b 143# A3c 144# A3d 145# ..a 146# ..b 147# 148$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3"; 149$na="f4"; $nb="f5"; $nc="f6"; $nd="f7"; 150$dota="f8"; $dotb="f9"; 151$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13"; 152$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23"; 153$T0a="f24"; $T0b="f25"; 154$T1a="f26"; $T1b="f27"; 155$T2a="f28"; $T2b="f29"; 156$T3a="f30"; $T3b="f31"; 157 158# sp----------->+-------------------------------+ 159# | saved sp | 160# +-------------------------------+ 161# . . 162# +64 +-------------------------------+ 163# | 16 gpr<->fpr transfer zone | 164# . . 165# . . 166# +16*8 +-------------------------------+ 167# | __int64 tmp[-1] | 168# +-------------------------------+ 169# | __int64 tmp[num] | 170# . . 171# . . 172# . . 173# +(num+1)*8 +-------------------------------+ 174# | padding to 64 byte boundary | 175# . . 176# +X +-------------------------------+ 177# | double nap_d[4*num] | 178# . . 179# . . 180# . . 181# +-------------------------------+ 182# . . 183# -12*size_t +-------------------------------+ 184# | 10 saved gpr, r22-r31 | 185# . . 186# . . 187# -12*8 +-------------------------------+ 188# | 12 saved fpr, f20-f31 | 189# . . 190# . . 191# +-------------------------------+ 192 193$code=<<___; 194.machine "any" 195.text 196 197.globl .$fname 198.align 5 199.$fname: 200 cmpwi $num,`3*8/$SIZE_T` 201 mr $rp,r3 ; $rp is reassigned 202 li r3,0 ; possible "not handled" return code 203 bltlr- 204 andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even" 205 bnelr- 206 207 slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG) 208 li $i,-4096 209 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num 210 add $tp,$tp,$num ; place for tp[num+1] 211 addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE` 212 subf $tp,$tp,$sp ; $sp-$tp 213 and $tp,$tp,$i ; minimize TLB usage 214 subf $tp,$sp,$tp ; $tp-$sp 215 mr $i,$sp 216 $STUX $sp,$sp,$tp ; alloca 217 218 $PUSH r22,`-12*8-10*$SIZE_T`($i) 219 $PUSH r23,`-12*8-9*$SIZE_T`($i) 220 $PUSH r24,`-12*8-8*$SIZE_T`($i) 221 $PUSH r25,`-12*8-7*$SIZE_T`($i) 222 $PUSH r26,`-12*8-6*$SIZE_T`($i) 223 $PUSH r27,`-12*8-5*$SIZE_T`($i) 224 $PUSH r28,`-12*8-4*$SIZE_T`($i) 225 $PUSH r29,`-12*8-3*$SIZE_T`($i) 226 $PUSH r30,`-12*8-2*$SIZE_T`($i) 227 $PUSH r31,`-12*8-1*$SIZE_T`($i) 228 stfd f20,`-12*8`($i) 229 stfd f21,`-11*8`($i) 230 stfd f22,`-10*8`($i) 231 stfd f23,`-9*8`($i) 232 stfd f24,`-8*8`($i) 233 stfd f25,`-7*8`($i) 234 stfd f26,`-6*8`($i) 235 stfd f27,`-5*8`($i) 236 stfd f28,`-4*8`($i) 237 stfd f29,`-3*8`($i) 238 stfd f30,`-2*8`($i) 239 stfd f31,`-1*8`($i) 240___ 241$code.=<<___ if ($SIZE_T==8); 242 ld $a0,0($ap) ; pull ap[0] value 243 ld $n0,0($n0) ; pull n0[0] value 244 ld $t3,0($bp) ; bp[0] 245___ 246$code.=<<___ if ($SIZE_T==4); 247 mr $t1,$n0 248 lwz $a0,0($ap) ; pull ap[0,1] value 249 lwz $t0,4($ap) 250 lwz $n0,0($t1) ; pull n0[0,1] value 251 lwz $t1,4($t1) 252 lwz $t3,0($bp) ; bp[0,1] 253 lwz $t2,4($bp) 254 insrdi $a0,$t0,32,0 255 insrdi $n0,$t1,32,0 256 insrdi $t3,$t2,32,0 257___ 258$code.=<<___; 259 addi $tp,$sp,`$FRAME+$TRANSFER+8+64` 260 li $i,-64 261 add $nap_d,$tp,$num 262 and $nap_d,$nap_d,$i ; align to 64 bytes 263 264 mulld $t7,$a0,$t3 ; ap[0]*bp[0] 265 ; nap_d is off by 1, because it's used with stfdu/lfdu 266 addi $nap_d,$nap_d,-8 267 srwi $j,$num,`3+1` ; counter register, num/2 268 mulld $t7,$t7,$n0 ; tp[0]*n0 269 addi $j,$j,-1 270 addi $tp,$sp,`$FRAME+$TRANSFER-8` 271 li $carry,0 272 mtctr $j 273 274 ; transfer bp[0] to FPU as 4x16-bit values 275 extrdi $t0,$t3,16,48 276 extrdi $t1,$t3,16,32 277 extrdi $t2,$t3,16,16 278 extrdi $t3,$t3,16,0 279 std $t0,`$FRAME+0`($sp) 280 std $t1,`$FRAME+8`($sp) 281 std $t2,`$FRAME+16`($sp) 282 std $t3,`$FRAME+24`($sp) 283 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values 284 extrdi $t4,$t7,16,48 285 extrdi $t5,$t7,16,32 286 extrdi $t6,$t7,16,16 287 extrdi $t7,$t7,16,0 288 std $t4,`$FRAME+32`($sp) 289 std $t5,`$FRAME+40`($sp) 290 std $t6,`$FRAME+48`($sp) 291 std $t7,`$FRAME+56`($sp) 292___ 293$code.=<<___ if ($SIZE_T==8); 294 lwz $t0,4($ap) ; load a[j] as 32-bit word pair 295 lwz $t1,0($ap) 296 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair 297 lwz $t3,8($ap) 298 lwz $t4,4($np) ; load n[j] as 32-bit word pair 299 lwz $t5,0($np) 300 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair 301 lwz $t7,8($np) 302___ 303$code.=<<___ if ($SIZE_T==4); 304 lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs 305 lwz $t1,4($ap) 306 lwz $t2,8($ap) 307 lwz $t3,12($ap) 308 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs 309 lwz $t5,4($np) 310 lwz $t6,8($np) 311 lwz $t7,12($np) 312___ 313$code.=<<___; 314 lfd $ba,`$FRAME+0`($sp) 315 lfd $bb,`$FRAME+8`($sp) 316 lfd $bc,`$FRAME+16`($sp) 317 lfd $bd,`$FRAME+24`($sp) 318 lfd $na,`$FRAME+32`($sp) 319 lfd $nb,`$FRAME+40`($sp) 320 lfd $nc,`$FRAME+48`($sp) 321 lfd $nd,`$FRAME+56`($sp) 322 std $t0,`$FRAME+64`($sp) 323 std $t1,`$FRAME+72`($sp) 324 std $t2,`$FRAME+80`($sp) 325 std $t3,`$FRAME+88`($sp) 326 std $t4,`$FRAME+96`($sp) 327 std $t5,`$FRAME+104`($sp) 328 std $t6,`$FRAME+112`($sp) 329 std $t7,`$FRAME+120`($sp) 330 fcfid $ba,$ba 331 fcfid $bb,$bb 332 fcfid $bc,$bc 333 fcfid $bd,$bd 334 fcfid $na,$na 335 fcfid $nb,$nb 336 fcfid $nc,$nc 337 fcfid $nd,$nd 338 339 lfd $A0,`$FRAME+64`($sp) 340 lfd $A1,`$FRAME+72`($sp) 341 lfd $A2,`$FRAME+80`($sp) 342 lfd $A3,`$FRAME+88`($sp) 343 lfd $N0,`$FRAME+96`($sp) 344 lfd $N1,`$FRAME+104`($sp) 345 lfd $N2,`$FRAME+112`($sp) 346 lfd $N3,`$FRAME+120`($sp) 347 fcfid $A0,$A0 348 fcfid $A1,$A1 349 fcfid $A2,$A2 350 fcfid $A3,$A3 351 fcfid $N0,$N0 352 fcfid $N1,$N1 353 fcfid $N2,$N2 354 fcfid $N3,$N3 355 addi $ap,$ap,16 356 addi $np,$np,16 357 358 fmul $T1a,$A1,$ba 359 fmul $T1b,$A1,$bb 360 stfd $A0,8($nap_d) ; save a[j] in double format 361 stfd $A1,16($nap_d) 362 fmul $T2a,$A2,$ba 363 fmul $T2b,$A2,$bb 364 stfd $A2,24($nap_d) ; save a[j+1] in double format 365 stfd $A3,32($nap_d) 366 fmul $T3a,$A3,$ba 367 fmul $T3b,$A3,$bb 368 stfd $N0,40($nap_d) ; save n[j] in double format 369 stfd $N1,48($nap_d) 370 fmul $T0a,$A0,$ba 371 fmul $T0b,$A0,$bb 372 stfd $N2,56($nap_d) ; save n[j+1] in double format 373 stfdu $N3,64($nap_d) 374 375 fmadd $T1a,$A0,$bc,$T1a 376 fmadd $T1b,$A0,$bd,$T1b 377 fmadd $T2a,$A1,$bc,$T2a 378 fmadd $T2b,$A1,$bd,$T2b 379 fmadd $T3a,$A2,$bc,$T3a 380 fmadd $T3b,$A2,$bd,$T3b 381 fmul $dota,$A3,$bc 382 fmul $dotb,$A3,$bd 383 384 fmadd $T1a,$N1,$na,$T1a 385 fmadd $T1b,$N1,$nb,$T1b 386 fmadd $T2a,$N2,$na,$T2a 387 fmadd $T2b,$N2,$nb,$T2b 388 fmadd $T3a,$N3,$na,$T3a 389 fmadd $T3b,$N3,$nb,$T3b 390 fmadd $T0a,$N0,$na,$T0a 391 fmadd $T0b,$N0,$nb,$T0b 392 393 fmadd $T1a,$N0,$nc,$T1a 394 fmadd $T1b,$N0,$nd,$T1b 395 fmadd $T2a,$N1,$nc,$T2a 396 fmadd $T2b,$N1,$nd,$T2b 397 fmadd $T3a,$N2,$nc,$T3a 398 fmadd $T3b,$N2,$nd,$T3b 399 fmadd $dota,$N3,$nc,$dota 400 fmadd $dotb,$N3,$nd,$dotb 401 402 fctid $T0a,$T0a 403 fctid $T0b,$T0b 404 fctid $T1a,$T1a 405 fctid $T1b,$T1b 406 fctid $T2a,$T2a 407 fctid $T2b,$T2b 408 fctid $T3a,$T3a 409 fctid $T3b,$T3b 410 411 stfd $T0a,`$FRAME+0`($sp) 412 stfd $T0b,`$FRAME+8`($sp) 413 stfd $T1a,`$FRAME+16`($sp) 414 stfd $T1b,`$FRAME+24`($sp) 415 stfd $T2a,`$FRAME+32`($sp) 416 stfd $T2b,`$FRAME+40`($sp) 417 stfd $T3a,`$FRAME+48`($sp) 418 stfd $T3b,`$FRAME+56`($sp) 419 420.align 5 421L1st: 422___ 423$code.=<<___ if ($SIZE_T==8); 424 lwz $t0,4($ap) ; load a[j] as 32-bit word pair 425 lwz $t1,0($ap) 426 lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair 427 lwz $t3,8($ap) 428 lwz $t4,4($np) ; load n[j] as 32-bit word pair 429 lwz $t5,0($np) 430 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair 431 lwz $t7,8($np) 432___ 433$code.=<<___ if ($SIZE_T==4); 434 lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs 435 lwz $t1,4($ap) 436 lwz $t2,8($ap) 437 lwz $t3,12($ap) 438 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs 439 lwz $t5,4($np) 440 lwz $t6,8($np) 441 lwz $t7,12($np) 442___ 443$code.=<<___; 444 std $t0,`$FRAME+64`($sp) 445 std $t1,`$FRAME+72`($sp) 446 std $t2,`$FRAME+80`($sp) 447 std $t3,`$FRAME+88`($sp) 448 std $t4,`$FRAME+96`($sp) 449 std $t5,`$FRAME+104`($sp) 450 std $t6,`$FRAME+112`($sp) 451 std $t7,`$FRAME+120`($sp) 452 ld $t0,`$FRAME+0`($sp) 453 ld $t1,`$FRAME+8`($sp) 454 ld $t2,`$FRAME+16`($sp) 455 ld $t3,`$FRAME+24`($sp) 456 ld $t4,`$FRAME+32`($sp) 457 ld $t5,`$FRAME+40`($sp) 458 ld $t6,`$FRAME+48`($sp) 459 ld $t7,`$FRAME+56`($sp) 460 lfd $A0,`$FRAME+64`($sp) 461 lfd $A1,`$FRAME+72`($sp) 462 lfd $A2,`$FRAME+80`($sp) 463 lfd $A3,`$FRAME+88`($sp) 464 lfd $N0,`$FRAME+96`($sp) 465 lfd $N1,`$FRAME+104`($sp) 466 lfd $N2,`$FRAME+112`($sp) 467 lfd $N3,`$FRAME+120`($sp) 468 fcfid $A0,$A0 469 fcfid $A1,$A1 470 fcfid $A2,$A2 471 fcfid $A3,$A3 472 fcfid $N0,$N0 473 fcfid $N1,$N1 474 fcfid $N2,$N2 475 fcfid $N3,$N3 476 addi $ap,$ap,16 477 addi $np,$np,16 478 479 fmul $T1a,$A1,$ba 480 fmul $T1b,$A1,$bb 481 fmul $T2a,$A2,$ba 482 fmul $T2b,$A2,$bb 483 stfd $A0,8($nap_d) ; save a[j] in double format 484 stfd $A1,16($nap_d) 485 fmul $T3a,$A3,$ba 486 fmul $T3b,$A3,$bb 487 fmadd $T0a,$A0,$ba,$dota 488 fmadd $T0b,$A0,$bb,$dotb 489 stfd $A2,24($nap_d) ; save a[j+1] in double format 490 stfd $A3,32($nap_d) 491 492 fmadd $T1a,$A0,$bc,$T1a 493 fmadd $T1b,$A0,$bd,$T1b 494 fmadd $T2a,$A1,$bc,$T2a 495 fmadd $T2b,$A1,$bd,$T2b 496 stfd $N0,40($nap_d) ; save n[j] in double format 497 stfd $N1,48($nap_d) 498 fmadd $T3a,$A2,$bc,$T3a 499 fmadd $T3b,$A2,$bd,$T3b 500 add $t0,$t0,$carry ; can not overflow 501 fmul $dota,$A3,$bc 502 fmul $dotb,$A3,$bd 503 stfd $N2,56($nap_d) ; save n[j+1] in double format 504 stfdu $N3,64($nap_d) 505 srdi $carry,$t0,16 506 add $t1,$t1,$carry 507 srdi $carry,$t1,16 508 509 fmadd $T1a,$N1,$na,$T1a 510 fmadd $T1b,$N1,$nb,$T1b 511 insrdi $t0,$t1,16,32 512 fmadd $T2a,$N2,$na,$T2a 513 fmadd $T2b,$N2,$nb,$T2b 514 add $t2,$t2,$carry 515 fmadd $T3a,$N3,$na,$T3a 516 fmadd $T3b,$N3,$nb,$T3b 517 srdi $carry,$t2,16 518 fmadd $T0a,$N0,$na,$T0a 519 fmadd $T0b,$N0,$nb,$T0b 520 insrdi $t0,$t2,16,16 521 add $t3,$t3,$carry 522 srdi $carry,$t3,16 523 524 fmadd $T1a,$N0,$nc,$T1a 525 fmadd $T1b,$N0,$nd,$T1b 526 insrdi $t0,$t3,16,0 ; 0..63 bits 527 fmadd $T2a,$N1,$nc,$T2a 528 fmadd $T2b,$N1,$nd,$T2b 529 add $t4,$t4,$carry 530 fmadd $T3a,$N2,$nc,$T3a 531 fmadd $T3b,$N2,$nd,$T3b 532 srdi $carry,$t4,16 533 fmadd $dota,$N3,$nc,$dota 534 fmadd $dotb,$N3,$nd,$dotb 535 add $t5,$t5,$carry 536 srdi $carry,$t5,16 537 insrdi $t4,$t5,16,32 538 539 fctid $T0a,$T0a 540 fctid $T0b,$T0b 541 add $t6,$t6,$carry 542 fctid $T1a,$T1a 543 fctid $T1b,$T1b 544 srdi $carry,$t6,16 545 fctid $T2a,$T2a 546 fctid $T2b,$T2b 547 insrdi $t4,$t6,16,16 548 fctid $T3a,$T3a 549 fctid $T3b,$T3b 550 add $t7,$t7,$carry 551 insrdi $t4,$t7,16,0 ; 64..127 bits 552 srdi $carry,$t7,16 ; upper 33 bits 553 554 stfd $T0a,`$FRAME+0`($sp) 555 stfd $T0b,`$FRAME+8`($sp) 556 stfd $T1a,`$FRAME+16`($sp) 557 stfd $T1b,`$FRAME+24`($sp) 558 stfd $T2a,`$FRAME+32`($sp) 559 stfd $T2b,`$FRAME+40`($sp) 560 stfd $T3a,`$FRAME+48`($sp) 561 stfd $T3b,`$FRAME+56`($sp) 562 std $t0,8($tp) ; tp[j-1] 563 stdu $t4,16($tp) ; tp[j] 564 bdnz- L1st 565 566 fctid $dota,$dota 567 fctid $dotb,$dotb 568 569 ld $t0,`$FRAME+0`($sp) 570 ld $t1,`$FRAME+8`($sp) 571 ld $t2,`$FRAME+16`($sp) 572 ld $t3,`$FRAME+24`($sp) 573 ld $t4,`$FRAME+32`($sp) 574 ld $t5,`$FRAME+40`($sp) 575 ld $t6,`$FRAME+48`($sp) 576 ld $t7,`$FRAME+56`($sp) 577 stfd $dota,`$FRAME+64`($sp) 578 stfd $dotb,`$FRAME+72`($sp) 579 580 add $t0,$t0,$carry ; can not overflow 581 srdi $carry,$t0,16 582 add $t1,$t1,$carry 583 srdi $carry,$t1,16 584 insrdi $t0,$t1,16,32 585 add $t2,$t2,$carry 586 srdi $carry,$t2,16 587 insrdi $t0,$t2,16,16 588 add $t3,$t3,$carry 589 srdi $carry,$t3,16 590 insrdi $t0,$t3,16,0 ; 0..63 bits 591 add $t4,$t4,$carry 592 srdi $carry,$t4,16 593 add $t5,$t5,$carry 594 srdi $carry,$t5,16 595 insrdi $t4,$t5,16,32 596 add $t6,$t6,$carry 597 srdi $carry,$t6,16 598 insrdi $t4,$t6,16,16 599 add $t7,$t7,$carry 600 insrdi $t4,$t7,16,0 ; 64..127 bits 601 srdi $carry,$t7,16 ; upper 33 bits 602 ld $t6,`$FRAME+64`($sp) 603 ld $t7,`$FRAME+72`($sp) 604 605 std $t0,8($tp) ; tp[j-1] 606 stdu $t4,16($tp) ; tp[j] 607 608 add $t6,$t6,$carry ; can not overflow 609 srdi $carry,$t6,16 610 add $t7,$t7,$carry 611 insrdi $t6,$t7,48,0 612 srdi $ovf,$t7,48 613 std $t6,8($tp) ; tp[num-1] 614 615 slwi $t7,$num,2 616 subf $nap_d,$t7,$nap_d ; rewind pointer 617 618 li $i,8 ; i=1 619.align 5 620Louter: 621___ 622$code.=<<___ if ($SIZE_T==8); 623 ldx $t3,$bp,$i ; bp[i] 624___ 625$code.=<<___ if ($SIZE_T==4); 626 add $t0,$bp,$i 627 lwz $t3,0($t0) ; bp[i,i+1] 628 lwz $t0,4($t0) 629 insrdi $t3,$t0,32,0 630___ 631$code.=<<___; 632 ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] 633 mulld $t7,$a0,$t3 ; ap[0]*bp[i] 634 635 addi $tp,$sp,`$FRAME+$TRANSFER` 636 add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0] 637 li $carry,0 638 mulld $t7,$t7,$n0 ; tp[0]*n0 639 mtctr $j 640 641 ; transfer bp[i] to FPU as 4x16-bit values 642 extrdi $t0,$t3,16,48 643 extrdi $t1,$t3,16,32 644 extrdi $t2,$t3,16,16 645 extrdi $t3,$t3,16,0 646 std $t0,`$FRAME+0`($sp) 647 std $t1,`$FRAME+8`($sp) 648 std $t2,`$FRAME+16`($sp) 649 std $t3,`$FRAME+24`($sp) 650 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values 651 extrdi $t4,$t7,16,48 652 extrdi $t5,$t7,16,32 653 extrdi $t6,$t7,16,16 654 extrdi $t7,$t7,16,0 655 std $t4,`$FRAME+32`($sp) 656 std $t5,`$FRAME+40`($sp) 657 std $t6,`$FRAME+48`($sp) 658 std $t7,`$FRAME+56`($sp) 659 660 lfd $A0,8($nap_d) ; load a[j] in double format 661 lfd $A1,16($nap_d) 662 lfd $A2,24($nap_d) ; load a[j+1] in double format 663 lfd $A3,32($nap_d) 664 lfd $N0,40($nap_d) ; load n[j] in double format 665 lfd $N1,48($nap_d) 666 lfd $N2,56($nap_d) ; load n[j+1] in double format 667 lfdu $N3,64($nap_d) 668 669 lfd $ba,`$FRAME+0`($sp) 670 lfd $bb,`$FRAME+8`($sp) 671 lfd $bc,`$FRAME+16`($sp) 672 lfd $bd,`$FRAME+24`($sp) 673 lfd $na,`$FRAME+32`($sp) 674 lfd $nb,`$FRAME+40`($sp) 675 lfd $nc,`$FRAME+48`($sp) 676 lfd $nd,`$FRAME+56`($sp) 677 678 fcfid $ba,$ba 679 fcfid $bb,$bb 680 fcfid $bc,$bc 681 fcfid $bd,$bd 682 fcfid $na,$na 683 fcfid $nb,$nb 684 fcfid $nc,$nc 685 fcfid $nd,$nd 686 687 fmul $T1a,$A1,$ba 688 fmul $T1b,$A1,$bb 689 fmul $T2a,$A2,$ba 690 fmul $T2b,$A2,$bb 691 fmul $T3a,$A3,$ba 692 fmul $T3b,$A3,$bb 693 fmul $T0a,$A0,$ba 694 fmul $T0b,$A0,$bb 695 696 fmadd $T1a,$A0,$bc,$T1a 697 fmadd $T1b,$A0,$bd,$T1b 698 fmadd $T2a,$A1,$bc,$T2a 699 fmadd $T2b,$A1,$bd,$T2b 700 fmadd $T3a,$A2,$bc,$T3a 701 fmadd $T3b,$A2,$bd,$T3b 702 fmul $dota,$A3,$bc 703 fmul $dotb,$A3,$bd 704 705 fmadd $T1a,$N1,$na,$T1a 706 fmadd $T1b,$N1,$nb,$T1b 707 lfd $A0,8($nap_d) ; load a[j] in double format 708 lfd $A1,16($nap_d) 709 fmadd $T2a,$N2,$na,$T2a 710 fmadd $T2b,$N2,$nb,$T2b 711 lfd $A2,24($nap_d) ; load a[j+1] in double format 712 lfd $A3,32($nap_d) 713 fmadd $T3a,$N3,$na,$T3a 714 fmadd $T3b,$N3,$nb,$T3b 715 fmadd $T0a,$N0,$na,$T0a 716 fmadd $T0b,$N0,$nb,$T0b 717 718 fmadd $T1a,$N0,$nc,$T1a 719 fmadd $T1b,$N0,$nd,$T1b 720 fmadd $T2a,$N1,$nc,$T2a 721 fmadd $T2b,$N1,$nd,$T2b 722 fmadd $T3a,$N2,$nc,$T3a 723 fmadd $T3b,$N2,$nd,$T3b 724 fmadd $dota,$N3,$nc,$dota 725 fmadd $dotb,$N3,$nd,$dotb 726 727 fctid $T0a,$T0a 728 fctid $T0b,$T0b 729 fctid $T1a,$T1a 730 fctid $T1b,$T1b 731 fctid $T2a,$T2a 732 fctid $T2b,$T2b 733 fctid $T3a,$T3a 734 fctid $T3b,$T3b 735 736 stfd $T0a,`$FRAME+0`($sp) 737 stfd $T0b,`$FRAME+8`($sp) 738 stfd $T1a,`$FRAME+16`($sp) 739 stfd $T1b,`$FRAME+24`($sp) 740 stfd $T2a,`$FRAME+32`($sp) 741 stfd $T2b,`$FRAME+40`($sp) 742 stfd $T3a,`$FRAME+48`($sp) 743 stfd $T3b,`$FRAME+56`($sp) 744 745.align 5 746Linner: 747 fmul $T1a,$A1,$ba 748 fmul $T1b,$A1,$bb 749 fmul $T2a,$A2,$ba 750 fmul $T2b,$A2,$bb 751 lfd $N0,40($nap_d) ; load n[j] in double format 752 lfd $N1,48($nap_d) 753 fmul $T3a,$A3,$ba 754 fmul $T3b,$A3,$bb 755 fmadd $T0a,$A0,$ba,$dota 756 fmadd $T0b,$A0,$bb,$dotb 757 lfd $N2,56($nap_d) ; load n[j+1] in double format 758 lfdu $N3,64($nap_d) 759 760 fmadd $T1a,$A0,$bc,$T1a 761 fmadd $T1b,$A0,$bd,$T1b 762 fmadd $T2a,$A1,$bc,$T2a 763 fmadd $T2b,$A1,$bd,$T2b 764 lfd $A0,8($nap_d) ; load a[j] in double format 765 lfd $A1,16($nap_d) 766 fmadd $T3a,$A2,$bc,$T3a 767 fmadd $T3b,$A2,$bd,$T3b 768 fmul $dota,$A3,$bc 769 fmul $dotb,$A3,$bd 770 lfd $A2,24($nap_d) ; load a[j+1] in double format 771 lfd $A3,32($nap_d) 772 773 fmadd $T1a,$N1,$na,$T1a 774 fmadd $T1b,$N1,$nb,$T1b 775 ld $t0,`$FRAME+0`($sp) 776 ld $t1,`$FRAME+8`($sp) 777 fmadd $T2a,$N2,$na,$T2a 778 fmadd $T2b,$N2,$nb,$T2b 779 ld $t2,`$FRAME+16`($sp) 780 ld $t3,`$FRAME+24`($sp) 781 fmadd $T3a,$N3,$na,$T3a 782 fmadd $T3b,$N3,$nb,$T3b 783 add $t0,$t0,$carry ; can not overflow 784 ld $t4,`$FRAME+32`($sp) 785 ld $t5,`$FRAME+40`($sp) 786 fmadd $T0a,$N0,$na,$T0a 787 fmadd $T0b,$N0,$nb,$T0b 788 srdi $carry,$t0,16 789 add $t1,$t1,$carry 790 srdi $carry,$t1,16 791 ld $t6,`$FRAME+48`($sp) 792 ld $t7,`$FRAME+56`($sp) 793 794 fmadd $T1a,$N0,$nc,$T1a 795 fmadd $T1b,$N0,$nd,$T1b 796 insrdi $t0,$t1,16,32 797 ld $t1,8($tp) ; tp[j] 798 fmadd $T2a,$N1,$nc,$T2a 799 fmadd $T2b,$N1,$nd,$T2b 800 add $t2,$t2,$carry 801 fmadd $T3a,$N2,$nc,$T3a 802 fmadd $T3b,$N2,$nd,$T3b 803 srdi $carry,$t2,16 804 insrdi $t0,$t2,16,16 805 fmadd $dota,$N3,$nc,$dota 806 fmadd $dotb,$N3,$nd,$dotb 807 add $t3,$t3,$carry 808 ldu $t2,16($tp) ; tp[j+1] 809 srdi $carry,$t3,16 810 insrdi $t0,$t3,16,0 ; 0..63 bits 811 add $t4,$t4,$carry 812 813 fctid $T0a,$T0a 814 fctid $T0b,$T0b 815 srdi $carry,$t4,16 816 fctid $T1a,$T1a 817 fctid $T1b,$T1b 818 add $t5,$t5,$carry 819 fctid $T2a,$T2a 820 fctid $T2b,$T2b 821 srdi $carry,$t5,16 822 insrdi $t4,$t5,16,32 823 fctid $T3a,$T3a 824 fctid $T3b,$T3b 825 add $t6,$t6,$carry 826 srdi $carry,$t6,16 827 insrdi $t4,$t6,16,16 828 829 stfd $T0a,`$FRAME+0`($sp) 830 stfd $T0b,`$FRAME+8`($sp) 831 add $t7,$t7,$carry 832 addc $t3,$t0,$t1 833___ 834$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 835 extrdi $t0,$t0,32,0 836 extrdi $t1,$t1,32,0 837 adde $t0,$t0,$t1 838___ 839$code.=<<___; 840 stfd $T1a,`$FRAME+16`($sp) 841 stfd $T1b,`$FRAME+24`($sp) 842 insrdi $t4,$t7,16,0 ; 64..127 bits 843 srdi $carry,$t7,16 ; upper 33 bits 844 stfd $T2a,`$FRAME+32`($sp) 845 stfd $T2b,`$FRAME+40`($sp) 846 adde $t5,$t4,$t2 847___ 848$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 849 extrdi $t4,$t4,32,0 850 extrdi $t2,$t2,32,0 851 adde $t4,$t4,$t2 852___ 853$code.=<<___; 854 stfd $T3a,`$FRAME+48`($sp) 855 stfd $T3b,`$FRAME+56`($sp) 856 addze $carry,$carry 857 std $t3,-16($tp) ; tp[j-1] 858 std $t5,-8($tp) ; tp[j] 859 bdnz- Linner 860 861 fctid $dota,$dota 862 fctid $dotb,$dotb 863 ld $t0,`$FRAME+0`($sp) 864 ld $t1,`$FRAME+8`($sp) 865 ld $t2,`$FRAME+16`($sp) 866 ld $t3,`$FRAME+24`($sp) 867 ld $t4,`$FRAME+32`($sp) 868 ld $t5,`$FRAME+40`($sp) 869 ld $t6,`$FRAME+48`($sp) 870 ld $t7,`$FRAME+56`($sp) 871 stfd $dota,`$FRAME+64`($sp) 872 stfd $dotb,`$FRAME+72`($sp) 873 874 add $t0,$t0,$carry ; can not overflow 875 srdi $carry,$t0,16 876 add $t1,$t1,$carry 877 srdi $carry,$t1,16 878 insrdi $t0,$t1,16,32 879 add $t2,$t2,$carry 880 ld $t1,8($tp) ; tp[j] 881 srdi $carry,$t2,16 882 insrdi $t0,$t2,16,16 883 add $t3,$t3,$carry 884 ldu $t2,16($tp) ; tp[j+1] 885 srdi $carry,$t3,16 886 insrdi $t0,$t3,16,0 ; 0..63 bits 887 add $t4,$t4,$carry 888 srdi $carry,$t4,16 889 add $t5,$t5,$carry 890 srdi $carry,$t5,16 891 insrdi $t4,$t5,16,32 892 add $t6,$t6,$carry 893 srdi $carry,$t6,16 894 insrdi $t4,$t6,16,16 895 add $t7,$t7,$carry 896 insrdi $t4,$t7,16,0 ; 64..127 bits 897 srdi $carry,$t7,16 ; upper 33 bits 898 ld $t6,`$FRAME+64`($sp) 899 ld $t7,`$FRAME+72`($sp) 900 901 addc $t3,$t0,$t1 902___ 903$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 904 extrdi $t0,$t0,32,0 905 extrdi $t1,$t1,32,0 906 adde $t0,$t0,$t1 907___ 908$code.=<<___; 909 adde $t5,$t4,$t2 910___ 911$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] 912 extrdi $t4,$t4,32,0 913 extrdi $t2,$t2,32,0 914 adde $t4,$t4,$t2 915___ 916$code.=<<___; 917 addze $carry,$carry 918 919 std $t3,-16($tp) ; tp[j-1] 920 std $t5,-8($tp) ; tp[j] 921 922 add $carry,$carry,$ovf ; comsume upmost overflow 923 add $t6,$t6,$carry ; can not overflow 924 srdi $carry,$t6,16 925 add $t7,$t7,$carry 926 insrdi $t6,$t7,48,0 927 srdi $ovf,$t7,48 928 std $t6,0($tp) ; tp[num-1] 929 930 slwi $t7,$num,2 931 addi $i,$i,8 932 subf $nap_d,$t7,$nap_d ; rewind pointer 933 cmpw $i,$num 934 blt- Louter 935___ 936 937$code.=<<___ if ($SIZE_T==8); 938 subf $np,$num,$np ; rewind np 939 addi $j,$j,1 ; restore counter 940 subfc $i,$i,$i ; j=0 and "clear" XER[CA] 941 addi $tp,$sp,`$FRAME+$TRANSFER+8` 942 addi $t4,$sp,`$FRAME+$TRANSFER+16` 943 addi $t5,$np,8 944 addi $t6,$rp,8 945 mtctr $j 946 947.align 4 948Lsub: ldx $t0,$tp,$i 949 ldx $t1,$np,$i 950 ldx $t2,$t4,$i 951 ldx $t3,$t5,$i 952 subfe $t0,$t1,$t0 ; tp[j]-np[j] 953 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] 954 stdx $t0,$rp,$i 955 stdx $t2,$t6,$i 956 addi $i,$i,16 957 bdnz- Lsub 958 959 li $i,0 960 subfe $ovf,$i,$ovf ; handle upmost overflow bit 961 and $ap,$tp,$ovf 962 andc $np,$rp,$ovf 963 or $ap,$ap,$np ; ap=borrow?tp:rp 964 addi $t7,$ap,8 965 mtctr $j 966 967.align 4 968Lcopy: ; copy or in-place refresh 969 ldx $t0,$ap,$i 970 ldx $t1,$t7,$i 971 std $i,8($nap_d) ; zap nap_d 972 std $i,16($nap_d) 973 std $i,24($nap_d) 974 std $i,32($nap_d) 975 std $i,40($nap_d) 976 std $i,48($nap_d) 977 std $i,56($nap_d) 978 stdu $i,64($nap_d) 979 stdx $t0,$rp,$i 980 stdx $t1,$t6,$i 981 stdx $i,$tp,$i ; zap tp at once 982 stdx $i,$t4,$i 983 addi $i,$i,16 984 bdnz- Lcopy 985___ 986$code.=<<___ if ($SIZE_T==4); 987 subf $np,$num,$np ; rewind np 988 addi $j,$j,1 ; restore counter 989 subfc $i,$i,$i ; j=0 and "clear" XER[CA] 990 addi $tp,$sp,`$FRAME+$TRANSFER` 991 addi $np,$np,-4 992 addi $rp,$rp,-4 993 addi $ap,$sp,`$FRAME+$TRANSFER+4` 994 mtctr $j 995 996.align 4 997Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order 998 ldu $t2,16($tp) 999 lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order 1000 lwz $t5,8($np) 1001 lwz $t6,12($np) 1002 lwzu $t7,16($np) 1003 extrdi $t1,$t0,32,0 1004 extrdi $t3,$t2,32,0 1005 subfe $t4,$t4,$t0 ; tp[j]-np[j] 1006 stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order 1007 subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1] 1008 stw $t1,8($ap) 1009 subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2] 1010 stw $t2,12($ap) 1011 subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3] 1012 stwu $t3,16($ap) 1013 stw $t4,4($rp) 1014 stw $t5,8($rp) 1015 stw $t6,12($rp) 1016 stwu $t7,16($rp) 1017 bdnz- Lsub 1018 1019 li $i,0 1020 subfe $ovf,$i,$ovf ; handle upmost overflow bit 1021 addi $tp,$sp,`$FRAME+$TRANSFER+4` 1022 subf $rp,$num,$rp ; rewind rp 1023 and $ap,$tp,$ovf 1024 andc $np,$rp,$ovf 1025 or $ap,$ap,$np ; ap=borrow?tp:rp 1026 addi $tp,$sp,`$FRAME+$TRANSFER` 1027 mtctr $j 1028 1029.align 4 1030Lcopy: ; copy or in-place refresh 1031 lwz $t0,4($ap) 1032 lwz $t1,8($ap) 1033 lwz $t2,12($ap) 1034 lwzu $t3,16($ap) 1035 std $i,8($nap_d) ; zap nap_d 1036 std $i,16($nap_d) 1037 std $i,24($nap_d) 1038 std $i,32($nap_d) 1039 std $i,40($nap_d) 1040 std $i,48($nap_d) 1041 std $i,56($nap_d) 1042 stdu $i,64($nap_d) 1043 stw $t0,4($rp) 1044 stw $t1,8($rp) 1045 stw $t2,12($rp) 1046 stwu $t3,16($rp) 1047 std $i,8($tp) ; zap tp at once 1048 stdu $i,16($tp) 1049 bdnz- Lcopy 1050___ 1051 1052$code.=<<___; 1053 $POP $i,0($sp) 1054 li r3,1 ; signal "handled" 1055 $POP r22,`-12*8-10*$SIZE_T`($i) 1056 $POP r23,`-12*8-9*$SIZE_T`($i) 1057 $POP r24,`-12*8-8*$SIZE_T`($i) 1058 $POP r25,`-12*8-7*$SIZE_T`($i) 1059 $POP r26,`-12*8-6*$SIZE_T`($i) 1060 $POP r27,`-12*8-5*$SIZE_T`($i) 1061 $POP r28,`-12*8-4*$SIZE_T`($i) 1062 $POP r29,`-12*8-3*$SIZE_T`($i) 1063 $POP r30,`-12*8-2*$SIZE_T`($i) 1064 $POP r31,`-12*8-1*$SIZE_T`($i) 1065 lfd f20,`-12*8`($i) 1066 lfd f21,`-11*8`($i) 1067 lfd f22,`-10*8`($i) 1068 lfd f23,`-9*8`($i) 1069 lfd f24,`-8*8`($i) 1070 lfd f25,`-7*8`($i) 1071 lfd f26,`-6*8`($i) 1072 lfd f27,`-5*8`($i) 1073 lfd f28,`-4*8`($i) 1074 lfd f29,`-3*8`($i) 1075 lfd f30,`-2*8`($i) 1076 lfd f31,`-1*8`($i) 1077 mr $sp,$i 1078 blr 1079 .long 0 1080 .byte 0,12,4,0,0x8c,10,6,0 1081 .long 0 1082 1083.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>" 1084___ 1085 1086$code =~ s/\`([^\`]*)\`/eval $1/gem; 1087print $code; 1088close STDOUT; 1089