#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

use strict;
use warnings;

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
# Because unlike integer multiplier, which simply stalls whole CPU,
# FPU is fully pipelined and can effectively emit 48 bit partial
# product every cycle. Why not blended SPARC v9? One can argue that
# making this module dependent on UltraSPARC VIS extension limits its
# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
# implementations from compatibility matrix. But the rest, whole Sun
# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
# VIS extension instructions used in this module. This is considered
# good enough to not care about HAL SPARC64 users [if any] who have
# integer-only pure SPARCv9 module to "fall down" to.

# USI&II cores currently exhibit uniform 2x improvement [over pre-
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
# performance improves few percents for shorter keys and worsens few
# percents for longer keys. This is because USIII integer multiplier
# is >3x faster than USI&II one, which is harder to match [but see
# TODO list below]. It should also be noted that SPARC64 V features
# out-of-order execution, which *might* mean that integer multiplier
# is pipelined, which in turn *might* be impossible to match... On
# additional note, SPARC64 V implements FP Multiply-Add instruction,
# which is perfectly usable in this context... In other words, as far
# as Fujitsu SPARC64 V goes, talk to the author:-)

# The implementation implies following "non-natural" limitations on
# input arguments:
# - num may not be less than 4;
# - num has to be even;
# Failure to meet either condition has no fatal effects, simply
# doesn't give any performance gain.

# TODO:
# - modulo-schedule inner loop for better performance (on in-order
#   execution core such as UltraSPARC this shall result in further
#   noticeable(!) improvement);
# - dedicated squaring procedure[?];

######################################################################
# November 2006
#
# Modulo-scheduled inner loops allow to interleave floating point and
# integer instructions and minimize Read-After-Write penalties. This
# results in *further* 20-50% performance improvement [depending on
# key length, more for longer keys] on USI&II cores and 30-80% - on
# USIII&IV.

# Optional output file name is the last command-line argument; when it
# is absent the generated assembly goes to the inherited STDOUT.
my $output = pop @ARGV;
if (defined $output) {
	# 3-arg open with explicit error check: a silently failed open
	# would spill the assembly to the terminal instead of the file.
	open STDOUT, '>', $output or die "can't open $output: $!";
}

my $fname="bn_mul_mont_fpu";

my $frame="STACK_FRAME";
my $bias="STACK_BIAS";
my $locals=64;

# In order to provide for 32-/64-bit ABI duality, I keep integers wider
# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
# exclusively for pointers, indexes and other small values...
# int bn_mul_mont(
my $rp="%i0";	# BN_ULONG *rp,
my $ap="%i1";	# const BN_ULONG *ap,
my $bp="%i2";	# const BN_ULONG *bp,
my $np="%i3";	# const BN_ULONG *np,
my $n0="%i4";	# const BN_ULONG *n0,
my $num="%i5";	# int num);

my $tp="%l0";	# t[num]
my $ap_l="%l1";	# a[num],n[num] are smashed to 32-bit words and saved
my $ap_h="%l2";	# to these four vectors as double-precision FP values.
my $np_l="%l3";	# This way a bunch of fxtods are eliminated in second
my $np_h="%l4";	# loop and L1-cache aliasing is minimized...
my $i="%l5";
my $j="%l6";
my $mask="%l7";	# 16-bit mask, 0xffff

$n0="%g4";	# reassigned(!) to "64-bit" register
my $carry="%i4";	# %i4 reused(!) for a carry bit

# FP register naming chart
#
#     ..HILO
#       dcba
#   --------
#        LOa
#       LOb
#      LOc
#     LOd
#      HIa
#     HIb
#    HIc
#   HId
#     ..a
#    ..b
my $ba="%f0";   my $bb="%f2";   my $bc="%f4";   my $bd="%f6";
my $na="%f8";   my $nb="%f10";  my $nc="%f12";  my $nd="%f14";
my $alo="%f16"; my $alo_="%f17"; my $ahi="%f18"; my $ahi_="%f19";
my $nlo="%f20"; my $nlo_="%f21"; my $nhi="%f22"; my $nhi_="%f23";

my $dota="%f24"; my $dotb="%f26";

my $aloa="%f32"; my $alob="%f34"; my $aloc="%f36"; my $alod="%f38";
my $ahia="%f40"; my $ahib="%f42"; my $ahic="%f44"; my $ahid="%f46";
my $nloa="%f48"; my $nlob="%f50"; my $nloc="%f52"; my $nlod="%f54";
my $nhia="%f56"; my $nhib="%f58"; my $nhic="%f60"; my $nhid="%f62";

my $ASI_FL16_P=0xD2;	# magic ASI value to engage 16-bit FP load

# Assembly template; $-variables above are interpolated into it.
my $code=<<___;
#include "sparc_arch.h"

.section	".text",#alloc,#execinstr

.global $fname
.align  32
$fname:
	save	%sp,-$frame-$locals,%sp

	cmp	$num,4
	bl,a,pn %icc,.Lret
	clr	%i0
	andcc	$num,1,%g0		! $num has to be even...
	bnz,a,pn %icc,.Lret
	clr	%i0			! signal "unsupported input value"

	srl	$num,1,$num
	sethi	%hi(0xffff),$mask
	ld	[%i4+0],$n0		! $n0 reassigned, remember?
	or	$mask,%lo(0xffff),$mask
	ld	[%i4+4],%o0
	sllx	%o0,32,%o0
	or	%o0,$n0,$n0		! $n0=n0[1].n0[0]

	sll	$num,3,$num		! num*=8

	add	%sp,$bias,%o0		! real top of stack
	sll	$num,2,%o1
	add	%o1,$num,%o1		! %o1=num*5
	sub	%o0,%o1,%o0
	and	%o0,-2048,%o0		! optimize TLB utilization
	sub	%o0,$bias,%sp		! alloca(5*num*8)

	rd	%asi,%o7		! save %asi
	add	%sp,$bias+$frame+$locals,$tp
	add	$tp,$num,$ap_l
	add	$ap_l,$num,$ap_l	! [an]p_[lh] point at the vectors' ends !
	add	$ap_l,$num,$ap_h
	add	$ap_h,$num,$np_l
	add	$np_l,$num,$np_h

	wr	%g0,$ASI_FL16_P,%asi	! setup %asi for 16-bit FP loads

	add	$rp,$num,$rp		! readjust input pointers to point
	add	$ap,$num,$ap		! at the ends too...
	add	$bp,$num,$bp
	add	$np,$num,$np

	stx	%o7,[%sp+$bias+$frame+48]	! save %asi

	sub	%g0,$num,$i		! i=-num
	sub	%g0,$num,$j		! j=-num

	add	$ap,$j,%o3
	add	$bp,$i,%o4

	ld	[%o3+4],%g1		! bp[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5		! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	add	$np,$j,%o5

	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
	mulx	$n0,%o0,%o0		! ap[0]*bp[0]*n0
	stx	%o0,[%sp+$bias+$frame+0]

	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o3+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,$ba
	fxtod	$alo,$alo
	ldda	[%o4+0]%asi,$bb
	fxtod	$ahi,$ahi
	ldda	[%o4+6]%asi,$bc
	fxtod	$nlo,$nlo
	ldda	[%o4+4]%asi,$bd
	fxtod	$nhi,$nhi

	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
	ldda	[%sp+$bias+$frame+6]%asi,$na
	fxtod	$ba,$ba
	ldda	[%sp+$bias+$frame+4]%asi,$nb
	fxtod	$bb,$bb
	ldda	[%sp+$bias+$frame+2]%asi,$nc
	fxtod	$bc,$bc
	ldda	[%sp+$bias+$frame+0]%asi,$nd
	fxtod	$bd,$bd

	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
	fxtod	$na,$na
	std	$ahi,[$ap_h+$j]
	fxtod	$nb,$nb
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
	fxtod	$nc,$nc
	std	$nhi,[$np_h+$j]
	fxtod	$nd,$nd

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	add	$j,8,$j
	std	$nlob,[%sp+$bias+$frame+8]
	add	$ap,$j,%o4
	std	$nloc,[%sp+$bias+$frame+16]
	add	$np,$j,%o5
	std	$nlod,[%sp+$bias+$frame+24]

	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
	fxtod	$ahi,$ahi
	fxtod	$nlo,$nlo
	fxtod	$nhi,$nhi

	ldx	[%sp+$bias+$frame+0],%o0
	fmuld	$alo,$ba,$aloa
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$nlo,$na,$nloa
	ldx	[%sp+$bias+$frame+16],%o2
	fmuld	$alo,$bb,$alob
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$nlo,$nb,$nlob

	srlx	%o0,16,%o7
	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
	fmuld	$alo,$bc,$aloc
	add	%o7,%o1,%o1
	std	$ahi,[$ap_h+$j]
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	srlx	%o1,16,%o7
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
	fmuld	$alo,$bd,$alod
	add	%o7,%o2,%o2
	std	$nhi,[$np_h+$j]
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	srlx	%o2,16,%o7
	fmuld	$ahi,$ba,$ahia
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	!and	%o0,$mask,%o0
	!and	%o1,$mask,%o1
	!and	%o2,$mask,%o2
	!sllx	%o1,16,%o1
	!sllx	%o2,32,%o2
	!sllx	%o3,48,%o7
	!or	%o1,%o0,%o0
	!or	%o2,%o0,%o0
	!or	%o7,%o0,%o0		! 64-bit result
	srlx	%o3,16,%g1		! 34-bit carry
	fmuld	$ahi,$bb,$ahib

	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$dota,$nloa,$nloa
	faddd	$dotb,$nlob,$nlob
	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	addcc	$j,8,$j
	std	$nloc,[%sp+$bias+$frame+16]
	bz,pn	%icc,.L1stskip
	std	$nlod,[%sp+$bias+$frame+24]

.align	32			! incidentally already aligned !
.L1st:
	add	$ap,$j,%o4
	add	$np,$j,%o5
	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
	fxtod	$ahi,$ahi
	fxtod	$nlo,$nlo
	fxtod	$nhi,$nhi

	ldx	[%sp+$bias+$frame+0],%o0
	fmuld	$alo,$ba,$aloa
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$nlo,$na,$nloa
	ldx	[%sp+$bias+$frame+16],%o2
	fmuld	$alo,$bb,$alob
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$nlo,$nb,$nlob

	srlx	%o0,16,%o7
	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
	fmuld	$alo,$bc,$aloc
	add	%o7,%o1,%o1
	std	$ahi,[$ap_h+$j]
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	srlx	%o1,16,%o7
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
	fmuld	$alo,$bd,$alod
	add	%o7,%o2,%o2
	std	$nhi,[$np_h+$j]
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	srlx	%o2,16,%o7
	fmuld	$ahi,$ba,$ahia
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	fmuld	$ahi,$bb,$ahib
	sllx	%o1,16,%o1
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	sllx	%o2,32,%o2
	fmuld	$ahi,$bc,$ahic
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	or	%o2,%o0,%o0
	fmuld	$ahi,$bd,$ahid
	or	%o7,%o0,%o0		! 64-bit result
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	addcc	%g1,%o0,%o0
	faddd	$dota,$nloa,$nloa
	srlx	%o3,16,%g1		! 34-bit carry
	faddd	$dotb,$nlob,$nlob
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]=

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	std	$nlod,[%sp+$bias+$frame+24]

	addcc	$j,8,$j
	bnz,pt	%icc,.L1st
	add	$tp,8,$tp

.L1stskip:
	fdtox	$dota,$dota
	fdtox	$dotb,$dotb

	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	std	$dota,[%sp+$bias+$frame+32]
	add	%o7,%o1,%o1
	std	$dotb,[%sp+$bias+$frame+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+$bias+$frame+32],%o4
	addcc	%g1,%o0,%o0
	ldx	[%sp+$bias+$frame+40],%o5
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]=
	add	$tp,8,$tp

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,$mask,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	mov	%g1,$carry
	stx	%o4,[$tp]		! tp[num-1]=

	ba	.Louter
	add	$i,8,$i
.align	32
.Louter:
	sub	%g0,$num,$j		! j=-num
	add	%sp,$bias+$frame+$locals,$tp

	add	$ap,$j,%o3
	add	$bp,$i,%o4

	ld	[%o3+4],%g1		! bp[i]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5		! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	ldx	[$tp],%o2		! tp[0]
	mulx	%o1,%o0,%o0
	addcc	%o2,%o0,%o0
	mulx	$n0,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
	stx	%o0,[%sp+$bias+$frame+0]

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,$ba
	ldda	[%o4+0]%asi,$bb
	ldda	[%o4+6]%asi,$bc
	ldda	[%o4+4]%asi,$bd

	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
	ldda	[%sp+$bias+$frame+6]%asi,$na
	fxtod	$ba,$ba
	ldda	[%sp+$bias+$frame+4]%asi,$nb
	fxtod	$bb,$bb
	ldda	[%sp+$bias+$frame+2]%asi,$nc
	fxtod	$bc,$bc
	ldda	[%sp+$bias+$frame+0]%asi,$nd
	fxtod	$bd,$bd
	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	fxtod	$na,$na
	ldd	[$ap_h+$j],$ahi
	fxtod	$nb,$nb
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	fxtod	$nc,$nc
	ldd	[$np_h+$j],$nhi
	fxtod	$nd,$nd

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	add	$j,8,$j
	std	$nlod,[%sp+$bias+$frame+24]

	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	ldd	[$np_h+$j],$nhi

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	ldx	[%sp+$bias+$frame+0],%o0
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$alo,$bd,$alod
	ldx	[%sp+$bias+$frame+16],%o2
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$ahi,$ba,$ahia

	srlx	%o0,16,%o7
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	add	%o7,%o1,%o1
	fmuld	$ahi,$bb,$ahib
	srlx	%o1,16,%o7
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	add	%o7,%o2,%o2
	fmuld	$ahi,$bc,$ahic
	srlx	%o2,16,%o7
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	! why?
	and	%o0,$mask,%o0
	fmuld	$ahi,$bd,$ahid
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	sllx	%o1,16,%o1
	faddd	$dota,$nloa,$nloa
	sllx	%o2,32,%o2
	faddd	$dotb,$nlob,$nlob
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahic,$nhic,$dota	! $nhic
	or	%o2,%o0,%o0
	faddd	$ahid,$nhid,$dotb	! $nhid
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[$tp],%o7
	faddd	$nloc,$nhia,$nloc
	addcc	%o7,%o0,%o0
	! end-of-why?
	faddd	$nlod,$nhib,$nlod
	srlx	%o3,16,%g1		! 34-bit carry
	fdtox	$nloa,$nloa
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	addcc	$j,8,$j
	std	$nloc,[%sp+$bias+$frame+16]
	bz,pn	%icc,.Linnerskip
	std	$nlod,[%sp+$bias+$frame+24]

	ba	.Linner
	nop
.align	32
.Linner:
	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	ldd	[$np_h+$j],$nhi

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	ldx	[%sp+$bias+$frame+0],%o0
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$alo,$bd,$alod
	ldx	[%sp+$bias+$frame+16],%o2
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$ahi,$ba,$ahia

	srlx	%o0,16,%o7
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	add	%o7,%o1,%o1
	fmuld	$ahi,$bb,$ahib
	srlx	%o1,16,%o7
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	add	%o7,%o2,%o2
	fmuld	$ahi,$bc,$ahic
	srlx	%o2,16,%o7
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	fmuld	$ahi,$bd,$ahid
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	sllx	%o1,16,%o1
	faddd	$dota,$nloa,$nloa
	sllx	%o2,32,%o2
	faddd	$dotb,$nlob,$nlob
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahic,$nhic,$dota	! $nhic
	or	%o2,%o0,%o0
	faddd	$ahid,$nhid,$dotb	! $nhid
	or	%o7,%o0,%o0		! 64-bit result
	faddd	$nloc,$nhia,$nloc
	addcc	%g1,%o0,%o0
	ldx	[$tp+8],%o7		! tp[j]
	faddd	$nlod,$nhib,$nlod
	srlx	%o3,16,%g1		! 34-bit carry
	fdtox	$nloa,$nloa
	bcs,a	%xcc,.+8
	add	%g1,1,%g1
	fdtox	$nlob,$nlob
	addcc	%o7,%o0,%o0
	fdtox	$nloc,$nloc
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	addcc	$j,8,$j
	std	$nlod,[%sp+$bias+$frame+24]
	bnz,pt	%icc,.Linner
	add	$tp,8,$tp

.Linnerskip:
	fdtox	$dota,$dota
	fdtox	$dotb,$dotb

	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	std	$dota,[%sp+$bias+$frame+32]
	add	%o7,%o1,%o1
	std	$dotb,[%sp+$bias+$frame+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	ldx	[%sp+$bias+$frame+32],%o4
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+$bias+$frame+40],%o5
	addcc	%g1,%o0,%o0
	ldx	[$tp+8],%o7		! tp[j]
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	%o7,%o0,%o0
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]
	add	$tp,8,$tp

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,$mask,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	$carry,%o4,%o4
	stx	%o4,[$tp]		! tp[num-1]
	mov	%g1,$carry
	bcs,a	%xcc,.+8
	add	$carry,1,$carry

	addcc	$i,8,$i
	bnz	%icc,.Louter
	nop

	add	$tp,8,$tp		! adjust tp to point at the end
	orn	%g0,%g0,%g4
	sub	%g0,$num,%o7		! n=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c

.align	32
.Lsub:
	ldx	[$tp+%o7],%o0
	add	$np,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	srlx	%o0,32,%o1
	subccc	%o0,%o2,%o2
	add	$rp,%o7,%g1
	subccc	%o1,%o3,%o3
	st	%o2,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lsub
	st	%o3,[%g1+4]
	subc	$carry,0,%g4
	sub	%g0,$num,%o7		! n=-num
	ba	.Lcopy
	nop

.align	32
.Lcopy:
	ldx	[$tp+%o7],%o0
	add	$rp,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	stx	%g0,[$tp+%o7]
	and	%o0,%g4,%o0
	srlx	%o0,32,%o1
	andn	%o2,%g4,%o2
	andn	%o3,%g4,%o3
	or	%o2,%o0,%o0
	or	%o3,%o1,%o1
	st	%o0,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lcopy
	st	%o1,[%g1+4]
	sub	%g0,$num,%o7		! n=-num

.Lzap:
	stx	%g0,[$ap_l+%o7]
	stx	%g0,[$ap_h+%o7]
	stx	%g0,[$np_l+%o7]
	stx	%g0,[$np_h+%o7]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lzap
	nop

	ldx	[%sp+$bias+$frame+48],%o7
	wr	%g0,%o7,%asi		! restore %asi

	mov	1,%i0
.Lret:
	ret
	restore
.type   $fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Below substitution makes it possible to compile without demanding
# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
# dare to do this, because VIS capability is detected at run-time now
# and this routine is not called on CPU not capable to execute it. Do
# note that fzeros is not the only VIS dependency! Another dependency
# is implicit and is just _a_ numerical value loaded to %asi register,
# which assembler can't recognize as VIS specific...
$code =~ s/fzeros\s+%f([0-9]+)/
	sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
	/gem;

print $code;
# flush
close STDOUT or die "error closing STDOUT: $!";