#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x than 32-bit code. X[16] resides on stack, but access to it
# is scheduled for L2 latency and staged through 32 least significant
# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
# good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4 times physical cores threads and that it leaves gcc
# [3.4] behind by over 4x factor! If compared to SHA256, single thread
# performance is only 10% better, but overall throughput for maximum
# amount of threads for given CPU exceeds corresponding one of SHA256
# by 30% [again, optimal coefficient is 50%].
#
# (*)	Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
#	in-order, i.e. load instruction has to complete prior next
#	instruction in given thread is executed, even if the latter is
#	not dependent on load result! This means that on T1 two 32-bit
#	loads are always slower than one 64-bit load. Once again this
#	is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
#	2x32-bit loads can be as fast as 1x64-bit ones.

# Usage: perl <this script> <output file> [compiler flags...]
#
# The first argument is the output file name. A name containing "512"
# selects the SHA512 flavour, anything else selects SHA256. A -m64 or
# -xarch=v9 flag anywhere on the command line selects the 64-bit ABI.

$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# $bias and $frame are the %sp offset components used below both in the
# "save" instruction and when addressing the on-stack X[16] area.
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

$output=shift;
# Redirect STDOUT to the requested file and fail loudly if that is not
# possible. With no file name (or "-") output keeps going to the
# inherited STDOUT, which is what the former unchecked two-argument
# open effectively did.
if (defined($output) && length($output) && $output ne '-') {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}

if ($output =~ /512/) {
	$label="512";
	$SZ=8;
	$LD="ldx";		# load from memory
	$ST="stx";		# store to memory
	$SLL="sllx";		# shift left logical
	$SRL="srlx";		# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;		# low 12 bits of the last K512 entry
	$rounds=80;
	$align=4;

	$locals=16*$SZ;		# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;
	$LD="ld";		# load from memory
	$ST="st";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;		# low 12 bits of the last K256 entry
	$rounds=64;
	$align=8;

	$locals=0;		# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";

########### SHA256
# Emits the message-word fetch for round $i (0..15) and leaves h+X[i]
# in $T1. For $i==0 the whole block is loaded with eight ldx into @X;
# if the input pointer was misaligned ($tmp31 non-zero) the registers
# are re-aligned with shift/or pairs before falling into .Laligned.
# Two 32-bit words share each @X register: even $i takes the upper
# half (srlx by 32), odd $i uses the register as is.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
for my $j (0..6)
{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    if ($i&1) {
	$code.="\tadd	@X[$i/2],$h,$T1\n";
    } else {
	$code.="\tsrlx	@X[$i/2],32,$T1\n\tadd	$h,$T1,$T1\n";
    }
} if ($SZ==4);

########### SHA512
# Emits the message-word fetch for round $i (0..15). Input is read
# with 32-bit loads into %l0-%l7; each 64-bit X[i] is assembled from
# three consecutive halves with shifts by $tmp31/$tmp32 (input
# misalignment in bits), stored to its X[16] slot on the stack, and
# h+X[i] is left in $T1. Round 15 additionally preloads the %l
# registers with the stack words the first Xupdate round reads.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
# register names holding the two halves of X[i] and the next high half
my @pair=("%l".(($i*2)%8),"%l".(($i*2)%8+1),"%l".((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	ld	[$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
___
$code.=<<___ if ($i==12);
	brnz,a	$tmp31,.+8
	ld	[$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+`$bias+$frame+$i*$SZ`]
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);

########### common
# Emits one SHA round. Arguments are the round number followed by the
# register names currently playing a..h. For $i<16 the flavour-specific
# $Xload produces T1=h+X[i]; for later rounds the caller has left the
# new X[i] in $T1 and only h is added. The round then computes
# Sigma1(e), Ch(e,f,g), Sigma0(a), Maj(a,b,c), adds K[$i] (left in
# $tmp2, which the caller's loop test inspects) and updates d and h.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
	$Xload->(@_);
    } else {
	$code.="\tadd	$h,$T1,$T1\n";
    }

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}

########### SHA256
# Emits the message schedule update for round $i>=16 followed by the
# round itself. X[] lives in @X, two 32-bit words per register, so the
# word wanted is either the low half (register used directly) or the
# high half (extracted into $tmp32 with srlx), depending on parity of
# $i. The new word ends up in $T1 and is merged back into its @X slot.
$BODY_16_XX = sub {
my $i=$_[0];
my $xi;

    if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+1)/2)%8],32,$xi\n";
    } else {
	$xi=$X[(($i+1)/2)%8];
    }
$code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    if ($i&1) {
	$xi=$X[(($i+14)/2)%8];
    } else {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+14)/2)%8],32,$xi\n";
    }
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    if ($i&1) {
	$xi=$X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=$X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    BODY_00_15(@_);
} if ($SZ==4);

########### SHA512
# Emits the message schedule update for round $i>=16 followed by the
# round itself. X[16] lives on the stack; the halves needed for this
# round are already staged in %l0-%l7 and are reloaded here for the
# next round while sigma0/sigma1 are being computed. The new X[i] is
# stored back to its stack slot and left in $T1.
$BODY_16_XX = sub {
my $i=$_[0];
my @pair=("%l".(($i*2)%8),"%l".(($i*2)%8+1));

$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
___
    BODY_00_15(@_);
} if ($SZ==8);

$code.=<<___ if ($bits==64);
.register	%g2,#scratch
.register	%g3,#scratch
___
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
# Function prologue: allocate the frame, split $inp into an aligned
# pointer plus a bit offset ($tmp31), and turn $len (block count) into
# the end-of-input pointer.
$code.=<<___;
.size	K${label},.-K${label}
.globl	sha${label}_block_data_order
sha${label}_block_data_order:
	save	%sp,`-$frame-$locals`,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
$code.=<<___ if ($SZ==8); # SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic:	call	.+8
	add	%o7,K${label}-.Lpic,$Ktbl

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
# Rounds 0..15 are unrolled once; rounds 16..31 form the body of the
# .L16_xx loop, which repeats (advancing $Ktbl by 16 entries) until the
# K value just consumed matches $lastK in its low 12 bits. @V is
# rotated after every round so register names follow the a..h roles.
for my $i (0..15)	{ BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for my $i (16..31)	{ $BODY_16_XX->($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16

___
$code.=<<___ if ($SZ==4); # SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8); # SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	`$bits==64?"%xcc":"%icc"`,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Evaluate every `...` fragment left in the generated text as Perl and
# substitute its value (offsets, shift counts, conditional loads).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Buffered write errors only surface at close time, so check it.
close(STDOUT) or die "error closing STDOUT: $!";