1#!/usr/bin/env perl 2# 3# ==================================================================== 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9# 10# SHA256 block transform for x86. September 2007. 11# 12# Performance improvement over compiler generated code varies from 13# 10% to 40% [see below]. Not very impressive on some µ-archs, but 14# it's 5 times smaller and optimizies amount of writes. 15# 16# May 2012. 17# 18# Optimization including two of Pavel Semjanov's ideas, alternative 19# Maj and full unroll, resulted in ~20-25% improvement on most CPUs, 20# ~7% on Pentium, ~40% on Atom. As fully unrolled loop body is almost 21# 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not 22# on P4, where it kills performance, nor Sandy Bridge, where folded 23# loop is approximately as fast... 24# 25# June 2012. 26# 27# Add AMD XOP-specific code path, >30% improvement on Bulldozer over 28# May version, >60% over original. Add AVX+shrd code path, >25% 29# improvement on Sandy Bridge over May version, 60% over original. 30# 31# May 2013. 32# 33# Replace AMD XOP code path with SSSE3 to cover more processors. 34# (Biggest improvement coefficient is on upcoming Atom Silvermont, 35# not shown.) Add AVX+BMI code path. 36# 37# March 2014. 38# 39# Add support for Intel SHA Extensions. 40# 41# Performance in clock cycles per processed byte (less is better): 42# 43# gcc icc x86 asm(*) SIMD x86_64 asm(**) 44# Pentium 46 57 40/38 - - 45# PIII 36 33 27/24 - - 46# P4 41 38 28 - 17.3 47# AMD K8 27 25 19/15.5 - 14.9 48# Core2 26 23 18/15.6 14.3 13.8 49# Westmere 27 - 19/15.7 13.4 12.3 50# Sandy Bridge 25 - 15.9 12.4 11.6 51# Ivy Bridge 24 - 15.0 11.4 10.3 52# Haswell 22 - 13.9 9.46 7.80 53# Skylake 20 - 14.9 9.50 7.70 54# Bulldozer 36 - 27/22 17.0 13.6 55# VIA Nano 36 - 25/22 16.8 16.5 56# Atom 50 - 30/25 21.9 18.9 57# Silvermont 40 - 34/31 22.9 20.6 58# Goldmont 29 - 20 16.3(***) 59# 60# (*) numbers after slash are for unrolled loop, where applicable; 61# (**) x86_64 assembly performance is presented for reference 62# purposes, results are best-available; 63# (***) SHAEXT result is 4.1, strangely enough better than 64-bit one; 64 65$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 66push(@INC,"${dir}","${dir}../../../perlasm"); 67require "x86asm.pl"; 68 69$output=pop; 70open STDOUT,">$output"; 71 72&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); 73 74$xmm=$avx=0; 75for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } 76 77# In upstream, this is controlled by shelling out to the compiler to check 78# versions, but BoringSSL is intended to be used with pre-generated perlasm 79# output, so this isn't useful anyway. 80# 81# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. 82$avx = 1; 83 84$avx = 0 unless ($xmm); 85 86$shaext=$xmm; ### set to zero if compiling for 1.0.1 87 88# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's 89# been tested. 90$shaext = 0; 91 92$unroll_after = 64*4; # If pre-evicted from L1P cache first spin of 93 # fully unrolled loop was measured to run about 94 # 3-4x slower. If slowdown coefficient is N and 95 # unrolled loop is m times faster, then you break 96 # even at (N-1)/(m-1) blocks. Then it needs to be 97 # adjusted for probability of code being evicted, 98 # code size/cache size=1/4. Typical m is 1.15... 99 100$A="eax"; 101$E="edx"; 102$T="ebx"; 103$Aoff=&DWP(4,"esp"); 104$Boff=&DWP(8,"esp"); 105$Coff=&DWP(12,"esp"); 106$Doff=&DWP(16,"esp"); 107$Eoff=&DWP(20,"esp"); 108$Foff=&DWP(24,"esp"); 109$Goff=&DWP(28,"esp"); 110$Hoff=&DWP(32,"esp"); 111$Xoff=&DWP(36,"esp"); 112$K256="ebp"; 113 114sub BODY_16_63() { 115 &mov ($T,"ecx"); # "ecx" is preloaded 116 &mov ("esi",&DWP(4*(9+15+16-14),"esp")); 117 &ror ("ecx",18-7); 118 &mov ("edi","esi"); 119 &ror ("esi",19-17); 120 &xor ("ecx",$T); 121 &shr ($T,3); 122 &ror ("ecx",7); 123 &xor ("esi","edi"); 124 &xor ($T,"ecx"); # T = sigma0(X[-15]) 125 &ror ("esi",17); 126 &add ($T,&DWP(4*(9+15+16),"esp")); # T += X[-16] 127 &shr ("edi",10); 128 &add ($T,&DWP(4*(9+15+16-9),"esp")); # T += X[-7] 129 #&xor ("edi","esi") # sigma1(X[-2]) 130 # &add ($T,"edi"); # T += sigma1(X[-2]) 131 # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] 132 133 &BODY_00_15(1); 134} 135sub BODY_00_15() { 136 my $in_16_63=shift; 137 138 &mov ("ecx",$E); 139 &xor ("edi","esi") if ($in_16_63); # sigma1(X[-2]) 140 &mov ("esi",$Foff); 141 &ror ("ecx",25-11); 142 &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2]) 143 &mov ("edi",$Goff); 144 &xor ("ecx",$E); 145 &xor ("esi","edi"); 146 &mov ($T,&DWP(4*(9+15),"esp")) if (!$in_16_63); 147 &mov (&DWP(4*(9+15),"esp"),$T) if ($in_16_63); # save X[0] 148 &ror ("ecx",11-6); 149 &and ("esi",$E); 150 &mov ($Eoff,$E); # modulo-scheduled 151 &xor ($E,"ecx"); 152 &add ($T,$Hoff); # T += h 153 &xor ("esi","edi"); # Ch(e,f,g) 154 &ror ($E,6); # Sigma1(e) 155 &mov ("ecx",$A); 156 &add ($T,"esi"); # T += Ch(e,f,g) 157 158 &ror ("ecx",22-13); 159 &add ($T,$E); # T += Sigma1(e) 160 &mov ("edi",$Boff); 161 &xor ("ecx",$A); 162 &mov ($Aoff,$A); # modulo-scheduled 163 &lea ("esp",&DWP(-4,"esp")); 164 &ror ("ecx",13-2); 165 &mov ("esi",&DWP(0,$K256)); 166 &xor ("ecx",$A); 167 &mov ($E,$Eoff); # e in next iteration, d in this one 168 &xor ($A,"edi"); # a ^= b 169 &ror ("ecx",2); # Sigma0(a) 170 171 &add ($T,"esi"); # T+= K[i] 172 &mov (&DWP(0,"esp"),$A); # (b^c) in next round 173 &add ($E,$T); # d += T 174 &and ($A,&DWP(4,"esp")); # a &= (b^c) 175 &add ($T,"ecx"); # T += Sigma0(a) 176 &xor ($A,"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) 177 &mov ("ecx",&DWP(4*(9+15+16-1),"esp")) if ($in_16_63); # preload T 178 &add ($K256,4); 179 &add ($A,$T); # h += T 180} 181 182&external_label("OPENSSL_ia32cap_P") if (!$i386); 183 184&function_begin("sha256_block_data_order"); 185 &mov ("esi",wparam(0)); # ctx 186 &mov ("edi",wparam(1)); # inp 187 &mov ("eax",wparam(2)); # num 188 &mov ("ebx","esp"); # saved sp 189 190 &call (&label("pic_point")); # make it PIC! 191&set_label("pic_point"); 192 &blindpop($K256); 193 &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256)); 194 195 &sub ("esp",16); 196 &and ("esp",-64); 197 198 &shl ("eax",6); 199 &add ("eax","edi"); 200 &mov (&DWP(0,"esp"),"esi"); # ctx 201 &mov (&DWP(4,"esp"),"edi"); # inp 202 &mov (&DWP(8,"esp"),"eax"); # inp+num*128 203 &mov (&DWP(12,"esp"),"ebx"); # saved sp 204 if (!$i386 && $xmm) { 205 &picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256")); 206 &mov ("ecx",&DWP(0,"edx")); 207 &mov ("ebx",&DWP(4,"edx")); 208 &test ("ecx",1<<20); # check for P4 209 &jnz (&label("loop")); 210 &mov ("edx",&DWP(8,"edx")) if ($xmm); 211 &test ("ecx",1<<24); # check for FXSR 212 &jz ($unroll_after?&label("no_xmm"):&label("loop")); 213 &and ("ecx",1<<30); # mask "Intel CPU" bit 214 &and ("ebx",1<<28|1<<9); # mask AVX and SSSE3 bits 215 &test ("edx",1<<29) if ($shaext); # check for SHA 216 &jnz (&label("shaext")) if ($shaext); 217 &or ("ecx","ebx"); 218 &and ("ecx",1<<28|1<<30); 219 &cmp ("ecx",1<<28|1<<30); 220 if ($xmm) { 221 &je (&label("AVX")) if ($avx); 222 &test ("ebx",1<<9); # check for SSSE3 223 &jnz (&label("SSSE3")); 224 } else { 225 &je (&label("loop_shrd")); 226 } 227 if ($unroll_after) { 228&set_label("no_xmm"); 229 &sub ("eax","edi"); 230 &cmp ("eax",$unroll_after); 231 &jae (&label("unrolled")); 232 } } 233 &jmp (&label("loop")); 234 235sub COMPACT_LOOP() { 236my $suffix=shift; 237 238&set_label("loop$suffix",$suffix?32:16); 239 # copy input block to stack reversing byte and dword order 240 for($i=0;$i<4;$i++) { 241 &mov ("eax",&DWP($i*16+0,"edi")); 242 &mov ("ebx",&DWP($i*16+4,"edi")); 243 &mov ("ecx",&DWP($i*16+8,"edi")); 244 &bswap ("eax"); 245 &mov ("edx",&DWP($i*16+12,"edi")); 246 &bswap ("ebx"); 247 &push ("eax"); 248 &bswap ("ecx"); 249 &push ("ebx"); 250 &bswap ("edx"); 251 &push ("ecx"); 252 &push ("edx"); 253 } 254 &add ("edi",64); 255 &lea ("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H 256 &mov (&DWP(4*(9+16)+4,"esp"),"edi"); 257 258 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 259 &mov ($A,&DWP(0,"esi")); 260 &mov ("ebx",&DWP(4,"esi")); 261 &mov ("ecx",&DWP(8,"esi")); 262 &mov ("edi",&DWP(12,"esi")); 263 # &mov ($Aoff,$A); 264 &mov ($Boff,"ebx"); 265 &xor ("ebx","ecx"); 266 &mov ($Coff,"ecx"); 267 &mov ($Doff,"edi"); 268 &mov (&DWP(0,"esp"),"ebx"); # magic 269 &mov ($E,&DWP(16,"esi")); 270 &mov ("ebx",&DWP(20,"esi")); 271 &mov ("ecx",&DWP(24,"esi")); 272 &mov ("edi",&DWP(28,"esi")); 273 # &mov ($Eoff,$E); 274 &mov ($Foff,"ebx"); 275 &mov ($Goff,"ecx"); 276 &mov ($Hoff,"edi"); 277 278&set_label("00_15$suffix",16); 279 280 &BODY_00_15(); 281 282 &cmp ("esi",0xc19bf174); 283 &jne (&label("00_15$suffix")); 284 285 &mov ("ecx",&DWP(4*(9+15+16-1),"esp")); # preloaded in BODY_00_15(1) 286 &jmp (&label("16_63$suffix")); 287 288&set_label("16_63$suffix",16); 289 290 &BODY_16_63(); 291 292 &cmp ("esi",0xc67178f2); 293 &jne (&label("16_63$suffix")); 294 295 &mov ("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx 296 # &mov ($A,$Aoff); 297 &mov ("ebx",$Boff); 298 # &mov ("edi",$Coff); 299 &mov ("ecx",$Doff); 300 &add ($A,&DWP(0,"esi")); 301 &add ("ebx",&DWP(4,"esi")); 302 &add ("edi",&DWP(8,"esi")); 303 &add ("ecx",&DWP(12,"esi")); 304 &mov (&DWP(0,"esi"),$A); 305 &mov (&DWP(4,"esi"),"ebx"); 306 &mov (&DWP(8,"esi"),"edi"); 307 &mov (&DWP(12,"esi"),"ecx"); 308 # &mov ($E,$Eoff); 309 &mov ("eax",$Foff); 310 &mov ("ebx",$Goff); 311 &mov ("ecx",$Hoff); 312 &mov ("edi",&DWP(4*(9+16+64)+4,"esp"));#inp 313 &add ($E,&DWP(16,"esi")); 314 &add ("eax",&DWP(20,"esi")); 315 &add ("ebx",&DWP(24,"esi")); 316 &add ("ecx",&DWP(28,"esi")); 317 &mov (&DWP(16,"esi"),$E); 318 &mov (&DWP(20,"esi"),"eax"); 319 &mov (&DWP(24,"esi"),"ebx"); 320 &mov (&DWP(28,"esi"),"ecx"); 321 322 &lea ("esp",&DWP(4*(9+16+64),"esp"));# destroy frame 323 &sub ($K256,4*64); # rewind K 324 325 &cmp ("edi",&DWP(8,"esp")); # are we done yet? 326 &jb (&label("loop$suffix")); 327} 328 &COMPACT_LOOP(); 329 &mov ("esp",&DWP(12,"esp")); # restore sp 330&function_end_A(); 331 if (!$i386 && !$xmm) { 332 # ~20% improvement on Sandy Bridge 333 local *ror = sub { &shrd(@_[0],@_) }; 334 &COMPACT_LOOP("_shrd"); 335 &mov ("esp",&DWP(12,"esp")); # restore sp 336&function_end_A(); 337 } 338 339&set_label("K256",64); # Yes! I keep it in the code segment! 340@K256=( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, 341 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, 342 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, 343 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, 344 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, 345 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, 346 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, 347 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, 348 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, 349 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, 350 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, 351 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, 352 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, 353 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, 354 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, 355 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); 356&data_word(@K256); 357&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # byte swap mask 358&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>"); 359 360($a,$b,$c,$d,$e,$f,$g,$h)=(0..7); # offsets 361sub off { &DWP(4*(((shift)-$i)&7),"esp"); } 362 363if (!$i386 && $unroll_after) { 364my @AH=($A,$K256); 365 366&set_label("unrolled",16); 367 &lea ("esp",&DWP(-96,"esp")); 368 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 369 &mov ($AH[0],&DWP(0,"esi")); 370 &mov ($AH[1],&DWP(4,"esi")); 371 &mov ("ecx",&DWP(8,"esi")); 372 &mov ("ebx",&DWP(12,"esi")); 373 #&mov (&DWP(0,"esp"),$AH[0]); 374 &mov (&DWP(4,"esp"),$AH[1]); 375 &xor ($AH[1],"ecx"); # magic 376 &mov (&DWP(8,"esp"),"ecx"); 377 &mov (&DWP(12,"esp"),"ebx"); 378 &mov ($E,&DWP(16,"esi")); 379 &mov ("ebx",&DWP(20,"esi")); 380 &mov ("ecx",&DWP(24,"esi")); 381 &mov ("esi",&DWP(28,"esi")); 382 #&mov (&DWP(16,"esp"),$E); 383 &mov (&DWP(20,"esp"),"ebx"); 384 &mov (&DWP(24,"esp"),"ecx"); 385 &mov (&DWP(28,"esp"),"esi"); 386 &jmp (&label("grand_loop")); 387 388&set_label("grand_loop",16); 389 # copy input block to stack reversing byte order 390 for($i=0;$i<5;$i++) { 391 &mov ("ebx",&DWP(12*$i+0,"edi")); 392 &mov ("ecx",&DWP(12*$i+4,"edi")); 393 &bswap ("ebx"); 394 &mov ("esi",&DWP(12*$i+8,"edi")); 395 &bswap ("ecx"); 396 &mov (&DWP(32+12*$i+0,"esp"),"ebx"); 397 &bswap ("esi"); 398 &mov (&DWP(32+12*$i+4,"esp"),"ecx"); 399 &mov (&DWP(32+12*$i+8,"esp"),"esi"); 400 } 401 &mov ("ebx",&DWP($i*12,"edi")); 402 &add ("edi",64); 403 &bswap ("ebx"); 404 &mov (&DWP(96+4,"esp"),"edi"); 405 &mov (&DWP(32+12*$i,"esp"),"ebx"); 406 407 my ($t1,$t2) = ("ecx","esi"); 408 409 for ($i=0;$i<64;$i++) { 410 411 if ($i>=16) { 412 &mov ($T,$t1); # $t1 is preloaded 413 # &mov ($t2,&DWP(32+4*(($i+14)&15),"esp")); 414 &ror ($t1,18-7); 415 &mov ("edi",$t2); 416 &ror ($t2,19-17); 417 &xor ($t1,$T); 418 &shr ($T,3); 419 &ror ($t1,7); 420 &xor ($t2,"edi"); 421 &xor ($T,$t1); # T = sigma0(X[-15]) 422 &ror ($t2,17); 423 &add ($T,&DWP(32+4*($i&15),"esp")); # T += X[-16] 424 &shr ("edi",10); 425 &add ($T,&DWP(32+4*(($i+9)&15),"esp")); # T += X[-7] 426 #&xor ("edi",$t2) # sigma1(X[-2]) 427 # &add ($T,"edi"); # T += sigma1(X[-2]) 428 # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] 429 } 430 &mov ($t1,$E); 431 &xor ("edi",$t2) if ($i>=16); # sigma1(X[-2]) 432 &mov ($t2,&off($f)); 433 &ror ($E,25-11); 434 &add ($T,"edi") if ($i>=16); # T += sigma1(X[-2]) 435 &mov ("edi",&off($g)); 436 &xor ($E,$t1); 437 &mov ($T,&DWP(32+4*($i&15),"esp")) if ($i<16); # X[i] 438 &mov (&DWP(32+4*($i&15),"esp"),$T) if ($i>=16 && $i<62); # save X[0] 439 &xor ($t2,"edi"); 440 &ror ($E,11-6); 441 &and ($t2,$t1); 442 &mov (&off($e),$t1); # save $E, modulo-scheduled 443 &xor ($E,$t1); 444 &add ($T,&off($h)); # T += h 445 &xor ("edi",$t2); # Ch(e,f,g) 446 &ror ($E,6); # Sigma1(e) 447 &mov ($t1,$AH[0]); 448 &add ($T,"edi"); # T += Ch(e,f,g) 449 450 &ror ($t1,22-13); 451 &mov ($t2,$AH[0]); 452 &mov ("edi",&off($b)); 453 &xor ($t1,$AH[0]); 454 &mov (&off($a),$AH[0]); # save $A, modulo-scheduled 455 &xor ($AH[0],"edi"); # a ^= b, (b^c) in next round 456 &ror ($t1,13-2); 457 &and ($AH[1],$AH[0]); # (b^c) &= (a^b) 458 &lea ($E,&DWP(@K256[$i],$T,$E)); # T += Sigma1(1)+K[i] 459 &xor ($t1,$t2); 460 &xor ($AH[1],"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) 461 &mov ($t2,&DWP(32+4*(($i+2)&15),"esp")) if ($i>=15 && $i<63); 462 &ror ($t1,2); # Sigma0(a) 463 464 &add ($AH[1],$E); # h += T 465 &add ($E,&off($d)); # d += T 466 &add ($AH[1],$t1); # h += Sigma0(a) 467 &mov ($t1,&DWP(32+4*(($i+15)&15),"esp")) if ($i>=15 && $i<63); 468 469 @AH = reverse(@AH); # rotate(a,h) 470 ($t1,$t2) = ($t2,$t1); # rotate(t1,t2) 471 } 472 &mov ("esi",&DWP(96,"esp")); #ctx 473 #&mov ($AH[0],&DWP(0,"esp")); 474 &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); 475 #&mov ("edi", &DWP(8,"esp")); 476 &mov ("ecx",&DWP(12,"esp")); 477 &add ($AH[0],&DWP(0,"esi")); 478 &add ($AH[1],&DWP(4,"esi")); 479 &add ("edi",&DWP(8,"esi")); 480 &add ("ecx",&DWP(12,"esi")); 481 &mov (&DWP(0,"esi"),$AH[0]); 482 &mov (&DWP(4,"esi"),$AH[1]); 483 &mov (&DWP(8,"esi"),"edi"); 484 &mov (&DWP(12,"esi"),"ecx"); 485 #&mov (&DWP(0,"esp"),$AH[0]); 486 &mov (&DWP(4,"esp"),$AH[1]); 487 &xor ($AH[1],"edi"); # magic 488 &mov (&DWP(8,"esp"),"edi"); 489 &mov (&DWP(12,"esp"),"ecx"); 490 #&mov ($E,&DWP(16,"esp")); 491 &mov ("edi",&DWP(20,"esp")); 492 &mov ("ebx",&DWP(24,"esp")); 493 &mov ("ecx",&DWP(28,"esp")); 494 &add ($E,&DWP(16,"esi")); 495 &add ("edi",&DWP(20,"esi")); 496 &add ("ebx",&DWP(24,"esi")); 497 &add ("ecx",&DWP(28,"esi")); 498 &mov (&DWP(16,"esi"),$E); 499 &mov (&DWP(20,"esi"),"edi"); 500 &mov (&DWP(24,"esi"),"ebx"); 501 &mov (&DWP(28,"esi"),"ecx"); 502 #&mov (&DWP(16,"esp"),$E); 503 &mov (&DWP(20,"esp"),"edi"); 504 &mov ("edi",&DWP(96+4,"esp")); # inp 505 &mov (&DWP(24,"esp"),"ebx"); 506 &mov (&DWP(28,"esp"),"ecx"); 507 508 &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 509 &jb (&label("grand_loop")); 510 511 &mov ("esp",&DWP(96+12,"esp")); # restore sp 512&function_end_A(); 513} 514 if (!$i386 && $xmm) {{{ 515if ($shaext) { 516###################################################################### 517# Intel SHA Extensions implementation of SHA256 update function. 518# 519my ($ctx,$inp,$end)=("esi","edi","eax"); 520my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7)); 521my @MSG=map("xmm$_",(3..6)); 522 523sub sha256op38 { 524 my ($opcodelet,$dst,$src)=@_; 525 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 526 { &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2); } 527} 528sub sha256rnds2 { sha256op38(0xcb,@_); } 529sub sha256msg1 { sha256op38(0xcc,@_); } 530sub sha256msg2 { sha256op38(0xcd,@_); } 531 532&set_label("shaext",32); 533 &sub ("esp",32); 534 535 &movdqu ($ABEF,&QWP(0,$ctx)); # DCBA 536 &lea ($K256,&DWP(0x80,$K256)); 537 &movdqu ($CDGH,&QWP(16,$ctx)); # HGFE 538 &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask 539 540 &pshufd ($Wi,$ABEF,0x1b); # ABCD 541 &pshufd ($ABEF,$ABEF,0xb1); # CDAB 542 &pshufd ($CDGH,$CDGH,0x1b); # EFGH 543 &palignr ($ABEF,$CDGH,8); # ABEF 544 &punpcklqdq ($CDGH,$Wi); # CDGH 545 &jmp (&label("loop_shaext")); 546 547&set_label("loop_shaext",16); 548 &movdqu (@MSG[0],&QWP(0,$inp)); 549 &movdqu (@MSG[1],&QWP(0x10,$inp)); 550 &movdqu (@MSG[2],&QWP(0x20,$inp)); 551 &pshufb (@MSG[0],$TMP); 552 &movdqu (@MSG[3],&QWP(0x30,$inp)); 553 &movdqa (&QWP(16,"esp"),$CDGH); # offload 554 555 &movdqa ($Wi,&QWP(0*16-0x80,$K256)); 556 &paddd ($Wi,@MSG[0]); 557 &pshufb (@MSG[1],$TMP); 558 &sha256rnds2 ($CDGH,$ABEF); # 0-3 559 &pshufd ($Wi,$Wi,0x0e); 560 &nop (); 561 &movdqa (&QWP(0,"esp"),$ABEF); # offload 562 &sha256rnds2 ($ABEF,$CDGH); 563 564 &movdqa ($Wi,&QWP(1*16-0x80,$K256)); 565 &paddd ($Wi,@MSG[1]); 566 &pshufb (@MSG[2],$TMP); 567 &sha256rnds2 ($CDGH,$ABEF); # 4-7 568 &pshufd ($Wi,$Wi,0x0e); 569 &lea ($inp,&DWP(0x40,$inp)); 570 &sha256msg1 (@MSG[0],@MSG[1]); 571 &sha256rnds2 ($ABEF,$CDGH); 572 573 &movdqa ($Wi,&QWP(2*16-0x80,$K256)); 574 &paddd ($Wi,@MSG[2]); 575 &pshufb (@MSG[3],$TMP); 576 &sha256rnds2 ($CDGH,$ABEF); # 8-11 577 &pshufd ($Wi,$Wi,0x0e); 578 &movdqa ($TMP,@MSG[3]); 579 &palignr ($TMP,@MSG[2],4); 580 &nop (); 581 &paddd (@MSG[0],$TMP); 582 &sha256msg1 (@MSG[1],@MSG[2]); 583 &sha256rnds2 ($ABEF,$CDGH); 584 585 &movdqa ($Wi,&QWP(3*16-0x80,$K256)); 586 &paddd ($Wi,@MSG[3]); 587 &sha256msg2 (@MSG[0],@MSG[3]); 588 &sha256rnds2 ($CDGH,$ABEF); # 12-15 589 &pshufd ($Wi,$Wi,0x0e); 590 &movdqa ($TMP,@MSG[0]); 591 &palignr ($TMP,@MSG[3],4); 592 &nop (); 593 &paddd (@MSG[1],$TMP); 594 &sha256msg1 (@MSG[2],@MSG[3]); 595 &sha256rnds2 ($ABEF,$CDGH); 596 597for($i=4;$i<16-3;$i++) { 598 &movdqa ($Wi,&QWP($i*16-0x80,$K256)); 599 &paddd ($Wi,@MSG[0]); 600 &sha256msg2 (@MSG[1],@MSG[0]); 601 &sha256rnds2 ($CDGH,$ABEF); # 16-19... 602 &pshufd ($Wi,$Wi,0x0e); 603 &movdqa ($TMP,@MSG[1]); 604 &palignr ($TMP,@MSG[0],4); 605 &nop (); 606 &paddd (@MSG[2],$TMP); 607 &sha256msg1 (@MSG[3],@MSG[0]); 608 &sha256rnds2 ($ABEF,$CDGH); 609 610 push(@MSG,shift(@MSG)); 611} 612 &movdqa ($Wi,&QWP(13*16-0x80,$K256)); 613 &paddd ($Wi,@MSG[0]); 614 &sha256msg2 (@MSG[1],@MSG[0]); 615 &sha256rnds2 ($CDGH,$ABEF); # 52-55 616 &pshufd ($Wi,$Wi,0x0e); 617 &movdqa ($TMP,@MSG[1]) 618 &palignr ($TMP,@MSG[0],4); 619 &sha256rnds2 ($ABEF,$CDGH); 620 &paddd (@MSG[2],$TMP); 621 622 &movdqa ($Wi,&QWP(14*16-0x80,$K256)); 623 &paddd ($Wi,@MSG[1]); 624 &sha256rnds2 ($CDGH,$ABEF); # 56-59 625 &pshufd ($Wi,$Wi,0x0e); 626 &sha256msg2 (@MSG[2],@MSG[1]); 627 &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask 628 &sha256rnds2 ($ABEF,$CDGH); 629 630 &movdqa ($Wi,&QWP(15*16-0x80,$K256)); 631 &paddd ($Wi,@MSG[2]); 632 &nop (); 633 &sha256rnds2 ($CDGH,$ABEF); # 60-63 634 &pshufd ($Wi,$Wi,0x0e); 635 &cmp ($end,$inp); 636 &nop (); 637 &sha256rnds2 ($ABEF,$CDGH); 638 639 &paddd ($CDGH,&QWP(16,"esp")); 640 &paddd ($ABEF,&QWP(0,"esp")); 641 &jnz (&label("loop_shaext")); 642 643 &pshufd ($CDGH,$CDGH,0xb1); # DCHG 644 &pshufd ($TMP,$ABEF,0x1b); # FEBA 645 &pshufd ($ABEF,$ABEF,0xb1); # BAFE 646 &punpckhqdq ($ABEF,$CDGH); # DCBA 647 &palignr ($CDGH,$TMP,8); # HGFE 648 649 &mov ("esp",&DWP(32+12,"esp")); 650 &movdqu (&QWP(0,$ctx),$ABEF); 651 &movdqu (&QWP(16,$ctx),$CDGH); 652&function_end_A(); 653} 654 655my @X = map("xmm$_",(0..3)); 656my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7)); 657my @AH = ($A,$T); 658 659&set_label("SSSE3",32); 660 &lea ("esp",&DWP(-96,"esp")); 661 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 662 &mov ($AH[0],&DWP(0,"esi")); 663 &mov ($AH[1],&DWP(4,"esi")); 664 &mov ("ecx",&DWP(8,"esi")); 665 &mov ("edi",&DWP(12,"esi")); 666 #&mov (&DWP(0,"esp"),$AH[0]); 667 &mov (&DWP(4,"esp"),$AH[1]); 668 &xor ($AH[1],"ecx"); # magic 669 &mov (&DWP(8,"esp"),"ecx"); 670 &mov (&DWP(12,"esp"),"edi"); 671 &mov ($E,&DWP(16,"esi")); 672 &mov ("edi",&DWP(20,"esi")); 673 &mov ("ecx",&DWP(24,"esi")); 674 &mov ("esi",&DWP(28,"esi")); 675 #&mov (&DWP(16,"esp"),$E); 676 &mov (&DWP(20,"esp"),"edi"); 677 &mov ("edi",&DWP(96+4,"esp")); # inp 678 &mov (&DWP(24,"esp"),"ecx"); 679 &mov (&DWP(28,"esp"),"esi"); 680 &movdqa ($t3,&QWP(256,$K256)); 681 &jmp (&label("grand_ssse3")); 682 683&set_label("grand_ssse3",16); 684 # load input, reverse byte order, add K256[0..15], save to stack 685 &movdqu (@X[0],&QWP(0,"edi")); 686 &movdqu (@X[1],&QWP(16,"edi")); 687 &movdqu (@X[2],&QWP(32,"edi")); 688 &movdqu (@X[3],&QWP(48,"edi")); 689 &add ("edi",64); 690 &pshufb (@X[0],$t3); 691 &mov (&DWP(96+4,"esp"),"edi"); 692 &pshufb (@X[1],$t3); 693 &movdqa ($t0,&QWP(0,$K256)); 694 &pshufb (@X[2],$t3); 695 &movdqa ($t1,&QWP(16,$K256)); 696 &paddd ($t0,@X[0]); 697 &pshufb (@X[3],$t3); 698 &movdqa ($t2,&QWP(32,$K256)); 699 &paddd ($t1,@X[1]); 700 &movdqa ($t3,&QWP(48,$K256)); 701 &movdqa (&QWP(32+0,"esp"),$t0); 702 &paddd ($t2,@X[2]); 703 &movdqa (&QWP(32+16,"esp"),$t1); 704 &paddd ($t3,@X[3]); 705 &movdqa (&QWP(32+32,"esp"),$t2); 706 &movdqa (&QWP(32+48,"esp"),$t3); 707 &jmp (&label("ssse3_00_47")); 708 709&set_label("ssse3_00_47",16); 710 &add ($K256,64); 711 712sub SSSE3_00_47 () { 713my $j = shift; 714my $body = shift; 715my @X = @_; 716my @insns = (&$body,&$body,&$body,&$body); # 120 instructions 717 718 eval(shift(@insns)); 719 &movdqa ($t0,@X[1]); 720 eval(shift(@insns)); # @ 721 eval(shift(@insns)); 722 &movdqa ($t3,@X[3]); 723 eval(shift(@insns)); 724 eval(shift(@insns)); 725 &palignr ($t0,@X[0],4); # X[1..4] 726 eval(shift(@insns)); 727 eval(shift(@insns)); # @ 728 eval(shift(@insns)); 729 &palignr ($t3,@X[2],4); # X[9..12] 730 eval(shift(@insns)); 731 eval(shift(@insns)); 732 eval(shift(@insns)); 733 &movdqa ($t1,$t0); 734 eval(shift(@insns)); # @ 735 eval(shift(@insns)); 736 &movdqa ($t2,$t0); 737 eval(shift(@insns)); 738 eval(shift(@insns)); 739 &psrld ($t0,3); 740 eval(shift(@insns)); 741 eval(shift(@insns)); # @ 742 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 743 eval(shift(@insns)); 744 eval(shift(@insns)); 745 &psrld ($t2,7); 746 eval(shift(@insns)); 747 eval(shift(@insns)); 748 eval(shift(@insns)); # @ 749 eval(shift(@insns)); 750 &pshufd ($t3,@X[3],0b11111010); # X[14..15] 751 eval(shift(@insns)); 752 eval(shift(@insns)); 753 &pslld ($t1,32-18); 754 eval(shift(@insns)); 755 eval(shift(@insns)); # @ 756 &pxor ($t0,$t2); 757 eval(shift(@insns)); 758 eval(shift(@insns)); 759 &psrld ($t2,18-7); 760 eval(shift(@insns)); 761 eval(shift(@insns)); 762 eval(shift(@insns)); # @ 763 &pxor ($t0,$t1); 764 eval(shift(@insns)); 765 eval(shift(@insns)); 766 &pslld ($t1,18-7); 767 eval(shift(@insns)); 768 eval(shift(@insns)); 769 eval(shift(@insns)); # @ 770 &pxor ($t0,$t2); 771 eval(shift(@insns)); 772 eval(shift(@insns)); 773 &movdqa ($t2,$t3); 774 eval(shift(@insns)); 775 eval(shift(@insns)); 776 eval(shift(@insns)); # @ 777 &pxor ($t0,$t1); # sigma0(X[1..4]) 778 eval(shift(@insns)); 779 eval(shift(@insns)); 780 &psrld ($t3,10); 781 eval(shift(@insns)); 782 eval(shift(@insns)); 783 eval(shift(@insns)); # @ 784 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 785 eval(shift(@insns)); 786 eval(shift(@insns)); 787 &psrlq ($t2,17); 788 eval(shift(@insns)); 789 eval(shift(@insns)); 790 eval(shift(@insns)); # @ 791 &pxor ($t3,$t2); 792 eval(shift(@insns)); 793 eval(shift(@insns)); 794 &psrlq ($t2,19-17); 795 eval(shift(@insns)); 796 eval(shift(@insns)); 797 eval(shift(@insns)); # @ 798 &pxor ($t3,$t2); 799 eval(shift(@insns)); 800 eval(shift(@insns)); 801 &pshufd ($t3,$t3,0b10000000); 802 eval(shift(@insns)); 803 eval(shift(@insns)); 804 eval(shift(@insns)); # @ 805 eval(shift(@insns)); 806 eval(shift(@insns)); 807 eval(shift(@insns)); 808 eval(shift(@insns)); 809 eval(shift(@insns)); # @ 810 eval(shift(@insns)); 811 &psrldq ($t3,8); 812 eval(shift(@insns)); 813 eval(shift(@insns)); 814 eval(shift(@insns)); 815 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 816 eval(shift(@insns)); # @ 817 eval(shift(@insns)); 818 eval(shift(@insns)); 819 eval(shift(@insns)); 820 eval(shift(@insns)); 821 eval(shift(@insns)); # @ 822 eval(shift(@insns)); 823 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 824 eval(shift(@insns)); 825 eval(shift(@insns)); 826 eval(shift(@insns)); 827 &movdqa ($t2,$t3); 828 eval(shift(@insns)); # @ 829 &psrld ($t3,10); 830 eval(shift(@insns)); 831 &psrlq ($t2,17); 832 eval(shift(@insns)); 833 eval(shift(@insns)); 834 eval(shift(@insns)); 835 eval(shift(@insns)); # @ 836 &pxor ($t3,$t2); 837 eval(shift(@insns)); 838 eval(shift(@insns)); 839 &psrlq ($t2,19-17); 840 eval(shift(@insns)); 841 eval(shift(@insns)); 842 eval(shift(@insns)); # @ 843 &pxor ($t3,$t2); 844 eval(shift(@insns)); 845 eval(shift(@insns)); 846 eval(shift(@insns)); 847 &pshufd ($t3,$t3,0b00001000); 848 eval(shift(@insns)); 849 eval(shift(@insns)); # @ 850 &movdqa ($t2,&QWP(16*$j,$K256)); 851 eval(shift(@insns)); 852 eval(shift(@insns)); 853 &pslldq ($t3,8); 854 eval(shift(@insns)); 855 eval(shift(@insns)); 856 eval(shift(@insns)); # @ 857 eval(shift(@insns)); 858 eval(shift(@insns)); 859 eval(shift(@insns)); 860 eval(shift(@insns)); 861 eval(shift(@insns)); # @ 862 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 863 eval(shift(@insns)); 864 eval(shift(@insns)); 865 eval(shift(@insns)); 866 eval(shift(@insns)); 867 &paddd ($t2,@X[0]); 868 eval(shift(@insns)); # @ 869 870 foreach (@insns) { eval; } # remaining instructions 871 872 &movdqa (&QWP(32+16*$j,"esp"),$t2); 873} 874 875sub body_00_15 () { 876 ( 877 '&mov ("ecx",$E);', 878 '&ror ($E,25-11);', 879 '&mov ("esi",&off($f));', 880 '&xor ($E,"ecx");', 881 '&mov ("edi",&off($g));', 882 '&xor ("esi","edi");', 883 '&ror ($E,11-6);', 884 '&and ("esi","ecx");', 885 '&mov (&off($e),"ecx");', # save $E, modulo-scheduled 886 '&xor ($E,"ecx");', 887 '&xor ("edi","esi");', # Ch(e,f,g) 888 '&ror ($E,6);', # T = Sigma1(e) 889 '&mov ("ecx",$AH[0]);', 890 '&add ($E,"edi");', # T += Ch(e,f,g) 891 '&mov ("edi",&off($b));', 892 '&mov ("esi",$AH[0]);', 893 894 '&ror ("ecx",22-13);', 895 '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled 896 '&xor ("ecx",$AH[0]);', 897 '&xor ($AH[0],"edi");', # a ^= b, (b^c) in next round 898 '&add ($E,&off($h));', # T += h 899 '&ror ("ecx",13-2);', 900 '&and ($AH[1],$AH[0]);', # (b^c) &= (a^b) 901 '&xor ("ecx","esi");', 902 '&add ($E,&DWP(32+4*($i&15),"esp"));', # T += K[i]+X[i] 903 '&xor ($AH[1],"edi");', # h = Maj(a,b,c) = Ch(a^b,c,b) 904 '&ror ("ecx",2);', # Sigma0(a) 905 906 '&add ($AH[1],$E);', # h += T 907 '&add ($E,&off($d));', # d += T 908 '&add ($AH[1],"ecx");'. # h += Sigma0(a) 909 910 '@AH = reverse(@AH); $i++;' # rotate(a,h) 911 ); 912} 913 914 for ($i=0,$j=0; $j<4; $j++) { 915 &SSSE3_00_47($j,\&body_00_15,@X); 916 push(@X,shift(@X)); # rotate(@X) 917 } 918 &cmp (&DWP(16*$j,$K256),0x00010203); 919 &jne (&label("ssse3_00_47")); 920 921 for ($i=0; $i<16; ) { 922 foreach(body_00_15()) { eval; } 923 } 924 925 &mov ("esi",&DWP(96,"esp")); #ctx 926 #&mov ($AH[0],&DWP(0,"esp")); 927 &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); 928 #&mov ("edi", &DWP(8,"esp")); 929 &mov ("ecx",&DWP(12,"esp")); 930 &add ($AH[0],&DWP(0,"esi")); 931 &add ($AH[1],&DWP(4,"esi")); 932 &add ("edi",&DWP(8,"esi")); 933 &add ("ecx",&DWP(12,"esi")); 934 &mov (&DWP(0,"esi"),$AH[0]); 935 &mov (&DWP(4,"esi"),$AH[1]); 936 &mov (&DWP(8,"esi"),"edi"); 937 &mov (&DWP(12,"esi"),"ecx"); 938 #&mov (&DWP(0,"esp"),$AH[0]); 939 &mov (&DWP(4,"esp"),$AH[1]); 940 &xor ($AH[1],"edi"); # magic 941 &mov (&DWP(8,"esp"),"edi"); 942 &mov (&DWP(12,"esp"),"ecx"); 943 #&mov ($E,&DWP(16,"esp")); 944 &mov ("edi",&DWP(20,"esp")); 945 &mov ("ecx",&DWP(24,"esp")); 946 &add ($E,&DWP(16,"esi")); 947 &add ("edi",&DWP(20,"esi")); 948 &add ("ecx",&DWP(24,"esi")); 949 &mov (&DWP(16,"esi"),$E); 950 &mov (&DWP(20,"esi"),"edi"); 951 &mov (&DWP(20,"esp"),"edi"); 952 &mov ("edi",&DWP(28,"esp")); 953 &mov (&DWP(24,"esi"),"ecx"); 954 #&mov (&DWP(16,"esp"),$E); 955 &add ("edi",&DWP(28,"esi")); 956 &mov (&DWP(24,"esp"),"ecx"); 957 &mov (&DWP(28,"esi"),"edi"); 958 &mov (&DWP(28,"esp"),"edi"); 959 &mov ("edi",&DWP(96+4,"esp")); # inp 960 961 &movdqa ($t3,&QWP(64,$K256)); 962 &sub ($K256,3*64); # rewind K 963 &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 964 &jb (&label("grand_ssse3")); 965 966 &mov ("esp",&DWP(96+12,"esp")); # restore sp 967&function_end_A(); 968 if ($avx) { 969&set_label("AVX",32); 970 if ($avx>1) { 971 &and ("edx",1<<8|1<<3); # check for BMI2+BMI1 972 &cmp ("edx",1<<8|1<<3); 973 &je (&label("AVX_BMI")); 974 } 975 &lea ("esp",&DWP(-96,"esp")); 976 &vzeroall (); 977 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 978 &mov ($AH[0],&DWP(0,"esi")); 979 &mov ($AH[1],&DWP(4,"esi")); 980 &mov ("ecx",&DWP(8,"esi")); 981 &mov ("edi",&DWP(12,"esi")); 982 #&mov (&DWP(0,"esp"),$AH[0]); 983 &mov (&DWP(4,"esp"),$AH[1]); 984 &xor ($AH[1],"ecx"); # magic 985 &mov (&DWP(8,"esp"),"ecx"); 986 &mov (&DWP(12,"esp"),"edi"); 987 &mov ($E,&DWP(16,"esi")); 988 &mov ("edi",&DWP(20,"esi")); 989 &mov ("ecx",&DWP(24,"esi")); 990 &mov ("esi",&DWP(28,"esi")); 991 #&mov (&DWP(16,"esp"),$E); 992 &mov (&DWP(20,"esp"),"edi"); 993 &mov ("edi",&DWP(96+4,"esp")); # inp 994 &mov (&DWP(24,"esp"),"ecx"); 995 &mov (&DWP(28,"esp"),"esi"); 996 &vmovdqa ($t3,&QWP(256,$K256)); 997 &jmp (&label("grand_avx")); 998 999&set_label("grand_avx",32); 1000 # load input, reverse byte order, add K256[0..15], save to stack 1001 &vmovdqu (@X[0],&QWP(0,"edi")); 1002 &vmovdqu (@X[1],&QWP(16,"edi")); 1003 &vmovdqu (@X[2],&QWP(32,"edi")); 1004 &vmovdqu (@X[3],&QWP(48,"edi")); 1005 &add ("edi",64); 1006 &vpshufb (@X[0],@X[0],$t3); 1007 &mov (&DWP(96+4,"esp"),"edi"); 1008 &vpshufb (@X[1],@X[1],$t3); 1009 &vpshufb (@X[2],@X[2],$t3); 1010 &vpaddd ($t0,@X[0],&QWP(0,$K256)); 1011 &vpshufb (@X[3],@X[3],$t3); 1012 &vpaddd ($t1,@X[1],&QWP(16,$K256)); 1013 &vpaddd ($t2,@X[2],&QWP(32,$K256)); 1014 &vpaddd ($t3,@X[3],&QWP(48,$K256)); 1015 &vmovdqa (&QWP(32+0,"esp"),$t0); 1016 &vmovdqa (&QWP(32+16,"esp"),$t1); 1017 &vmovdqa (&QWP(32+32,"esp"),$t2); 1018 &vmovdqa (&QWP(32+48,"esp"),$t3); 1019 &jmp (&label("avx_00_47")); 1020 1021&set_label("avx_00_47",16); 1022 &add ($K256,64); 1023 1024sub Xupdate_AVX () { 1025 ( 1026 '&vpalignr ($t0,@X[1],@X[0],4);', # X[1..4] 1027 '&vpalignr ($t3,@X[3],@X[2],4);', # X[9..12] 1028 '&vpsrld ($t2,$t0,7);', 1029 '&vpaddd (@X[0],@X[0],$t3);', # X[0..3] += X[9..16] 1030 '&vpsrld ($t3,$t0,3);', 1031 '&vpslld ($t1,$t0,14);', 1032 '&vpxor ($t0,$t3,$t2);', 1033 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1034 '&vpsrld ($t2,$t2,18-7);', 1035 '&vpxor ($t0,$t0,$t1);', 1036 '&vpslld ($t1,$t1,25-14);', 1037 '&vpxor ($t0,$t0,$t2);', 1038 '&vpsrld ($t2,$t3,10);', 1039 '&vpxor ($t0,$t0,$t1);', # sigma0(X[1..4]) 1040 '&vpsrlq ($t1,$t3,17);', 1041 '&vpaddd (@X[0],@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 1042 '&vpxor ($t2,$t2,$t1);', 1043 '&vpsrlq ($t3,$t3,19);', 1044 '&vpxor ($t2,$t2,$t3);', # sigma1(X[14..15] 1045 '&vpshufd ($t3,$t2,0b10000100);', 1046 '&vpsrldq ($t3,$t3,8);', 1047 '&vpaddd (@X[0],@X[0],$t3);', # X[0..1] += sigma1(X[14..15]) 1048 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1049 '&vpsrld ($t2,$t3,10);', 1050 '&vpsrlq ($t1,$t3,17);', 1051 '&vpxor ($t2,$t2,$t1);', 1052 '&vpsrlq ($t3,$t3,19);', 1053 '&vpxor ($t2,$t2,$t3);', # sigma1(X[16..17] 1054 '&vpshufd ($t3,$t2,0b11101000);', 1055 '&vpslldq ($t3,$t3,8);', 1056 '&vpaddd (@X[0],@X[0],$t3);' # X[2..3] += sigma1(X[16..17]) 1057 ); 1058} 1059 1060local *ror = sub { &shrd(@_[0],@_) }; 1061sub AVX_00_47 () { 1062my $j = shift; 1063my $body = shift; 1064my @X = @_; 1065my @insns = (&$body,&$body,&$body,&$body); # 120 instructions 1066my $insn; 1067 1068 foreach (Xupdate_AVX()) { # 31 instructions 1069 eval; 1070 eval(shift(@insns)); 1071 eval(shift(@insns)); 1072 eval($insn = shift(@insns)); 1073 eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/); 1074 } 1075 &vpaddd ($t2,@X[0],&QWP(16*$j,$K256)); 1076 foreach (@insns) { eval; } # remaining instructions 1077 &vmovdqa (&QWP(32+16*$j,"esp"),$t2); 1078} 1079 1080 for ($i=0,$j=0; $j<4; $j++) { 1081 &AVX_00_47($j,\&body_00_15,@X); 1082 push(@X,shift(@X)); # rotate(@X) 1083 } 1084 &cmp (&DWP(16*$j,$K256),0x00010203); 1085 &jne (&label("avx_00_47")); 1086 1087 for ($i=0; $i<16; ) { 1088 foreach(body_00_15()) { eval; } 1089 } 1090 1091 &mov ("esi",&DWP(96,"esp")); #ctx 1092 #&mov ($AH[0],&DWP(0,"esp")); 1093 &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); 1094 #&mov ("edi", &DWP(8,"esp")); 1095 &mov ("ecx",&DWP(12,"esp")); 1096 &add ($AH[0],&DWP(0,"esi")); 1097 &add ($AH[1],&DWP(4,"esi")); 1098 &add ("edi",&DWP(8,"esi")); 1099 &add ("ecx",&DWP(12,"esi")); 1100 &mov (&DWP(0,"esi"),$AH[0]); 1101 &mov (&DWP(4,"esi"),$AH[1]); 1102 &mov (&DWP(8,"esi"),"edi"); 1103 &mov (&DWP(12,"esi"),"ecx"); 1104 #&mov (&DWP(0,"esp"),$AH[0]); 1105 &mov (&DWP(4,"esp"),$AH[1]); 1106 &xor ($AH[1],"edi"); # magic 1107 &mov (&DWP(8,"esp"),"edi"); 1108 &mov (&DWP(12,"esp"),"ecx"); 1109 #&mov ($E,&DWP(16,"esp")); 1110 &mov ("edi",&DWP(20,"esp")); 1111 &mov ("ecx",&DWP(24,"esp")); 1112 &add ($E,&DWP(16,"esi")); 1113 &add ("edi",&DWP(20,"esi")); 1114 &add ("ecx",&DWP(24,"esi")); 1115 &mov (&DWP(16,"esi"),$E); 1116 &mov (&DWP(20,"esi"),"edi"); 1117 &mov (&DWP(20,"esp"),"edi"); 1118 &mov ("edi",&DWP(28,"esp")); 1119 &mov (&DWP(24,"esi"),"ecx"); 1120 #&mov (&DWP(16,"esp"),$E); 1121 &add ("edi",&DWP(28,"esi")); 1122 &mov (&DWP(24,"esp"),"ecx"); 1123 &mov (&DWP(28,"esi"),"edi"); 1124 &mov (&DWP(28,"esp"),"edi"); 1125 &mov ("edi",&DWP(96+4,"esp")); # inp 1126 1127 &vmovdqa ($t3,&QWP(64,$K256)); 1128 &sub ($K256,3*64); # rewind K 1129 &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 1130 &jb (&label("grand_avx")); 1131 1132 &mov ("esp",&DWP(96+12,"esp")); # restore sp 1133 &vzeroall (); 1134&function_end_A(); 1135 if ($avx>1) { 1136sub bodyx_00_15 () { # +10% 1137 ( 1138 '&rorx ("ecx",$E,6)', 1139 '&rorx ("esi",$E,11)', 1140 '&mov (&off($e),$E)', # save $E, modulo-scheduled 1141 '&rorx ("edi",$E,25)', 1142 '&xor ("ecx","esi")', 1143 '&andn ("esi",$E,&off($g))', 1144 '&xor ("ecx","edi")', # Sigma1(e) 1145 '&and ($E,&off($f))', 1146 '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled 1147 '&or ($E,"esi")', # T = Ch(e,f,g) 1148 1149 '&rorx ("edi",$AH[0],2)', 1150 '&rorx ("esi",$AH[0],13)', 1151 '&lea ($E,&DWP(0,$E,"ecx"))', # T += Sigma1(e) 1152 '&rorx ("ecx",$AH[0],22)', 1153 '&xor ("esi","edi")', 1154 '&mov ("edi",&off($b))', 1155 '&xor ("ecx","esi")', # Sigma0(a) 1156 1157 '&xor ($AH[0],"edi")', # a ^= b, (b^c) in next round 1158 '&add ($E,&off($h))', # T += h 1159 '&and ($AH[1],$AH[0])', # (b^c) &= (a^b) 1160 '&add ($E,&DWP(32+4*($i&15),"esp"))', # T += K[i]+X[i] 1161 '&xor ($AH[1],"edi")', # h = Maj(a,b,c) = Ch(a^b,c,b) 1162 1163 '&add ("ecx",$E)', # h += T 1164 '&add ($E,&off($d))', # d += T 1165 '&lea ($AH[1],&DWP(0,$AH[1],"ecx"));'. # h += Sigma0(a) 1166 1167 '@AH = reverse(@AH); $i++;' # rotate(a,h) 1168 ); 1169} 1170 1171&set_label("AVX_BMI",32); 1172 &lea ("esp",&DWP(-96,"esp")); 1173 &vzeroall (); 1174 # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack 1175 &mov ($AH[0],&DWP(0,"esi")); 1176 &mov ($AH[1],&DWP(4,"esi")); 1177 &mov ("ecx",&DWP(8,"esi")); 1178 &mov ("edi",&DWP(12,"esi")); 1179 #&mov (&DWP(0,"esp"),$AH[0]); 1180 &mov (&DWP(4,"esp"),$AH[1]); 1181 &xor ($AH[1],"ecx"); # magic 1182 &mov (&DWP(8,"esp"),"ecx"); 1183 &mov (&DWP(12,"esp"),"edi"); 1184 &mov ($E,&DWP(16,"esi")); 1185 &mov ("edi",&DWP(20,"esi")); 1186 &mov ("ecx",&DWP(24,"esi")); 1187 &mov ("esi",&DWP(28,"esi")); 1188 #&mov (&DWP(16,"esp"),$E); 1189 &mov (&DWP(20,"esp"),"edi"); 1190 &mov ("edi",&DWP(96+4,"esp")); # inp 1191 &mov (&DWP(24,"esp"),"ecx"); 1192 &mov (&DWP(28,"esp"),"esi"); 1193 &vmovdqa ($t3,&QWP(256,$K256)); 1194 &jmp (&label("grand_avx_bmi")); 1195 1196&set_label("grand_avx_bmi",32); 1197 # load input, reverse byte order, add K256[0..15], save to stack 1198 &vmovdqu (@X[0],&QWP(0,"edi")); 1199 &vmovdqu (@X[1],&QWP(16,"edi")); 1200 &vmovdqu (@X[2],&QWP(32,"edi")); 1201 &vmovdqu (@X[3],&QWP(48,"edi")); 1202 &add ("edi",64); 1203 &vpshufb (@X[0],@X[0],$t3); 1204 &mov (&DWP(96+4,"esp"),"edi"); 1205 &vpshufb (@X[1],@X[1],$t3); 1206 &vpshufb (@X[2],@X[2],$t3); 1207 &vpaddd ($t0,@X[0],&QWP(0,$K256)); 1208 &vpshufb (@X[3],@X[3],$t3); 1209 &vpaddd ($t1,@X[1],&QWP(16,$K256)); 1210 &vpaddd ($t2,@X[2],&QWP(32,$K256)); 1211 &vpaddd ($t3,@X[3],&QWP(48,$K256)); 1212 &vmovdqa (&QWP(32+0,"esp"),$t0); 1213 &vmovdqa (&QWP(32+16,"esp"),$t1); 1214 &vmovdqa (&QWP(32+32,"esp"),$t2); 1215 &vmovdqa (&QWP(32+48,"esp"),$t3); 1216 &jmp (&label("avx_bmi_00_47")); 1217 1218&set_label("avx_bmi_00_47",16); 1219 &add ($K256,64); 1220 1221 for ($i=0,$j=0; $j<4; $j++) { 1222 &AVX_00_47($j,\&bodyx_00_15,@X); 1223 push(@X,shift(@X)); # rotate(@X) 1224 } 1225 &cmp (&DWP(16*$j,$K256),0x00010203); 1226 &jne (&label("avx_bmi_00_47")); 1227 1228 for ($i=0; $i<16; ) { 1229 foreach(bodyx_00_15()) { eval; } 1230 } 1231 1232 &mov ("esi",&DWP(96,"esp")); #ctx 1233 #&mov ($AH[0],&DWP(0,"esp")); 1234 &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); 1235 #&mov ("edi", &DWP(8,"esp")); 1236 &mov ("ecx",&DWP(12,"esp")); 1237 &add ($AH[0],&DWP(0,"esi")); 1238 &add ($AH[1],&DWP(4,"esi")); 1239 &add ("edi",&DWP(8,"esi")); 1240 &add ("ecx",&DWP(12,"esi")); 1241 &mov (&DWP(0,"esi"),$AH[0]); 1242 &mov (&DWP(4,"esi"),$AH[1]); 1243 &mov (&DWP(8,"esi"),"edi"); 1244 &mov (&DWP(12,"esi"),"ecx"); 1245 #&mov (&DWP(0,"esp"),$AH[0]); 1246 &mov (&DWP(4,"esp"),$AH[1]); 1247 &xor ($AH[1],"edi"); # magic 1248 &mov (&DWP(8,"esp"),"edi"); 1249 &mov (&DWP(12,"esp"),"ecx"); 1250 #&mov ($E,&DWP(16,"esp")); 1251 &mov ("edi",&DWP(20,"esp")); 1252 &mov ("ecx",&DWP(24,"esp")); 1253 &add ($E,&DWP(16,"esi")); 1254 &add ("edi",&DWP(20,"esi")); 1255 &add ("ecx",&DWP(24,"esi")); 1256 &mov (&DWP(16,"esi"),$E); 1257 &mov (&DWP(20,"esi"),"edi"); 1258 &mov (&DWP(20,"esp"),"edi"); 1259 &mov ("edi",&DWP(28,"esp")); 1260 &mov (&DWP(24,"esi"),"ecx"); 1261 #&mov (&DWP(16,"esp"),$E); 1262 &add ("edi",&DWP(28,"esi")); 1263 &mov (&DWP(24,"esp"),"ecx"); 1264 &mov (&DWP(28,"esi"),"edi"); 1265 &mov (&DWP(28,"esp"),"edi"); 1266 &mov ("edi",&DWP(96+4,"esp")); # inp 1267 1268 &vmovdqa ($t3,&QWP(64,$K256)); 1269 &sub ($K256,3*64); # rewind K 1270 &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 1271 &jb (&label("grand_avx_bmi")); 1272 1273 &mov ("esp",&DWP(96+12,"esp")); # restore sp 1274 &vzeroall (); 1275&function_end_A(); 1276 } 1277 } 1278 }}} 1279&function_end_B("sha256_block_data_order"); 1280 1281&asm_finish(); 1282 1283close STDOUT; 1284