1#!/usr/bin/env perl 2# 3# ==================================================================== 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5# project. Rights for redistribution and usage in source and binary 6# forms are granted according to the OpenSSL license. 7# ==================================================================== 8# 9# sha256/512_block procedure for x86_64. 10# 11# 40% improvement over compiler-generated code on Opteron. On EM64T 12# sha256 was observed to run >80% faster and sha512 - >40%. No magical 13# tricks, just straight implementation... I really wonder why gcc 14# [being armed with inline assembler] fails to generate as fast code. 15# The only thing which is cool about this module is that it's very 16# same instruction sequence used for both SHA-256 and SHA-512. In 17# former case the instructions operate on 32-bit operands, while in 18# latter - on 64-bit ones. All I had to do is to get one flavor right, 19# the other one passed the test right away:-) 20# 21# sha256_block runs in ~1005 cycles on Opteron, which gives you 22# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock 23# frequency in GHz. sha512_block runs in ~1275 cycles, which results 24# in 128*1000/1275=100MBps per GHz. Is there room for improvement? 25# Well, if you compare it to IA-64 implementation, which maintains 26# X[16] in register bank[!], tends to 4 instructions per CPU clock 27# cycle and runs in 1003 cycles, 1275 is very good result for 3-way 28# issue Opteron pipeline and X[16] maintained in memory. So that *if* 29# there is a way to improve it, *then* the only way would be to try to 30# offload X[16] updates to SSE unit, but that would require "deeper" 31# loop unroll, which in turn would naturally cause size blow-up, not 32# to mention increased complexity! And once again, only *if* it's 33# actually possible to noticeably improve overall ILP, instruction 34# level parallelism, on a given CPU implementation in this case. 35# 36# Special note on Intel EM64T. While Opteron CPU exhibits perfect 37# performance ratio of 1.5 between 64- and 32-bit flavors [see above], 38# [currently available] EM64T CPUs apparently are far from it. On the 39# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit 40# sha256_block:-( This is presumably because 64-bit shifts/rotates 41# apparently are not atomic instructions, but implemented in microcode. 42# 43# May 2012. 44# 45# Optimization including one of Pavel Semjanov's ideas, alternative 46# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and 47# unfortunately -2% SHA512 on P4 [which nobody should care about 48# that much]. 49# 50# June 2012. 51# 52# Add SIMD code paths, see below for improvement coefficients. SSSE3 53# code path was not attempted for SHA512, because improvement is not 54# estimated to be high enough, noticeably less than 9%, to justify 55# the effort, not on pre-AVX processors. [Obviously with exclusion 56# for VIA Nano, but it has SHA512 instruction that is faster and 57# should be used instead.] For reference, corresponding estimated 58# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that 59# higher coefficients are observed on VIA Nano and Bulldozer has more 60# to do with specifics of their architecture [which is topic for 61# separate discussion]. 62# 63# November 2012. 64# 65# Add AVX2 code path. 
Two consecutive input blocks are loaded to 66# 256-bit %ymm registers, with data from first block to least 67# significant 128-bit halves and data from second to most significant. 68# The data is then processed with same SIMD instruction sequence as 69# for AVX, but with %ymm as operands. Side effect is increased stack 70# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB 71# code size increase. 72# 73# March 2014. 74# 75# Add support for Intel SHA Extensions. 76 77###################################################################### 78# Current performance in cycles per processed byte (less is better): 79# 80# SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*) 81# 82# AMD K8 14.9 - - 9.57 - 83# P4 17.3 - - 30.8 - 84# Core 2 15.6 13.8(+13%) - 9.97 - 85# Westmere 14.8 12.3(+19%) - 9.58 - 86# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) 87# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) 88# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) 89# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%) 90# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) 91# VIA Nano 23.0 16.5(+39%) - 14.7 - 92# Atom 23.0 18.9(+22%) - 14.7 - 93# Silvermont 27.4 20.6(+33%) - 17.5 - 94# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - 95# 96# (*) whichever best applicable, including SHAEXT; 97# (**) switch from ror to shrd stands for fair share of improvement; 98# (***) execution time is fully determined by remaining integer-only 99# part, body_00_15; reducing the amount of SIMD instructions 100# below certain limit makes no difference/sense; to conserve 101# space SHA256 XOP code path is therefore omitted; 102 103$flavour = shift; 104$output = shift; 105if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 106 107$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 108 109$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 110( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 111( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 112die "can't locate x86_64-xlate.pl"; 113 114# In upstream, this is controlled by shelling out to the compiler to check 115# versions, but BoringSSL is intended to be used with pre-generated perlasm 116# output, so this isn't useful anyway. 117# 118# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it 119# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream 120# did not tie them together until after $shaext was added. 121$avx = 1; 122 123# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's 124# been tested. 
125$shaext=0; ### set to zero if compiling for 1.0.1 126$avx=1 if (!$shaext && $avx); 127 128open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 129*STDOUT=*OUT; 130 131if ($output =~ /512/) { 132 $func="sha512_block_data_order"; 133 $TABLE="K512"; 134 $SZ=8; 135 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", 136 "%r8", "%r9", "%r10","%r11"); 137 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi"); 138 @Sigma0=(28,34,39); 139 @Sigma1=(14,18,41); 140 @sigma0=(1, 8, 7); 141 @sigma1=(19,61, 6); 142 $rounds=80; 143} else { 144 $func="sha256_block_data_order"; 145 $TABLE="K256"; 146 $SZ=4; 147 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", 148 "%r8d","%r9d","%r10d","%r11d"); 149 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); 150 @Sigma0=( 2,13,22); 151 @Sigma1=( 6,11,25); 152 @sigma0=( 7,18, 3); 153 @sigma1=(17,19,10); 154 $rounds=64; 155} 156 157$ctx="%rdi"; # 1st arg, zapped by $a3 158$inp="%rsi"; # 2nd arg 159$Tbl="%rbp"; 160 161$_ctx="16*$SZ+0*8(%rsp)"; 162$_inp="16*$SZ+1*8(%rsp)"; 163$_end="16*$SZ+2*8(%rsp)"; 164$_rsp="16*$SZ+3*8(%rsp)"; 165$framesz="16*$SZ+4*8"; 166 167 168sub ROUND_00_15() 169{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 170 my $STRIDE=$SZ; 171 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); 172 173$code.=<<___; 174 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 175 mov $f,$a2 176 177 xor $e,$a0 178 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 179 xor $g,$a2 # f^g 180 181 mov $T1,`$SZ*($i&0xf)`(%rsp) 182 xor $a,$a1 183 and $e,$a2 # (f^g)&e 184 185 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 186 add $h,$T1 # T1+=h 187 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g 188 189 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 190 xor $e,$a0 191 add $a2,$T1 # T1+=Ch(e,f,g) 192 193 mov $a,$a2 194 add ($Tbl),$T1 # T1+=K[round] 195 xor $a,$a1 196 197 xor $b,$a2 # a^b, b^c in next round 198 ror \$$Sigma1[0],$a0 # Sigma1(e) 199 mov $b,$h 200 201 and $a2,$a3 202 ror \$$Sigma0[0],$a1 # Sigma0(a) 203 add $a0,$T1 # T1+=Sigma1(e) 204 205 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) 206 add $T1,$d # d+=T1 207 add $T1,$h # h+=T1 208 209 lea $STRIDE($Tbl),$Tbl # round++ 210___ 211$code.=<<___ if ($i<15); 212 add $a1,$h # h+=Sigma0(a) 213___ 214 ($a2,$a3) = ($a3,$a2); 215} 216 217sub ROUND_16_XX() 218{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 219 220$code.=<<___; 221 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 222 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 223 224 mov $a0,$T1 225 ror \$`$sigma0[1]-$sigma0[0]`,$a0 226 add $a1,$a # modulo-scheduled h+=Sigma0(a) 227 mov $a2,$a1 228 ror \$`$sigma1[1]-$sigma1[0]`,$a2 229 230 xor $T1,$a0 231 shr \$$sigma0[2],$T1 232 ror \$$sigma0[0],$a0 233 xor $a1,$a2 234 shr \$$sigma1[2],$a1 235 236 ror \$$sigma1[0],$a2 237 xor $a0,$T1 # sigma0(X[(i+1)&0xf]) 238 xor $a1,$a2 # sigma1(X[(i+14)&0xf]) 239 add `$SZ*(($i+9)&0xf)`(%rsp),$T1 240 241 add `$SZ*($i&0xf)`(%rsp),$T1 242 mov $e,$a0 243 add $a2,$T1 244 mov $a,$a1 245___ 246 &ROUND_00_15(@_); 247} 248 249$code=<<___; 250.text 251 252.extern OPENSSL_ia32cap_P 253.globl $func 254.type $func,\@function,3 255.align 16 256$func: 257___ 258$code.=<<___ if ($SZ==4 || $avx); 259 lea OPENSSL_ia32cap_P(%rip),%r11 260 mov 0(%r11),%r9d 261 mov 4(%r11),%r10d 262 mov 8(%r11),%r11d 263___ 264$code.=<<___ if ($SZ==4 && $shaext); 265 test \$`1<<29`,%r11d # check for SHA 266 jnz _shaext_shortcut 267___ 268$code.=<<___ if ($avx && $SZ==8); 269 test \$`1<<11`,%r10d # check for XOP 270 jnz .Lxop_shortcut 271___ 272$code.=<<___ if ($avx>1); 273 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 274 cmp \$`1<<8|1<<5|1<<3`,%r11d 275 je .Lavx2_shortcut 276___ 277$code.=<<___ if 
($avx); 278 and \$`1<<30`,%r9d # mask "Intel CPU" bit 279 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits 280 or %r9d,%r10d 281 cmp \$`1<<28|1<<9|1<<30`,%r10d 282 je .Lavx_shortcut 283___ 284$code.=<<___ if ($SZ==4); 285 test \$`1<<9`,%r10d 286 jnz .Lssse3_shortcut 287___ 288$code.=<<___; 289 mov %rsp,%rax # copy %rsp 290 push %rbx 291 push %rbp 292 push %r12 293 push %r13 294 push %r14 295 push %r15 296 shl \$4,%rdx # num*16 297 sub \$$framesz,%rsp 298 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 299 and \$-64,%rsp # align stack frame 300 mov $ctx,$_ctx # save ctx, 1st arg 301 mov $inp,$_inp # save inp, 2nd arh 302 mov %rdx,$_end # save end pointer, "3rd" arg 303 mov %rax,$_rsp # save copy of %rsp 304.Lprologue: 305 306 mov $SZ*0($ctx),$A 307 mov $SZ*1($ctx),$B 308 mov $SZ*2($ctx),$C 309 mov $SZ*3($ctx),$D 310 mov $SZ*4($ctx),$E 311 mov $SZ*5($ctx),$F 312 mov $SZ*6($ctx),$G 313 mov $SZ*7($ctx),$H 314 jmp .Lloop 315 316.align 16 317.Lloop: 318 mov $B,$a3 319 lea $TABLE(%rip),$Tbl 320 xor $C,$a3 # magic 321___ 322 for($i=0;$i<16;$i++) { 323 $code.=" mov $SZ*$i($inp),$T1\n"; 324 $code.=" mov @ROT[4],$a0\n"; 325 $code.=" mov @ROT[0],$a1\n"; 326 $code.=" bswap $T1\n"; 327 &ROUND_00_15($i,@ROT); 328 unshift(@ROT,pop(@ROT)); 329 } 330$code.=<<___; 331 jmp .Lrounds_16_xx 332.align 16 333.Lrounds_16_xx: 334___ 335 for(;$i<32;$i++) { 336 &ROUND_16_XX($i,@ROT); 337 unshift(@ROT,pop(@ROT)); 338 } 339 340$code.=<<___; 341 cmpb \$0,`$SZ-1`($Tbl) 342 jnz .Lrounds_16_xx 343 344 mov $_ctx,$ctx 345 add $a1,$A # modulo-scheduled h+=Sigma0(a) 346 lea 16*$SZ($inp),$inp 347 348 add $SZ*0($ctx),$A 349 add $SZ*1($ctx),$B 350 add $SZ*2($ctx),$C 351 add $SZ*3($ctx),$D 352 add $SZ*4($ctx),$E 353 add $SZ*5($ctx),$F 354 add $SZ*6($ctx),$G 355 add $SZ*7($ctx),$H 356 357 cmp $_end,$inp 358 359 mov $A,$SZ*0($ctx) 360 mov $B,$SZ*1($ctx) 361 mov $C,$SZ*2($ctx) 362 mov $D,$SZ*3($ctx) 363 mov $E,$SZ*4($ctx) 364 mov $F,$SZ*5($ctx) 365 mov $G,$SZ*6($ctx) 366 mov $H,$SZ*7($ctx) 367 jb .Lloop 368 369 mov $_rsp,%rsi 370 mov -48(%rsi),%r15 371 mov -40(%rsi),%r14 372 mov -32(%rsi),%r13 373 mov -24(%rsi),%r12 374 mov -16(%rsi),%rbp 375 mov -8(%rsi),%rbx 376 lea (%rsi),%rsp 377.Lepilogue: 378 ret 379.size $func,.-$func 380___ 381 382if ($SZ==4) { 383$code.=<<___; 384.align 64 385.type $TABLE,\@object 386$TABLE: 387 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 388 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 389 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 390 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 391 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 392 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 393 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 394 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 395 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 396 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 397 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 398 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 399 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 400 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 401 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 402 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 403 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 404 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 405 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 406 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 407 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 408 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 409 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 410 
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 411 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 412 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 413 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 414 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 415 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 416 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 417 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 418 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 419 420 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 421 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 422 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 423 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 424 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 425 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 426 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 427___ 428} else { 429$code.=<<___; 430.align 64 431.type $TABLE,\@object 432$TABLE: 433 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 434 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 435 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 436 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 437 .quad 0x3956c25bf348b538,0x59f111f1b605d019 438 .quad 0x3956c25bf348b538,0x59f111f1b605d019 439 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 440 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 441 .quad 0xd807aa98a3030242,0x12835b0145706fbe 442 .quad 0xd807aa98a3030242,0x12835b0145706fbe 443 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 444 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 445 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 446 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 447 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 448 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 449 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 450 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 451 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 452 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 453 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 454 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 455 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 456 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 457 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 458 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 459 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 460 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 461 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 462 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 463 .quad 0x06ca6351e003826f,0x142929670a0e6e70 464 .quad 0x06ca6351e003826f,0x142929670a0e6e70 465 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 466 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 467 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 468 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 469 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 470 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 471 .quad 0x81c2c92e47edaee6,0x92722c851482353b 472 .quad 0x81c2c92e47edaee6,0x92722c851482353b 473 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 474 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 475 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 476 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 477 .quad 0xd192e819d6ef5218,0xd69906245565a910 478 .quad 0xd192e819d6ef5218,0xd69906245565a910 479 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 480 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 481 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 482 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 483 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 484 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 485 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 486 .quad 
0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 487 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 488 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 489 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 490 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 491 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 492 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 493 .quad 0x90befffa23631e28,0xa4506cebde82bde9 494 .quad 0x90befffa23631e28,0xa4506cebde82bde9 495 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 496 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 497 .quad 0xca273eceea26619c,0xd186b8c721c0c207 498 .quad 0xca273eceea26619c,0xd186b8c721c0c207 499 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 500 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 501 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 502 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 503 .quad 0x113f9804bef90dae,0x1b710b35131c471b 504 .quad 0x113f9804bef90dae,0x1b710b35131c471b 505 .quad 0x28db77f523047d84,0x32caab7b40c72493 506 .quad 0x28db77f523047d84,0x32caab7b40c72493 507 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 508 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 509 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 510 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 511 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 512 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 513 514 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 515 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 516 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 517___ 518} 519 520###################################################################### 521# SIMD code paths 522# 523if ($SZ==4 && $shaext) {{{ 524###################################################################### 525# Intel SHA Extensions implementation of SHA256 update function. 526# 527my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); 528 529my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); 530my @MSG=map("%xmm$_",(3..6)); 531 532$code.=<<___; 533.type sha256_block_data_order_shaext,\@function,3 534.align 64 535sha256_block_data_order_shaext: 536_shaext_shortcut: 537___ 538$code.=<<___ if ($win64); 539 lea `-8-5*16`(%rsp),%rsp 540 movaps %xmm6,-8-5*16(%rax) 541 movaps %xmm7,-8-4*16(%rax) 542 movaps %xmm8,-8-3*16(%rax) 543 movaps %xmm9,-8-2*16(%rax) 544 movaps %xmm10,-8-1*16(%rax) 545.Lprologue_shaext: 546___ 547$code.=<<___; 548 lea K256+0x80(%rip),$Tbl 549 movdqu ($ctx),$ABEF # DCBA 550 movdqu 16($ctx),$CDGH # HGFE 551 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 552 553 pshufd \$0x1b,$ABEF,$Wi # ABCD 554 pshufd \$0xb1,$ABEF,$ABEF # CDAB 555 pshufd \$0x1b,$CDGH,$CDGH # EFGH 556 movdqa $TMP,$BSWAP # offload 557 palignr \$8,$CDGH,$ABEF # ABEF 558 punpcklqdq $Wi,$CDGH # CDGH 559 jmp .Loop_shaext 560 561.align 16 562.Loop_shaext: 563 movdqu ($inp),@MSG[0] 564 movdqu 0x10($inp),@MSG[1] 565 movdqu 0x20($inp),@MSG[2] 566 pshufb $TMP,@MSG[0] 567 movdqu 0x30($inp),@MSG[3] 568 569 movdqa 0*32-0x80($Tbl),$Wi 570 paddd @MSG[0],$Wi 571 pshufb $TMP,@MSG[1] 572 movdqa $CDGH,$CDGH_SAVE # offload 573 sha256rnds2 $ABEF,$CDGH # 0-3 574 pshufd \$0x0e,$Wi,$Wi 575 nop 576 movdqa $ABEF,$ABEF_SAVE # offload 577 sha256rnds2 $CDGH,$ABEF 578 579 movdqa 1*32-0x80($Tbl),$Wi 580 paddd @MSG[1],$Wi 581 pshufb $TMP,@MSG[2] 582 sha256rnds2 $ABEF,$CDGH # 4-7 583 pshufd \$0x0e,$Wi,$Wi 584 lea 0x40($inp),$inp 585 sha256msg1 @MSG[1],@MSG[0] 586 sha256rnds2 $CDGH,$ABEF 587 588 movdqa 2*32-0x80($Tbl),$Wi 589 paddd @MSG[2],$Wi 590 pshufb $TMP,@MSG[3] 591 sha256rnds2 $ABEF,$CDGH # 8-11 592 pshufd \$0x0e,$Wi,$Wi 593 movdqa @MSG[3],$TMP 594 
palignr \$4,@MSG[2],$TMP 595 nop 596 paddd $TMP,@MSG[0] 597 sha256msg1 @MSG[2],@MSG[1] 598 sha256rnds2 $CDGH,$ABEF 599 600 movdqa 3*32-0x80($Tbl),$Wi 601 paddd @MSG[3],$Wi 602 sha256msg2 @MSG[3],@MSG[0] 603 sha256rnds2 $ABEF,$CDGH # 12-15 604 pshufd \$0x0e,$Wi,$Wi 605 movdqa @MSG[0],$TMP 606 palignr \$4,@MSG[3],$TMP 607 nop 608 paddd $TMP,@MSG[1] 609 sha256msg1 @MSG[3],@MSG[2] 610 sha256rnds2 $CDGH,$ABEF 611___ 612for($i=4;$i<16-3;$i++) { 613$code.=<<___; 614 movdqa $i*32-0x80($Tbl),$Wi 615 paddd @MSG[0],$Wi 616 sha256msg2 @MSG[0],@MSG[1] 617 sha256rnds2 $ABEF,$CDGH # 16-19... 618 pshufd \$0x0e,$Wi,$Wi 619 movdqa @MSG[1],$TMP 620 palignr \$4,@MSG[0],$TMP 621 nop 622 paddd $TMP,@MSG[2] 623 sha256msg1 @MSG[0],@MSG[3] 624 sha256rnds2 $CDGH,$ABEF 625___ 626 push(@MSG,shift(@MSG)); 627} 628$code.=<<___; 629 movdqa 13*32-0x80($Tbl),$Wi 630 paddd @MSG[0],$Wi 631 sha256msg2 @MSG[0],@MSG[1] 632 sha256rnds2 $ABEF,$CDGH # 52-55 633 pshufd \$0x0e,$Wi,$Wi 634 movdqa @MSG[1],$TMP 635 palignr \$4,@MSG[0],$TMP 636 sha256rnds2 $CDGH,$ABEF 637 paddd $TMP,@MSG[2] 638 639 movdqa 14*32-0x80($Tbl),$Wi 640 paddd @MSG[1],$Wi 641 sha256rnds2 $ABEF,$CDGH # 56-59 642 pshufd \$0x0e,$Wi,$Wi 643 sha256msg2 @MSG[1],@MSG[2] 644 movdqa $BSWAP,$TMP 645 sha256rnds2 $CDGH,$ABEF 646 647 movdqa 15*32-0x80($Tbl),$Wi 648 paddd @MSG[2],$Wi 649 nop 650 sha256rnds2 $ABEF,$CDGH # 60-63 651 pshufd \$0x0e,$Wi,$Wi 652 dec $num 653 nop 654 sha256rnds2 $CDGH,$ABEF 655 656 paddd $CDGH_SAVE,$CDGH 657 paddd $ABEF_SAVE,$ABEF 658 jnz .Loop_shaext 659 660 pshufd \$0xb1,$CDGH,$CDGH # DCHG 661 pshufd \$0x1b,$ABEF,$TMP # FEBA 662 pshufd \$0xb1,$ABEF,$ABEF # BAFE 663 punpckhqdq $CDGH,$ABEF # DCBA 664 palignr \$8,$TMP,$CDGH # HGFE 665 666 movdqu $ABEF,($ctx) 667 movdqu $CDGH,16($ctx) 668___ 669$code.=<<___ if ($win64); 670 movaps -8-5*16(%rax),%xmm6 671 movaps -8-4*16(%rax),%xmm7 672 movaps -8-3*16(%rax),%xmm8 673 movaps -8-2*16(%rax),%xmm9 674 movaps -8-1*16(%rax),%xmm10 675 mov %rax,%rsp 676.Lepilogue_shaext: 677___ 678$code.=<<___; 679 ret 680.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext 681___ 682}}} 683{{{ 684 685my $a4=$T1; 686my ($a,$b,$c,$d,$e,$f,$g,$h); 687 688sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 689{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 690 my $arg = pop; 691 $arg = "\$$arg" if ($arg*1 eq $arg); 692 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 693} 694 695sub body_00_15 () { 696 ( 697 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 698 699 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 700 '&mov ($a,$a1)', 701 '&mov ($a4,$f)', 702 703 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 704 '&xor ($a0,$e)', 705 '&xor ($a4,$g)', # f^g 706 707 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 708 '&xor ($a1,$a)', 709 '&and ($a4,$e)', # (f^g)&e 710 711 '&xor ($a0,$e)', 712 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 713 '&mov ($a2,$a)', 714 715 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 716 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 717 '&xor ($a2,$b)', # a^b, b^c in next round 718 719 '&add ($h,$a4)', # h+=Ch(e,f,g) 720 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 721 '&and ($a3,$a2)', # (b^c)&(a^b) 722 723 '&xor ($a1,$a)', 724 '&add ($h,$a0)', # h+=Sigma1(e) 725 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 726 727 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 728 '&add ($d,$h)', # d+=h 729 '&add ($h,$a3)', # h+=Maj(a,b,c) 730 731 '&mov ($a0,$d)', 732 '&add ($a1,$h);'. 
# h+=Sigma0(a) 733 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 734 ); 735} 736 737###################################################################### 738# SSSE3 code path 739# 740if ($SZ==4) { # SHA256 only 741my @X = map("%xmm$_",(0..3)); 742my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 743 744$code.=<<___; 745.type ${func}_ssse3,\@function,3 746.align 64 747${func}_ssse3: 748.Lssse3_shortcut: 749 mov %rsp,%rax # copy %rsp 750 push %rbx 751 push %rbp 752 push %r12 753 push %r13 754 push %r14 755 push %r15 756 shl \$4,%rdx # num*16 757 sub \$`$framesz+$win64*16*4`,%rsp 758 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 759 and \$-64,%rsp # align stack frame 760 mov $ctx,$_ctx # save ctx, 1st arg 761 mov $inp,$_inp # save inp, 2nd arh 762 mov %rdx,$_end # save end pointer, "3rd" arg 763 mov %rax,$_rsp # save copy of %rsp 764___ 765$code.=<<___ if ($win64); 766 movaps %xmm6,16*$SZ+32(%rsp) 767 movaps %xmm7,16*$SZ+48(%rsp) 768 movaps %xmm8,16*$SZ+64(%rsp) 769 movaps %xmm9,16*$SZ+80(%rsp) 770___ 771$code.=<<___; 772.Lprologue_ssse3: 773 774 mov $SZ*0($ctx),$A 775 mov $SZ*1($ctx),$B 776 mov $SZ*2($ctx),$C 777 mov $SZ*3($ctx),$D 778 mov $SZ*4($ctx),$E 779 mov $SZ*5($ctx),$F 780 mov $SZ*6($ctx),$G 781 mov $SZ*7($ctx),$H 782___ 783 784$code.=<<___; 785 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 786 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 787 jmp .Lloop_ssse3 788.align 16 789.Lloop_ssse3: 790 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 791 movdqu 0x00($inp),@X[0] 792 movdqu 0x10($inp),@X[1] 793 movdqu 0x20($inp),@X[2] 794 pshufb $t3,@X[0] 795 movdqu 0x30($inp),@X[3] 796 lea $TABLE(%rip),$Tbl 797 pshufb $t3,@X[1] 798 movdqa 0x00($Tbl),$t0 799 movdqa 0x20($Tbl),$t1 800 pshufb $t3,@X[2] 801 paddd @X[0],$t0 802 movdqa 0x40($Tbl),$t2 803 pshufb $t3,@X[3] 804 movdqa 0x60($Tbl),$t3 805 paddd @X[1],$t1 806 paddd @X[2],$t2 807 paddd @X[3],$t3 808 movdqa $t0,0x00(%rsp) 809 mov $A,$a1 810 movdqa $t1,0x10(%rsp) 811 mov $B,$a3 812 movdqa $t2,0x20(%rsp) 813 xor $C,$a3 # magic 814 movdqa $t3,0x30(%rsp) 815 mov $E,$a0 816 jmp .Lssse3_00_47 817 818.align 16 819.Lssse3_00_47: 820 sub \$`-16*2*$SZ`,$Tbl # size optimization 821___ 822sub Xupdate_256_SSSE3 () { 823 ( 824 '&movdqa ($t0,@X[1]);', 825 '&movdqa ($t3,@X[3])', 826 '&palignr ($t0,@X[0],$SZ)', # X[1..4] 827 '&palignr ($t3,@X[2],$SZ);', # X[9..12] 828 '&movdqa ($t1,$t0)', 829 '&movdqa ($t2,$t0);', 830 '&psrld ($t0,$sigma0[2])', 831 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] 832 '&psrld ($t2,$sigma0[0])', 833 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] 834 '&pslld ($t1,8*$SZ-$sigma0[1]);'. 835 '&pxor ($t0,$t2)', 836 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. 837 '&pxor ($t0,$t1)', 838 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
839 '&pxor ($t0,$t2);', 840 '&movdqa ($t2,$t3)', 841 '&pxor ($t0,$t1);', # sigma0(X[1..4]) 842 '&psrld ($t3,$sigma1[2])', 843 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 844 '&psrlq ($t2,$sigma1[0])', 845 '&pxor ($t3,$t2);', 846 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 847 '&pxor ($t3,$t2)', 848 '&pshufb ($t3,$t4)', # sigma1(X[14..15]) 849 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 850 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] 851 '&movdqa ($t2,$t3);', 852 '&psrld ($t3,$sigma1[2])', 853 '&psrlq ($t2,$sigma1[0])', 854 '&pxor ($t3,$t2);', 855 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 856 '&pxor ($t3,$t2);', 857 '&movdqa ($t2,16*2*$j."($Tbl)")', 858 '&pshufb ($t3,$t5)', 859 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) 860 ); 861} 862 863sub SSSE3_256_00_47 () { 864my $j = shift; 865my $body = shift; 866my @X = @_; 867my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 868 869 if (0) { 870 foreach (Xupdate_256_SSSE3()) { # 36 instructions 871 eval; 872 eval(shift(@insns)); 873 eval(shift(@insns)); 874 eval(shift(@insns)); 875 } 876 } else { # squeeze extra 4% on Westmere and 19% on Atom 877 eval(shift(@insns)); #@ 878 &movdqa ($t0,@X[1]); 879 eval(shift(@insns)); 880 eval(shift(@insns)); 881 &movdqa ($t3,@X[3]); 882 eval(shift(@insns)); #@ 883 eval(shift(@insns)); 884 eval(shift(@insns)); 885 eval(shift(@insns)); #@ 886 eval(shift(@insns)); 887 &palignr ($t0,@X[0],$SZ); # X[1..4] 888 eval(shift(@insns)); 889 eval(shift(@insns)); 890 &palignr ($t3,@X[2],$SZ); # X[9..12] 891 eval(shift(@insns)); 892 eval(shift(@insns)); 893 eval(shift(@insns)); 894 eval(shift(@insns)); #@ 895 &movdqa ($t1,$t0); 896 eval(shift(@insns)); 897 eval(shift(@insns)); 898 &movdqa ($t2,$t0); 899 eval(shift(@insns)); #@ 900 eval(shift(@insns)); 901 &psrld ($t0,$sigma0[2]); 902 eval(shift(@insns)); 903 eval(shift(@insns)); 904 eval(shift(@insns)); 905 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 906 eval(shift(@insns)); #@ 907 eval(shift(@insns)); 908 &psrld ($t2,$sigma0[0]); 909 eval(shift(@insns)); 910 eval(shift(@insns)); 911 &pshufd ($t3,@X[3],0b11111010); # X[4..15] 912 eval(shift(@insns)); 913 eval(shift(@insns)); #@ 914 &pslld ($t1,8*$SZ-$sigma0[1]); 915 eval(shift(@insns)); 916 eval(shift(@insns)); 917 &pxor ($t0,$t2); 918 eval(shift(@insns)); #@ 919 eval(shift(@insns)); 920 eval(shift(@insns)); 921 eval(shift(@insns)); #@ 922 &psrld ($t2,$sigma0[1]-$sigma0[0]); 923 eval(shift(@insns)); 924 &pxor ($t0,$t1); 925 eval(shift(@insns)); 926 eval(shift(@insns)); 927 &pslld ($t1,$sigma0[1]-$sigma0[0]); 928 eval(shift(@insns)); 929 eval(shift(@insns)); 930 &pxor ($t0,$t2); 931 eval(shift(@insns)); 932 eval(shift(@insns)); #@ 933 &movdqa ($t2,$t3); 934 eval(shift(@insns)); 935 eval(shift(@insns)); 936 &pxor ($t0,$t1); # sigma0(X[1..4]) 937 eval(shift(@insns)); #@ 938 eval(shift(@insns)); 939 eval(shift(@insns)); 940 &psrld ($t3,$sigma1[2]); 941 eval(shift(@insns)); 942 eval(shift(@insns)); 943 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 944 eval(shift(@insns)); #@ 945 eval(shift(@insns)); 946 &psrlq ($t2,$sigma1[0]); 947 eval(shift(@insns)); 948 eval(shift(@insns)); 949 eval(shift(@insns)); 950 &pxor ($t3,$t2); 951 eval(shift(@insns)); #@ 952 eval(shift(@insns)); 953 eval(shift(@insns)); 954 eval(shift(@insns)); #@ 955 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 956 eval(shift(@insns)); 957 eval(shift(@insns)); 958 &pxor ($t3,$t2); 959 eval(shift(@insns)); #@ 960 eval(shift(@insns)); 961 eval(shift(@insns)); 962 #&pshufb ($t3,$t4); # sigma1(X[14..15]) 963 &pshufd ($t3,$t3,0b10000000); 964 
eval(shift(@insns)); 965 eval(shift(@insns)); 966 eval(shift(@insns)); 967 &psrldq ($t3,8); 968 eval(shift(@insns)); 969 eval(shift(@insns)); #@ 970 eval(shift(@insns)); 971 eval(shift(@insns)); 972 eval(shift(@insns)); #@ 973 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 974 eval(shift(@insns)); 975 eval(shift(@insns)); 976 eval(shift(@insns)); 977 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 978 eval(shift(@insns)); 979 eval(shift(@insns)); #@ 980 eval(shift(@insns)); 981 &movdqa ($t2,$t3); 982 eval(shift(@insns)); 983 eval(shift(@insns)); 984 &psrld ($t3,$sigma1[2]); 985 eval(shift(@insns)); 986 eval(shift(@insns)); #@ 987 &psrlq ($t2,$sigma1[0]); 988 eval(shift(@insns)); 989 eval(shift(@insns)); 990 &pxor ($t3,$t2); 991 eval(shift(@insns)); #@ 992 eval(shift(@insns)); 993 eval(shift(@insns)); 994 eval(shift(@insns)); #@ 995 eval(shift(@insns)); 996 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 997 eval(shift(@insns)); 998 eval(shift(@insns)); 999 eval(shift(@insns)); 1000 &pxor ($t3,$t2); 1001 eval(shift(@insns)); 1002 eval(shift(@insns)); 1003 eval(shift(@insns)); #@ 1004 #&pshufb ($t3,$t5); 1005 &pshufd ($t3,$t3,0b00001000); 1006 eval(shift(@insns)); 1007 eval(shift(@insns)); 1008 &movdqa ($t2,16*2*$j."($Tbl)"); 1009 eval(shift(@insns)); #@ 1010 eval(shift(@insns)); 1011 &pslldq ($t3,8); 1012 eval(shift(@insns)); 1013 eval(shift(@insns)); 1014 eval(shift(@insns)); 1015 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1016 eval(shift(@insns)); #@ 1017 eval(shift(@insns)); 1018 eval(shift(@insns)); 1019 } 1020 &paddd ($t2,@X[0]); 1021 foreach (@insns) { eval; } # remaining instructions 1022 &movdqa (16*$j."(%rsp)",$t2); 1023} 1024 1025 for ($i=0,$j=0; $j<4; $j++) { 1026 &SSSE3_256_00_47($j,\&body_00_15,@X); 1027 push(@X,shift(@X)); # rotate(@X) 1028 } 1029 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1030 &jne (".Lssse3_00_47"); 1031 1032 for ($i=0; $i<16; ) { 1033 foreach(body_00_15()) { eval; } 1034 } 1035$code.=<<___; 1036 mov $_ctx,$ctx 1037 mov $a1,$A 1038 1039 add $SZ*0($ctx),$A 1040 lea 16*$SZ($inp),$inp 1041 add $SZ*1($ctx),$B 1042 add $SZ*2($ctx),$C 1043 add $SZ*3($ctx),$D 1044 add $SZ*4($ctx),$E 1045 add $SZ*5($ctx),$F 1046 add $SZ*6($ctx),$G 1047 add $SZ*7($ctx),$H 1048 1049 cmp $_end,$inp 1050 1051 mov $A,$SZ*0($ctx) 1052 mov $B,$SZ*1($ctx) 1053 mov $C,$SZ*2($ctx) 1054 mov $D,$SZ*3($ctx) 1055 mov $E,$SZ*4($ctx) 1056 mov $F,$SZ*5($ctx) 1057 mov $G,$SZ*6($ctx) 1058 mov $H,$SZ*7($ctx) 1059 jb .Lloop_ssse3 1060 1061 mov $_rsp,%rsi 1062___ 1063$code.=<<___ if ($win64); 1064 movaps 16*$SZ+32(%rsp),%xmm6 1065 movaps 16*$SZ+48(%rsp),%xmm7 1066 movaps 16*$SZ+64(%rsp),%xmm8 1067 movaps 16*$SZ+80(%rsp),%xmm9 1068___ 1069$code.=<<___; 1070 mov -48(%rsi),%r15 1071 mov -40(%rsi),%r14 1072 mov -32(%rsi),%r13 1073 mov -24(%rsi),%r12 1074 mov -16(%rsi),%rbp 1075 mov -8(%rsi),%rbx 1076 lea (%rsi),%rsp 1077.Lepilogue_ssse3: 1078 ret 1079.size ${func}_ssse3,.-${func}_ssse3 1080___ 1081} 1082 1083if ($avx) {{ 1084###################################################################### 1085# XOP code path 1086# 1087if ($SZ==8) { # SHA512 only 1088$code.=<<___; 1089.type ${func}_xop,\@function,3 1090.align 64 1091${func}_xop: 1092.Lxop_shortcut: 1093 mov %rsp,%rax # copy %rsp 1094 push %rbx 1095 push %rbp 1096 push %r12 1097 push %r13 1098 push %r14 1099 push %r15 1100 shl \$4,%rdx # num*16 1101 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1102 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1103 and \$-64,%rsp # align stack frame 1104 mov $ctx,$_ctx # save ctx, 1st arg 1105 mov $inp,$_inp # save inp, 2nd arh 1106 mov 
%rdx,$_end # save end pointer, "3rd" arg 1107 mov %rax,$_rsp # save copy of %rsp 1108___ 1109$code.=<<___ if ($win64); 1110 movaps %xmm6,16*$SZ+32(%rsp) 1111 movaps %xmm7,16*$SZ+48(%rsp) 1112 movaps %xmm8,16*$SZ+64(%rsp) 1113 movaps %xmm9,16*$SZ+80(%rsp) 1114___ 1115$code.=<<___ if ($win64 && $SZ>4); 1116 movaps %xmm10,16*$SZ+96(%rsp) 1117 movaps %xmm11,16*$SZ+112(%rsp) 1118___ 1119$code.=<<___; 1120.Lprologue_xop: 1121 1122 vzeroupper 1123 mov $SZ*0($ctx),$A 1124 mov $SZ*1($ctx),$B 1125 mov $SZ*2($ctx),$C 1126 mov $SZ*3($ctx),$D 1127 mov $SZ*4($ctx),$E 1128 mov $SZ*5($ctx),$F 1129 mov $SZ*6($ctx),$G 1130 mov $SZ*7($ctx),$H 1131 jmp .Lloop_xop 1132___ 1133 if ($SZ==4) { # SHA256 1134 my @X = map("%xmm$_",(0..3)); 1135 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 1136 1137$code.=<<___; 1138.align 16 1139.Lloop_xop: 1140 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1141 vmovdqu 0x00($inp),@X[0] 1142 vmovdqu 0x10($inp),@X[1] 1143 vmovdqu 0x20($inp),@X[2] 1144 vmovdqu 0x30($inp),@X[3] 1145 vpshufb $t3,@X[0],@X[0] 1146 lea $TABLE(%rip),$Tbl 1147 vpshufb $t3,@X[1],@X[1] 1148 vpshufb $t3,@X[2],@X[2] 1149 vpaddd 0x00($Tbl),@X[0],$t0 1150 vpshufb $t3,@X[3],@X[3] 1151 vpaddd 0x20($Tbl),@X[1],$t1 1152 vpaddd 0x40($Tbl),@X[2],$t2 1153 vpaddd 0x60($Tbl),@X[3],$t3 1154 vmovdqa $t0,0x00(%rsp) 1155 mov $A,$a1 1156 vmovdqa $t1,0x10(%rsp) 1157 mov $B,$a3 1158 vmovdqa $t2,0x20(%rsp) 1159 xor $C,$a3 # magic 1160 vmovdqa $t3,0x30(%rsp) 1161 mov $E,$a0 1162 jmp .Lxop_00_47 1163 1164.align 16 1165.Lxop_00_47: 1166 sub \$`-16*2*$SZ`,$Tbl # size optimization 1167___ 1168sub XOP_256_00_47 () { 1169my $j = shift; 1170my $body = shift; 1171my @X = @_; 1172my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1173 1174 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] 1175 eval(shift(@insns)); 1176 eval(shift(@insns)); 1177 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] 1178 eval(shift(@insns)); 1179 eval(shift(@insns)); 1180 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); 1181 eval(shift(@insns)); 1182 eval(shift(@insns)); 1183 &vpsrld ($t0,$t0,$sigma0[2]); 1184 eval(shift(@insns)); 1185 eval(shift(@insns)); 1186 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 1187 eval(shift(@insns)); 1188 eval(shift(@insns)); 1189 eval(shift(@insns)); 1190 eval(shift(@insns)); 1191 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); 1192 eval(shift(@insns)); 1193 eval(shift(@insns)); 1194 &vpxor ($t0,$t0,$t1); 1195 eval(shift(@insns)); 1196 eval(shift(@insns)); 1197 eval(shift(@insns)); 1198 eval(shift(@insns)); 1199 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); 1200 eval(shift(@insns)); 1201 eval(shift(@insns)); 1202 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) 1203 eval(shift(@insns)); 1204 eval(shift(@insns)); 1205 &vpsrld ($t2,@X[3],$sigma1[2]); 1206 eval(shift(@insns)); 1207 eval(shift(@insns)); 1208 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 1209 eval(shift(@insns)); 1210 eval(shift(@insns)); 1211 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1212 eval(shift(@insns)); 1213 eval(shift(@insns)); 1214 &vpxor ($t3,$t3,$t2); 1215 eval(shift(@insns)); 1216 eval(shift(@insns)); 1217 eval(shift(@insns)); 1218 eval(shift(@insns)); 1219 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1220 eval(shift(@insns)); 1221 eval(shift(@insns)); 1222 eval(shift(@insns)); 1223 eval(shift(@insns)); 1224 &vpsrldq ($t3,$t3,8); 1225 eval(shift(@insns)); 1226 eval(shift(@insns)); 1227 eval(shift(@insns)); 1228 eval(shift(@insns)); 1229 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1230 eval(shift(@insns)); 1231 eval(shift(@insns)); 1232 eval(shift(@insns)); 1233 
eval(shift(@insns)); 1234 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); 1235 eval(shift(@insns)); 1236 eval(shift(@insns)); 1237 &vpsrld ($t2,@X[0],$sigma1[2]); 1238 eval(shift(@insns)); 1239 eval(shift(@insns)); 1240 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1241 eval(shift(@insns)); 1242 eval(shift(@insns)); 1243 &vpxor ($t3,$t3,$t2); 1244 eval(shift(@insns)); 1245 eval(shift(@insns)); 1246 eval(shift(@insns)); 1247 eval(shift(@insns)); 1248 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) 1249 eval(shift(@insns)); 1250 eval(shift(@insns)); 1251 eval(shift(@insns)); 1252 eval(shift(@insns)); 1253 &vpslldq ($t3,$t3,8); # 22 instructions 1254 eval(shift(@insns)); 1255 eval(shift(@insns)); 1256 eval(shift(@insns)); 1257 eval(shift(@insns)); 1258 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1259 eval(shift(@insns)); 1260 eval(shift(@insns)); 1261 eval(shift(@insns)); 1262 eval(shift(@insns)); 1263 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1264 foreach (@insns) { eval; } # remaining instructions 1265 &vmovdqa (16*$j."(%rsp)",$t2); 1266} 1267 1268 for ($i=0,$j=0; $j<4; $j++) { 1269 &XOP_256_00_47($j,\&body_00_15,@X); 1270 push(@X,shift(@X)); # rotate(@X) 1271 } 1272 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1273 &jne (".Lxop_00_47"); 1274 1275 for ($i=0; $i<16; ) { 1276 foreach(body_00_15()) { eval; } 1277 } 1278 1279 } else { # SHA512 1280 my @X = map("%xmm$_",(0..7)); 1281 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1282 1283$code.=<<___; 1284.align 16 1285.Lloop_xop: 1286 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1287 vmovdqu 0x00($inp),@X[0] 1288 lea $TABLE+0x80(%rip),$Tbl # size optimization 1289 vmovdqu 0x10($inp),@X[1] 1290 vmovdqu 0x20($inp),@X[2] 1291 vpshufb $t3,@X[0],@X[0] 1292 vmovdqu 0x30($inp),@X[3] 1293 vpshufb $t3,@X[1],@X[1] 1294 vmovdqu 0x40($inp),@X[4] 1295 vpshufb $t3,@X[2],@X[2] 1296 vmovdqu 0x50($inp),@X[5] 1297 vpshufb $t3,@X[3],@X[3] 1298 vmovdqu 0x60($inp),@X[6] 1299 vpshufb $t3,@X[4],@X[4] 1300 vmovdqu 0x70($inp),@X[7] 1301 vpshufb $t3,@X[5],@X[5] 1302 vpaddq -0x80($Tbl),@X[0],$t0 1303 vpshufb $t3,@X[6],@X[6] 1304 vpaddq -0x60($Tbl),@X[1],$t1 1305 vpshufb $t3,@X[7],@X[7] 1306 vpaddq -0x40($Tbl),@X[2],$t2 1307 vpaddq -0x20($Tbl),@X[3],$t3 1308 vmovdqa $t0,0x00(%rsp) 1309 vpaddq 0x00($Tbl),@X[4],$t0 1310 vmovdqa $t1,0x10(%rsp) 1311 vpaddq 0x20($Tbl),@X[5],$t1 1312 vmovdqa $t2,0x20(%rsp) 1313 vpaddq 0x40($Tbl),@X[6],$t2 1314 vmovdqa $t3,0x30(%rsp) 1315 vpaddq 0x60($Tbl),@X[7],$t3 1316 vmovdqa $t0,0x40(%rsp) 1317 mov $A,$a1 1318 vmovdqa $t1,0x50(%rsp) 1319 mov $B,$a3 1320 vmovdqa $t2,0x60(%rsp) 1321 xor $C,$a3 # magic 1322 vmovdqa $t3,0x70(%rsp) 1323 mov $E,$a0 1324 jmp .Lxop_00_47 1325 1326.align 16 1327.Lxop_00_47: 1328 add \$`16*2*$SZ`,$Tbl 1329___ 1330sub XOP_512_00_47 () { 1331my $j = shift; 1332my $body = shift; 1333my @X = @_; 1334my @insns = (&$body,&$body); # 52 instructions 1335 1336 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] 1337 eval(shift(@insns)); 1338 eval(shift(@insns)); 1339 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] 1340 eval(shift(@insns)); 1341 eval(shift(@insns)); 1342 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); 1343 eval(shift(@insns)); 1344 eval(shift(@insns)); 1345 &vpsrlq ($t0,$t0,$sigma0[2]); 1346 eval(shift(@insns)); 1347 eval(shift(@insns)); 1348 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] 1349 eval(shift(@insns)); 1350 eval(shift(@insns)); 1351 eval(shift(@insns)); 1352 eval(shift(@insns)); 1353 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); 1354 eval(shift(@insns)); 1355 eval(shift(@insns)); 1356 &vpxor ($t0,$t0,$t1); 1357 eval(shift(@insns)); 1358 
eval(shift(@insns)); 1359 eval(shift(@insns)); 1360 eval(shift(@insns)); 1361 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); 1362 eval(shift(@insns)); 1363 eval(shift(@insns)); 1364 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) 1365 eval(shift(@insns)); 1366 eval(shift(@insns)); 1367 &vpsrlq ($t2,@X[7],$sigma1[2]); 1368 eval(shift(@insns)); 1369 eval(shift(@insns)); 1370 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) 1371 eval(shift(@insns)); 1372 eval(shift(@insns)); 1373 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); 1374 eval(shift(@insns)); 1375 eval(shift(@insns)); 1376 &vpxor ($t3,$t3,$t2); 1377 eval(shift(@insns)); 1378 eval(shift(@insns)); 1379 eval(shift(@insns)); 1380 eval(shift(@insns)); 1381 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1382 eval(shift(@insns)); 1383 eval(shift(@insns)); 1384 eval(shift(@insns)); 1385 eval(shift(@insns)); 1386 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1387 eval(shift(@insns)); 1388 eval(shift(@insns)); 1389 eval(shift(@insns)); 1390 eval(shift(@insns)); 1391 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1392 foreach (@insns) { eval; } # remaining instructions 1393 &vmovdqa (16*$j."(%rsp)",$t2); 1394} 1395 1396 for ($i=0,$j=0; $j<8; $j++) { 1397 &XOP_512_00_47($j,\&body_00_15,@X); 1398 push(@X,shift(@X)); # rotate(@X) 1399 } 1400 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1401 &jne (".Lxop_00_47"); 1402 1403 for ($i=0; $i<16; ) { 1404 foreach(body_00_15()) { eval; } 1405 } 1406} 1407$code.=<<___; 1408 mov $_ctx,$ctx 1409 mov $a1,$A 1410 1411 add $SZ*0($ctx),$A 1412 lea 16*$SZ($inp),$inp 1413 add $SZ*1($ctx),$B 1414 add $SZ*2($ctx),$C 1415 add $SZ*3($ctx),$D 1416 add $SZ*4($ctx),$E 1417 add $SZ*5($ctx),$F 1418 add $SZ*6($ctx),$G 1419 add $SZ*7($ctx),$H 1420 1421 cmp $_end,$inp 1422 1423 mov $A,$SZ*0($ctx) 1424 mov $B,$SZ*1($ctx) 1425 mov $C,$SZ*2($ctx) 1426 mov $D,$SZ*3($ctx) 1427 mov $E,$SZ*4($ctx) 1428 mov $F,$SZ*5($ctx) 1429 mov $G,$SZ*6($ctx) 1430 mov $H,$SZ*7($ctx) 1431 jb .Lloop_xop 1432 1433 mov $_rsp,%rsi 1434 vzeroupper 1435___ 1436$code.=<<___ if ($win64); 1437 movaps 16*$SZ+32(%rsp),%xmm6 1438 movaps 16*$SZ+48(%rsp),%xmm7 1439 movaps 16*$SZ+64(%rsp),%xmm8 1440 movaps 16*$SZ+80(%rsp),%xmm9 1441___ 1442$code.=<<___ if ($win64 && $SZ>4); 1443 movaps 16*$SZ+96(%rsp),%xmm10 1444 movaps 16*$SZ+112(%rsp),%xmm11 1445___ 1446$code.=<<___; 1447 mov -48(%rsi),%r15 1448 mov -40(%rsi),%r14 1449 mov -32(%rsi),%r13 1450 mov -24(%rsi),%r12 1451 mov -16(%rsi),%rbp 1452 mov -8(%rsi),%rbx 1453 lea (%rsi),%rsp 1454.Lepilogue_xop: 1455 ret 1456.size ${func}_xop,.-${func}_xop 1457___ 1458} 1459###################################################################### 1460# AVX+shrd code path 1461# 1462local *ror = sub { &shrd(@_[0],@_) }; 1463 1464$code.=<<___; 1465.type ${func}_avx,\@function,3 1466.align 64 1467${func}_avx: 1468.Lavx_shortcut: 1469 mov %rsp,%rax # copy %rsp 1470 push %rbx 1471 push %rbp 1472 push %r12 1473 push %r13 1474 push %r14 1475 push %r15 1476 shl \$4,%rdx # num*16 1477 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1478 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1479 and \$-64,%rsp # align stack frame 1480 mov $ctx,$_ctx # save ctx, 1st arg 1481 mov $inp,$_inp # save inp, 2nd arh 1482 mov %rdx,$_end # save end pointer, "3rd" arg 1483 mov %rax,$_rsp # save copy of %rsp 1484___ 1485$code.=<<___ if ($win64); 1486 movaps %xmm6,16*$SZ+32(%rsp) 1487 movaps %xmm7,16*$SZ+48(%rsp) 1488 movaps %xmm8,16*$SZ+64(%rsp) 1489 movaps %xmm9,16*$SZ+80(%rsp) 1490___ 1491$code.=<<___ if ($win64 && $SZ>4); 1492 movaps %xmm10,16*$SZ+96(%rsp) 1493 movaps 
%xmm11,16*$SZ+112(%rsp) 1494___ 1495$code.=<<___; 1496.Lprologue_avx: 1497 1498 vzeroupper 1499 mov $SZ*0($ctx),$A 1500 mov $SZ*1($ctx),$B 1501 mov $SZ*2($ctx),$C 1502 mov $SZ*3($ctx),$D 1503 mov $SZ*4($ctx),$E 1504 mov $SZ*5($ctx),$F 1505 mov $SZ*6($ctx),$G 1506 mov $SZ*7($ctx),$H 1507___ 1508 if ($SZ==4) { # SHA256 1509 my @X = map("%xmm$_",(0..3)); 1510 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 1511 1512$code.=<<___; 1513 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1514 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1515 jmp .Lloop_avx 1516.align 16 1517.Lloop_avx: 1518 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1519 vmovdqu 0x00($inp),@X[0] 1520 vmovdqu 0x10($inp),@X[1] 1521 vmovdqu 0x20($inp),@X[2] 1522 vmovdqu 0x30($inp),@X[3] 1523 vpshufb $t3,@X[0],@X[0] 1524 lea $TABLE(%rip),$Tbl 1525 vpshufb $t3,@X[1],@X[1] 1526 vpshufb $t3,@X[2],@X[2] 1527 vpaddd 0x00($Tbl),@X[0],$t0 1528 vpshufb $t3,@X[3],@X[3] 1529 vpaddd 0x20($Tbl),@X[1],$t1 1530 vpaddd 0x40($Tbl),@X[2],$t2 1531 vpaddd 0x60($Tbl),@X[3],$t3 1532 vmovdqa $t0,0x00(%rsp) 1533 mov $A,$a1 1534 vmovdqa $t1,0x10(%rsp) 1535 mov $B,$a3 1536 vmovdqa $t2,0x20(%rsp) 1537 xor $C,$a3 # magic 1538 vmovdqa $t3,0x30(%rsp) 1539 mov $E,$a0 1540 jmp .Lavx_00_47 1541 1542.align 16 1543.Lavx_00_47: 1544 sub \$`-16*2*$SZ`,$Tbl # size optimization 1545___ 1546sub Xupdate_256_AVX () { 1547 ( 1548 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 1549 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 1550 '&vpsrld ($t2,$t0,$sigma0[0]);', 1551 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 1552 '&vpsrld ($t3,$t0,$sigma0[2])', 1553 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 1554 '&vpxor ($t0,$t3,$t2)', 1555 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1556 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1557 '&vpxor ($t0,$t0,$t1)', 1558 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1559 '&vpxor ($t0,$t0,$t2)', 1560 '&vpsrld ($t2,$t3,$sigma1[2]);', 1561 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 1562 '&vpsrlq ($t3,$t3,$sigma1[0]);', 1563 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 1564 '&vpxor ($t2,$t2,$t3);', 1565 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1566 '&vpxor ($t2,$t2,$t3)', 1567 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) 1568 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 1569 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1570 '&vpsrld ($t2,$t3,$sigma1[2])', 1571 '&vpsrlq ($t3,$t3,$sigma1[0])', 1572 '&vpxor ($t2,$t2,$t3);', 1573 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1574 '&vpxor ($t2,$t2,$t3)', 1575 '&vpshufb ($t2,$t2,$t5)', 1576 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 1577 ); 1578} 1579 1580sub AVX_256_00_47 () { 1581my $j = shift; 1582my $body = shift; 1583my @X = @_; 1584my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1585 1586 foreach (Xupdate_256_AVX()) { # 29 instructions 1587 eval; 1588 eval(shift(@insns)); 1589 eval(shift(@insns)); 1590 eval(shift(@insns)); 1591 } 1592 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1593 foreach (@insns) { eval; } # remaining instructions 1594 &vmovdqa (16*$j."(%rsp)",$t2); 1595} 1596 1597 for ($i=0,$j=0; $j<4; $j++) { 1598 &AVX_256_00_47($j,\&body_00_15,@X); 1599 push(@X,shift(@X)); # rotate(@X) 1600 } 1601 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1602 &jne (".Lavx_00_47"); 1603 1604 for ($i=0; $i<16; ) { 1605 foreach(body_00_15()) { eval; } 1606 } 1607 1608 } else { # SHA512 1609 my @X = map("%xmm$_",(0..7)); 1610 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1611 1612$code.=<<___; 1613 jmp .Lloop_avx 1614.align 16 1615.Lloop_avx: 1616 vmovdqa 
$TABLE+`$SZ*2*$rounds`(%rip),$t3 1617 vmovdqu 0x00($inp),@X[0] 1618 lea $TABLE+0x80(%rip),$Tbl # size optimization 1619 vmovdqu 0x10($inp),@X[1] 1620 vmovdqu 0x20($inp),@X[2] 1621 vpshufb $t3,@X[0],@X[0] 1622 vmovdqu 0x30($inp),@X[3] 1623 vpshufb $t3,@X[1],@X[1] 1624 vmovdqu 0x40($inp),@X[4] 1625 vpshufb $t3,@X[2],@X[2] 1626 vmovdqu 0x50($inp),@X[5] 1627 vpshufb $t3,@X[3],@X[3] 1628 vmovdqu 0x60($inp),@X[6] 1629 vpshufb $t3,@X[4],@X[4] 1630 vmovdqu 0x70($inp),@X[7] 1631 vpshufb $t3,@X[5],@X[5] 1632 vpaddq -0x80($Tbl),@X[0],$t0 1633 vpshufb $t3,@X[6],@X[6] 1634 vpaddq -0x60($Tbl),@X[1],$t1 1635 vpshufb $t3,@X[7],@X[7] 1636 vpaddq -0x40($Tbl),@X[2],$t2 1637 vpaddq -0x20($Tbl),@X[3],$t3 1638 vmovdqa $t0,0x00(%rsp) 1639 vpaddq 0x00($Tbl),@X[4],$t0 1640 vmovdqa $t1,0x10(%rsp) 1641 vpaddq 0x20($Tbl),@X[5],$t1 1642 vmovdqa $t2,0x20(%rsp) 1643 vpaddq 0x40($Tbl),@X[6],$t2 1644 vmovdqa $t3,0x30(%rsp) 1645 vpaddq 0x60($Tbl),@X[7],$t3 1646 vmovdqa $t0,0x40(%rsp) 1647 mov $A,$a1 1648 vmovdqa $t1,0x50(%rsp) 1649 mov $B,$a3 1650 vmovdqa $t2,0x60(%rsp) 1651 xor $C,$a3 # magic 1652 vmovdqa $t3,0x70(%rsp) 1653 mov $E,$a0 1654 jmp .Lavx_00_47 1655 1656.align 16 1657.Lavx_00_47: 1658 add \$`16*2*$SZ`,$Tbl 1659___ 1660sub Xupdate_512_AVX () { 1661 ( 1662 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] 1663 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] 1664 '&vpsrlq ($t2,$t0,$sigma0[0])', 1665 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] 1666 '&vpsrlq ($t3,$t0,$sigma0[2])', 1667 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', 1668 '&vpxor ($t0,$t3,$t2)', 1669 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1670 '&vpxor ($t0,$t0,$t1)', 1671 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1672 '&vpxor ($t0,$t0,$t2)', 1673 '&vpsrlq ($t3,@X[7],$sigma1[2]);', 1674 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) 1675 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', 1676 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) 1677 '&vpsrlq ($t1,@X[7],$sigma1[0]);', 1678 '&vpxor ($t3,$t3,$t2)', 1679 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', 1680 '&vpxor ($t3,$t3,$t1)', 1681 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', 1682 '&vpxor ($t3,$t3,$t2)', 1683 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) 1684 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 1685 ); 1686} 1687 1688sub AVX_512_00_47 () { 1689my $j = shift; 1690my $body = shift; 1691my @X = @_; 1692my @insns = (&$body,&$body); # 52 instructions 1693 1694 foreach (Xupdate_512_AVX()) { # 23 instructions 1695 eval; 1696 eval(shift(@insns)); 1697 eval(shift(@insns)); 1698 } 1699 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1700 foreach (@insns) { eval; } # remaining instructions 1701 &vmovdqa (16*$j."(%rsp)",$t2); 1702} 1703 1704 for ($i=0,$j=0; $j<8; $j++) { 1705 &AVX_512_00_47($j,\&body_00_15,@X); 1706 push(@X,shift(@X)); # rotate(@X) 1707 } 1708 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1709 &jne (".Lavx_00_47"); 1710 1711 for ($i=0; $i<16; ) { 1712 foreach(body_00_15()) { eval; } 1713 } 1714} 1715$code.=<<___; 1716 mov $_ctx,$ctx 1717 mov $a1,$A 1718 1719 add $SZ*0($ctx),$A 1720 lea 16*$SZ($inp),$inp 1721 add $SZ*1($ctx),$B 1722 add $SZ*2($ctx),$C 1723 add $SZ*3($ctx),$D 1724 add $SZ*4($ctx),$E 1725 add $SZ*5($ctx),$F 1726 add $SZ*6($ctx),$G 1727 add $SZ*7($ctx),$H 1728 1729 cmp $_end,$inp 1730 1731 mov $A,$SZ*0($ctx) 1732 mov $B,$SZ*1($ctx) 1733 mov $C,$SZ*2($ctx) 1734 mov $D,$SZ*3($ctx) 1735 mov $E,$SZ*4($ctx) 1736 mov $F,$SZ*5($ctx) 1737 mov $G,$SZ*6($ctx) 1738 mov $H,$SZ*7($ctx) 1739 jb .Lloop_avx 1740 1741 mov $_rsp,%rsi 1742 vzeroupper 1743___ 1744$code.=<<___ if ($win64); 
1745 movaps 16*$SZ+32(%rsp),%xmm6 1746 movaps 16*$SZ+48(%rsp),%xmm7 1747 movaps 16*$SZ+64(%rsp),%xmm8 1748 movaps 16*$SZ+80(%rsp),%xmm9 1749___ 1750$code.=<<___ if ($win64 && $SZ>4); 1751 movaps 16*$SZ+96(%rsp),%xmm10 1752 movaps 16*$SZ+112(%rsp),%xmm11 1753___ 1754$code.=<<___; 1755 mov -48(%rsi),%r15 1756 mov -40(%rsi),%r14 1757 mov -32(%rsi),%r13 1758 mov -24(%rsi),%r12 1759 mov -16(%rsi),%rbp 1760 mov -8(%rsi),%rbx 1761 lea (%rsi),%rsp 1762.Lepilogue_avx: 1763 ret 1764.size ${func}_avx,.-${func}_avx 1765___ 1766 1767if ($avx>1) {{ 1768###################################################################### 1769# AVX2+BMI code path 1770# 1771my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 1772my $PUSH8=8*2*$SZ; 1773use integer; 1774 1775sub bodyx_00_15 () { 1776 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 1777 ( 1778 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 1779 1780 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 1781 '&and ($a4,$e)', # f&e 1782 '&rorx ($a0,$e,$Sigma1[2])', 1783 '&rorx ($a2,$e,$Sigma1[1])', 1784 1785 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 1786 '&lea ($h,"($h,$a4)")', 1787 '&andn ($a4,$e,$g)', # ~e&g 1788 '&xor ($a0,$a2)', 1789 1790 '&rorx ($a1,$e,$Sigma1[0])', 1791 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 1792 '&xor ($a0,$a1)', # Sigma1(e) 1793 '&mov ($a2,$a)', 1794 1795 '&rorx ($a4,$a,$Sigma0[2])', 1796 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 1797 '&xor ($a2,$b)', # a^b, b^c in next round 1798 '&rorx ($a1,$a,$Sigma0[1])', 1799 1800 '&rorx ($a0,$a,$Sigma0[0])', 1801 '&lea ($d,"($d,$h)")', # d+=h 1802 '&and ($a3,$a2)', # (b^c)&(a^b) 1803 '&xor ($a1,$a4)', 1804 1805 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 1806 '&xor ($a1,$a0)', # Sigma0(a) 1807 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) 1808 '&mov ($a4,$e)', # copy of f in future 1809 1810 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 1811 ); 1812 # and at the finish one has to $a+=$a1 1813} 1814 1815$code.=<<___; 1816.type ${func}_avx2,\@function,3 1817.align 64 1818${func}_avx2: 1819.Lavx2_shortcut: 1820 mov %rsp,%rax # copy %rsp 1821 push %rbx 1822 push %rbp 1823 push %r12 1824 push %r13 1825 push %r14 1826 push %r15 1827 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp 1828 shl \$4,%rdx # num*16 1829 and \$-256*$SZ,%rsp # align stack frame 1830 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1831 add \$`2*$SZ*($rounds-8)`,%rsp 1832 mov $ctx,$_ctx # save ctx, 1st arg 1833 mov $inp,$_inp # save inp, 2nd arh 1834 mov %rdx,$_end # save end pointer, "3rd" arg 1835 mov %rax,$_rsp # save copy of %rsp 1836___ 1837$code.=<<___ if ($win64); 1838 movaps %xmm6,16*$SZ+32(%rsp) 1839 movaps %xmm7,16*$SZ+48(%rsp) 1840 movaps %xmm8,16*$SZ+64(%rsp) 1841 movaps %xmm9,16*$SZ+80(%rsp) 1842___ 1843$code.=<<___ if ($win64 && $SZ>4); 1844 movaps %xmm10,16*$SZ+96(%rsp) 1845 movaps %xmm11,16*$SZ+112(%rsp) 1846___ 1847$code.=<<___; 1848.Lprologue_avx2: 1849 1850 vzeroupper 1851 sub \$-16*$SZ,$inp # inp++, size optimization 1852 mov $SZ*0($ctx),$A 1853 mov $inp,%r12 # borrow $T1 1854 mov $SZ*1($ctx),$B 1855 cmp %rdx,$inp # $_end 1856 mov $SZ*2($ctx),$C 1857 cmove %rsp,%r12 # next block or random data 1858 mov $SZ*3($ctx),$D 1859 mov $SZ*4($ctx),$E 1860 mov $SZ*5($ctx),$F 1861 mov $SZ*6($ctx),$G 1862 mov $SZ*7($ctx),$H 1863___ 1864 if ($SZ==4) { # SHA256 1865 my @X = map("%ymm$_",(0..3)); 1866 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); 1867 1868$code.=<<___; 1869 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1870 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1871 jmp 
.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	vmovdqu	-16*$SZ+48($inp),%xmm3
	#mov	$inp,$_inp	# offload $inp
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]

	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	mov	$B,$a3
	vmovdqa	$t2,0x00(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x20(%rsp)
	mov	$F,$a4
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
} else {	# SHA512
    my @X = map("%ymm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp	# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]

	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x40(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x60(%rsp)
	mov	$F,$a4
	add	\$16*2*$SZ,$Tbl
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___

sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    if ($_ !~ /\;$/) {
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
	    }
	}
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
}
$code.=<<___;
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3			# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
___
	for ($i=0; $i<8; ) {
	    my $base="+16($Tbl)";
	    foreach(bodyx_00_15()) { eval; }
	}
$code.=<<___;
	lea	-$PUSH8($Tbl),$Tbl
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	add	$SZ*6($ctx),$G
	mov	$inp,%r12
	add	$SZ*7($ctx),$H
	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl

.Ldone_avx2:
	lea	($Tbl),%rsp
	mov	$_rsp,%rsi
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue_avx2:
	ret
.size	${func}_avx2,.-${func}_avx2
___
}}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
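#
# The Win64 unwind support below comes in two halves: se_handler [plus
# shaext_handler for the SHA Extensions path] performs the actual frame
# recovery, while the .pdata/.xdata tables emitted further down bind each
# function's code range to its handler. For se_handler, HandlerData[0] and
# HandlerData[1] are the RVAs of the matching .Lprologue*/.Lepilogue*
# labels, which the handler compares against context->Rip to tell whether
# the stack frame and non-volatile registers were already set up.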
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___

$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

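# A worked example of the encoder above, assuming the usual AT&T operand
# order "src,dst": sha256op38("sha256msg1","%xmm1,%xmm2") selects opcode
# 0xcc and forms the ModR/M byte as 0xc0|1|(2<<3)=0xd1, so it returns
# ".byte\t15,56,204,209", the same bytes an SHA-aware assembler would emit
# for "sha256msg1 %xmm1,%xmm2". Instructions it does not recognize are
# passed through unchanged.
#
# The loop below post-processes $code before printing: every `...`
# expression is evaluated to a constant [e.g. 2*$SZ*$rounds is 512 with the
# SHA-256 parameters, $SZ==4 and $rounds==64, and 1280 with the SHA-512
# ones, $SZ==8 and $rounds==80], and sha256* mnemonics are routed through
# sha256op38 so the output assembles even if the assembler doesn't know
# the SHA Extensions.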
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;