#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just a straightforward implementation... I really
# wonder why gcc [even when armed with inline assembler] fails to
# generate code as fast. The only cool thing about this module is that
# the very same instruction sequence is used for both SHA-256 and
# SHA-512. In the former case the instructions operate on 32-bit
# operands, in the latter on 64-bit ones. All I had to do was get one
# flavor right; the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you an
# asymptotic performance of 64*1000/1005=63.7MBps times the CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which keeps
# X[16] in the register bank[!], sustains close to 4 instructions per
# CPU clock cycle and runs in 1003 cycles, then 1275 is a very good
# result for the 3-way issue Opteron pipeline with X[16] kept in
# memory. So *if* there is a way to improve it, *then* the only way
# would be to try to offload the X[16] updates to the SSE unit, but
# that would require "deeper" loop unrolling, which in turn would
# naturally cause a size blow-up, not to mention increased complexity!
# And once again, only *if* it is actually possible to noticeably
# improve overall ILP, instruction-level parallelism, on the given CPU
# implementation in the first place.
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are apparently far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates are apparently not atomic instructions, but are
# implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, an alternative
# Maj, resulted in a >=5% improvement on most CPUs, +20% for SHA256 and
# unfortunately -2% for SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious
# exception is VIA Nano, but it has a SHA512 instruction that is faster
# and should be used instead.] For reference, the corresponding
# estimated upper limit for SSSE3 SHA256 is 28%. The fact that higher
# coefficients are observed on VIA Nano and Bulldozer has more to do
# with the specifics of their architecture [which is a topic for a
# separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant ones. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm as operands. The side
# effect is a larger stack frame, 448 additional bytes in SHA256 and
# 1152 in SHA512, and a 1.2KB code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#               SHA256  SSSE3       AVX/XOP(*)      SHA512  AVX/XOP(*)
#
# AMD K8        14.9    -           -               9.57    -
# P4            17.3    -           -               30.8    -
# Core 2        15.6    13.8(+13%)  -               9.97    -
# Westmere      14.8    12.3(+19%)  -               9.58    -
# Sandy Bridge  17.4    14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge    12.6    10.5(+20%)  10.3(+22%)      8.17    7.22(+13%)
# Haswell       12.2    9.28(+31%)  7.80(+56%)      7.66    5.40(+42%)
# Skylake       11.4    9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer     21.1    13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen         11.0    9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
# VIA Nano      23.0    16.5(+39%)  -               14.7    -
# Atom          23.0    18.9(+22%)  -               14.7    -
# Silvermont    27.4    20.6(+33%)  -               17.5    -
# Knights L     27.4    21.0(+30%)  19.6(+40%)      17.5    12.8(+37%)
# Goldmont      18.9    14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)   whichever is best applicable, including SHAEXT;
# (**)  the switch from ror to shrd accounts for a fair share of the
#       improvement;
# (***) execution time is fully determined by the remaining
#       integer-only part, body_00_15; reducing the number of SIMD
#       instructions below a certain limit makes no difference/sense;
#       to conserve space the SHA256 XOP code path is therefore
#       omitted;
#
# Modified from upstream OpenSSL to remove the XOP code.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
# did not tie them together until after $shaext was added.
$avx = 1;

# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
$shaext=0;      ### set to zero if compiling for 1.0.1
$avx=1          if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

if ($output =~ /512/) {
        $func="sha512_block_data_order";
        $TABLE="K512";
        $SZ=8;
        @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
                                        "%r8", "%r9", "%r10","%r11");
        ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
        @Sigma0=(28,34,39);
        @Sigma1=(14,18,41);
        @sigma0=(1,  8, 7);
        @sigma1=(19,61, 6);
        $rounds=80;
} else {
        $func="sha256_block_data_order";
        $TABLE="K256";
        $SZ=4;
        @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
                                        "%r8d","%r9d","%r10d","%r11d");
        ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
        @Sigma0=( 2,13,22);
        @Sigma1=( 6,11,25);
        @sigma0=( 7,18, 3);
        @sigma1=(17,19,10);
        $rounds=64;
}

$ctx="%rdi";    # 1st arg, zapped by $a3
$inp="%rsi";    # 2nd arg
$Tbl="%rbp";

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";


sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  my $STRIDE=$SZ;
     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
        ror     \$`$Sigma1[2]-$Sigma1[1]`,$a0
        mov     $f,$a2

        xor     $e,$a0
        ror     \$`$Sigma0[2]-$Sigma0[1]`,$a1
        xor     $g,$a2                  # f^g

        mov     $T1,`$SZ*($i&0xf)`(%rsp)
        xor     $a,$a1
        and     $e,$a2                  # (f^g)&e

        ror     \$`$Sigma1[1]-$Sigma1[0]`,$a0
        add     $h,$T1                  # T1+=h
        xor     $g,$a2                  # Ch(e,f,g)=((f^g)&e)^g

        ror     \$`$Sigma0[1]-$Sigma0[0]`,$a1
        xor     $e,$a0
        add     $a2,$T1                 # T1+=Ch(e,f,g)

        mov     $a,$a2
        add     ($Tbl),$T1              # T1+=K[round]
        xor     $a,$a1

        xor     $b,$a2                  # a^b, b^c in next round
        ror     \$$Sigma1[0],$a0        # Sigma1(e)
        mov     $b,$h

        and     $a2,$a3
        ror     \$$Sigma0[0],$a1        # Sigma0(a)
        add     $a0,$T1                 # T1+=Sigma1(e)

        xor     $a3,$h                  # h=Maj(a,b,c)=Ch(a^b,c,b)
        add     $T1,$d                  # d+=T1
        add     $T1,$h                  # h+=T1

        lea     $STRIDE($Tbl),$Tbl      # round++
___
$code.=<<___ if ($i<15);
        add     $a1,$h                  # h+=Sigma0(a)
___
        ($a2,$a3) = ($a3,$a2);
}

sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
        mov     `$SZ*(($i+1)&0xf)`(%rsp),$a0
        mov     `$SZ*(($i+14)&0xf)`(%rsp),$a2

        mov     $a0,$T1
        ror     \$`$sigma0[1]-$sigma0[0]`,$a0
        add     $a1,$a                  # modulo-scheduled h+=Sigma0(a)
        mov     $a2,$a1
        ror     \$`$sigma1[1]-$sigma1[0]`,$a2

        xor     $T1,$a0
        shr     \$$sigma0[2],$T1
        ror     \$$sigma0[0],$a0
        xor     $a1,$a2
        shr     \$$sigma1[2],$a1

        ror     \$$sigma1[0],$a2
        xor     $a0,$T1                 # sigma0(X[(i+1)&0xf])
        xor     $a1,$a2                 # sigma1(X[(i+14)&0xf])
        add     `$SZ*(($i+9)&0xf)`(%rsp),$T1

        add     `$SZ*($i&0xf)`(%rsp),$T1
        mov     $e,$a0
        add     $a2,$T1
        mov     $a,$a1
___
        &ROUND_00_15(@_);
}

$code=<<___;
.text

.extern OPENSSL_ia32cap_P
.globl  $func
.type   $func,\@function,3
.align  16
$func:
.cfi_startproc
___
$code.=<<___ if ($SZ==4 || $avx);
        leaq    OPENSSL_ia32cap_P(%rip),%r11
        mov     0(%r11),%r9d
        mov     4(%r11),%r10d
        mov     8(%r11),%r11d
___
$code.=<<___ if ($SZ==4 && $shaext);
        test    \$`1<<29`,%r11d         # check for SHA
        jnz     _shaext_shortcut
___
        # XOP codepath removed.
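# The scalar rounds above use the "alternative Maj" mentioned in the May 2012
# note: Maj(a,b,c) = Ch(a^b,c,b) = b ^ ((a^b) & (b^c)), and because the a^b
# computed in one round is exactly the b^c of the next, that value is carried
# between rounds in $a2/$a3 and costs only one and plus one xor per round.
# For readers cross-checking the assembly, here is a minimal reference sketch
# of one plain SHA-256 round in Perl. It is never called, does not affect the
# emitted code, and the name sha2_round_ref and its arguments are
# illustrative only, not part of the upstream script.
sub sha2_round_ref {
        my ($w,$k,$a,$b,$c,$d,$e,$f,$g,$h) = @_;        # message word, round constant, state a..h
        my $rotr = sub { my ($x,$n)=@_; (($x>>$n) | ($x<<(32-$n))) & 0xffffffff; };
        my $Sigma1 = $rotr->($e,6) ^ $rotr->($e,11) ^ $rotr->($e,25);
        my $Ch     = ($e & $f) ^ (~$e & $g & 0xffffffff);
        my $T1     = ($h + $Sigma1 + $Ch + $k + $w) & 0xffffffff;
        my $Sigma0 = $rotr->($a,2) ^ $rotr->($a,13) ^ $rotr->($a,22);
        my $Maj    = ($a & $b) ^ ($a & $c) ^ ($b & $c); # == ((a^b)&(b^c))^b
        my $T2     = ($Sigma0 + $Maj) & 0xffffffff;
        return (($T1+$T2)&0xffffffff, $a, $b, $c, ($d+$T1)&0xffffffff, $e, $f, $g);
}
# The SHA-512 flavour follows the same recurrence with 64-bit words and the
# rotation counts from @Sigma0/@Sigma1/@sigma0/@sigma1 selected above.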
281$code.=<<___ if ($avx>1); 282 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 283 cmp \$`1<<8|1<<5|1<<3`,%r11d 284 je .Lavx2_shortcut 285___ 286$code.=<<___ if ($avx); 287 and \$`1<<30`,%r9d # mask "Intel CPU" bit 288 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits 289 or %r9d,%r10d 290 cmp \$`1<<28|1<<9|1<<30`,%r10d 291 je .Lavx_shortcut 292___ 293$code.=<<___ if ($SZ==4); 294 test \$`1<<9`,%r10d 295 jnz .Lssse3_shortcut 296___ 297$code.=<<___; 298 mov %rsp,%rax # copy %rsp 299.cfi_def_cfa_register %rax 300 push %rbx 301.cfi_push %rbx 302 push %rbp 303.cfi_push %rbp 304 push %r12 305.cfi_push %r12 306 push %r13 307.cfi_push %r13 308 push %r14 309.cfi_push %r14 310 push %r15 311.cfi_push %r15 312 shl \$4,%rdx # num*16 313 sub \$$framesz,%rsp 314 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 315 and \$-64,%rsp # align stack frame 316 mov $ctx,$_ctx # save ctx, 1st arg 317 mov $inp,$_inp # save inp, 2nd arh 318 mov %rdx,$_end # save end pointer, "3rd" arg 319 mov %rax,$_rsp # save copy of %rsp 320.cfi_cfa_expression $_rsp,deref,+8 321.Lprologue: 322 323 mov $SZ*0($ctx),$A 324 mov $SZ*1($ctx),$B 325 mov $SZ*2($ctx),$C 326 mov $SZ*3($ctx),$D 327 mov $SZ*4($ctx),$E 328 mov $SZ*5($ctx),$F 329 mov $SZ*6($ctx),$G 330 mov $SZ*7($ctx),$H 331 jmp .Lloop 332 333.align 16 334.Lloop: 335 mov $B,$a3 336 lea $TABLE(%rip),$Tbl 337 xor $C,$a3 # magic 338___ 339 for($i=0;$i<16;$i++) { 340 $code.=" mov $SZ*$i($inp),$T1\n"; 341 $code.=" mov @ROT[4],$a0\n"; 342 $code.=" mov @ROT[0],$a1\n"; 343 $code.=" bswap $T1\n"; 344 &ROUND_00_15($i,@ROT); 345 unshift(@ROT,pop(@ROT)); 346 } 347$code.=<<___; 348 jmp .Lrounds_16_xx 349.align 16 350.Lrounds_16_xx: 351___ 352 for(;$i<32;$i++) { 353 &ROUND_16_XX($i,@ROT); 354 unshift(@ROT,pop(@ROT)); 355 } 356 357$code.=<<___; 358 cmpb \$0,`$SZ-1`($Tbl) 359 jnz .Lrounds_16_xx 360 361 mov $_ctx,$ctx 362 add $a1,$A # modulo-scheduled h+=Sigma0(a) 363 lea 16*$SZ($inp),$inp 364 365 add $SZ*0($ctx),$A 366 add $SZ*1($ctx),$B 367 add $SZ*2($ctx),$C 368 add $SZ*3($ctx),$D 369 add $SZ*4($ctx),$E 370 add $SZ*5($ctx),$F 371 add $SZ*6($ctx),$G 372 add $SZ*7($ctx),$H 373 374 cmp $_end,$inp 375 376 mov $A,$SZ*0($ctx) 377 mov $B,$SZ*1($ctx) 378 mov $C,$SZ*2($ctx) 379 mov $D,$SZ*3($ctx) 380 mov $E,$SZ*4($ctx) 381 mov $F,$SZ*5($ctx) 382 mov $G,$SZ*6($ctx) 383 mov $H,$SZ*7($ctx) 384 jb .Lloop 385 386 mov $_rsp,%rsi 387.cfi_def_cfa %rsi,8 388 mov -48(%rsi),%r15 389.cfi_restore %r15 390 mov -40(%rsi),%r14 391.cfi_restore %r14 392 mov -32(%rsi),%r13 393.cfi_restore %r13 394 mov -24(%rsi),%r12 395.cfi_restore %r12 396 mov -16(%rsi),%rbp 397.cfi_restore %rbp 398 mov -8(%rsi),%rbx 399.cfi_restore %rbx 400 lea (%rsi),%rsp 401.cfi_def_cfa_register %rsp 402.Lepilogue: 403 ret 404.cfi_endproc 405.size $func,.-$func 406___ 407 408if ($SZ==4) { 409$code.=<<___; 410.align 64 411.type $TABLE,\@object 412$TABLE: 413 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 414 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 415 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 416 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 417 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 418 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 419 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 420 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 421 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 422 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 423 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 424 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 425 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 
426 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 427 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 428 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 429 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 430 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 431 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 432 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 433 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 434 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 435 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 436 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 437 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 438 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 439 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 440 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 441 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 442 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 443 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 444 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 445 446 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 447 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 448 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 449 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 450 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 451 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 452 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 453___ 454} else { 455$code.=<<___; 456.align 64 457.type $TABLE,\@object 458$TABLE: 459 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 460 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 461 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 462 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 463 .quad 0x3956c25bf348b538,0x59f111f1b605d019 464 .quad 0x3956c25bf348b538,0x59f111f1b605d019 465 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 466 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 467 .quad 0xd807aa98a3030242,0x12835b0145706fbe 468 .quad 0xd807aa98a3030242,0x12835b0145706fbe 469 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 470 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 471 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 472 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 473 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 474 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 475 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 476 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 477 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 478 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 479 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 480 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 481 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 482 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 483 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 484 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 485 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 486 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 487 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 488 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 489 .quad 0x06ca6351e003826f,0x142929670a0e6e70 490 .quad 0x06ca6351e003826f,0x142929670a0e6e70 491 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 492 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 493 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 494 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 495 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 496 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 497 .quad 0x81c2c92e47edaee6,0x92722c851482353b 498 .quad 0x81c2c92e47edaee6,0x92722c851482353b 499 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 500 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 501 .quad 
0xc24b8b70d0f89791,0xc76c51a30654be30 502 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 503 .quad 0xd192e819d6ef5218,0xd69906245565a910 504 .quad 0xd192e819d6ef5218,0xd69906245565a910 505 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 506 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 507 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 508 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 509 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 510 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 511 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 512 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 513 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 514 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 515 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 516 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 517 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 518 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 519 .quad 0x90befffa23631e28,0xa4506cebde82bde9 520 .quad 0x90befffa23631e28,0xa4506cebde82bde9 521 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 522 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 523 .quad 0xca273eceea26619c,0xd186b8c721c0c207 524 .quad 0xca273eceea26619c,0xd186b8c721c0c207 525 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 526 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 527 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 528 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 529 .quad 0x113f9804bef90dae,0x1b710b35131c471b 530 .quad 0x113f9804bef90dae,0x1b710b35131c471b 531 .quad 0x28db77f523047d84,0x32caab7b40c72493 532 .quad 0x28db77f523047d84,0x32caab7b40c72493 533 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 534 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 535 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 536 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 537 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 538 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 539 540 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 541 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 542 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 543___ 544} 545 546###################################################################### 547# SIMD code paths 548# 549if ($SZ==4 && $shaext) {{{ 550###################################################################### 551# Intel SHA Extensions implementation of SHA256 update function. 
552# 553my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); 554 555my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); 556my @MSG=map("%xmm$_",(3..6)); 557 558$code.=<<___; 559.type sha256_block_data_order_shaext,\@function,3 560.align 64 561sha256_block_data_order_shaext: 562_shaext_shortcut: 563___ 564$code.=<<___ if ($win64); 565 lea `-8-5*16`(%rsp),%rsp 566 movaps %xmm6,-8-5*16(%rax) 567 movaps %xmm7,-8-4*16(%rax) 568 movaps %xmm8,-8-3*16(%rax) 569 movaps %xmm9,-8-2*16(%rax) 570 movaps %xmm10,-8-1*16(%rax) 571.Lprologue_shaext: 572___ 573$code.=<<___; 574 lea K256+0x80(%rip),$Tbl 575 movdqu ($ctx),$ABEF # DCBA 576 movdqu 16($ctx),$CDGH # HGFE 577 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 578 579 pshufd \$0x1b,$ABEF,$Wi # ABCD 580 pshufd \$0xb1,$ABEF,$ABEF # CDAB 581 pshufd \$0x1b,$CDGH,$CDGH # EFGH 582 movdqa $TMP,$BSWAP # offload 583 palignr \$8,$CDGH,$ABEF # ABEF 584 punpcklqdq $Wi,$CDGH # CDGH 585 jmp .Loop_shaext 586 587.align 16 588.Loop_shaext: 589 movdqu ($inp),@MSG[0] 590 movdqu 0x10($inp),@MSG[1] 591 movdqu 0x20($inp),@MSG[2] 592 pshufb $TMP,@MSG[0] 593 movdqu 0x30($inp),@MSG[3] 594 595 movdqa 0*32-0x80($Tbl),$Wi 596 paddd @MSG[0],$Wi 597 pshufb $TMP,@MSG[1] 598 movdqa $CDGH,$CDGH_SAVE # offload 599 sha256rnds2 $ABEF,$CDGH # 0-3 600 pshufd \$0x0e,$Wi,$Wi 601 nop 602 movdqa $ABEF,$ABEF_SAVE # offload 603 sha256rnds2 $CDGH,$ABEF 604 605 movdqa 1*32-0x80($Tbl),$Wi 606 paddd @MSG[1],$Wi 607 pshufb $TMP,@MSG[2] 608 sha256rnds2 $ABEF,$CDGH # 4-7 609 pshufd \$0x0e,$Wi,$Wi 610 lea 0x40($inp),$inp 611 sha256msg1 @MSG[1],@MSG[0] 612 sha256rnds2 $CDGH,$ABEF 613 614 movdqa 2*32-0x80($Tbl),$Wi 615 paddd @MSG[2],$Wi 616 pshufb $TMP,@MSG[3] 617 sha256rnds2 $ABEF,$CDGH # 8-11 618 pshufd \$0x0e,$Wi,$Wi 619 movdqa @MSG[3],$TMP 620 palignr \$4,@MSG[2],$TMP 621 nop 622 paddd $TMP,@MSG[0] 623 sha256msg1 @MSG[2],@MSG[1] 624 sha256rnds2 $CDGH,$ABEF 625 626 movdqa 3*32-0x80($Tbl),$Wi 627 paddd @MSG[3],$Wi 628 sha256msg2 @MSG[3],@MSG[0] 629 sha256rnds2 $ABEF,$CDGH # 12-15 630 pshufd \$0x0e,$Wi,$Wi 631 movdqa @MSG[0],$TMP 632 palignr \$4,@MSG[3],$TMP 633 nop 634 paddd $TMP,@MSG[1] 635 sha256msg1 @MSG[3],@MSG[2] 636 sha256rnds2 $CDGH,$ABEF 637___ 638for($i=4;$i<16-3;$i++) { 639$code.=<<___; 640 movdqa $i*32-0x80($Tbl),$Wi 641 paddd @MSG[0],$Wi 642 sha256msg2 @MSG[0],@MSG[1] 643 sha256rnds2 $ABEF,$CDGH # 16-19... 
644 pshufd \$0x0e,$Wi,$Wi 645 movdqa @MSG[1],$TMP 646 palignr \$4,@MSG[0],$TMP 647 nop 648 paddd $TMP,@MSG[2] 649 sha256msg1 @MSG[0],@MSG[3] 650 sha256rnds2 $CDGH,$ABEF 651___ 652 push(@MSG,shift(@MSG)); 653} 654$code.=<<___; 655 movdqa 13*32-0x80($Tbl),$Wi 656 paddd @MSG[0],$Wi 657 sha256msg2 @MSG[0],@MSG[1] 658 sha256rnds2 $ABEF,$CDGH # 52-55 659 pshufd \$0x0e,$Wi,$Wi 660 movdqa @MSG[1],$TMP 661 palignr \$4,@MSG[0],$TMP 662 sha256rnds2 $CDGH,$ABEF 663 paddd $TMP,@MSG[2] 664 665 movdqa 14*32-0x80($Tbl),$Wi 666 paddd @MSG[1],$Wi 667 sha256rnds2 $ABEF,$CDGH # 56-59 668 pshufd \$0x0e,$Wi,$Wi 669 sha256msg2 @MSG[1],@MSG[2] 670 movdqa $BSWAP,$TMP 671 sha256rnds2 $CDGH,$ABEF 672 673 movdqa 15*32-0x80($Tbl),$Wi 674 paddd @MSG[2],$Wi 675 nop 676 sha256rnds2 $ABEF,$CDGH # 60-63 677 pshufd \$0x0e,$Wi,$Wi 678 dec $num 679 nop 680 sha256rnds2 $CDGH,$ABEF 681 682 paddd $CDGH_SAVE,$CDGH 683 paddd $ABEF_SAVE,$ABEF 684 jnz .Loop_shaext 685 686 pshufd \$0xb1,$CDGH,$CDGH # DCHG 687 pshufd \$0x1b,$ABEF,$TMP # FEBA 688 pshufd \$0xb1,$ABEF,$ABEF # BAFE 689 punpckhqdq $CDGH,$ABEF # DCBA 690 palignr \$8,$TMP,$CDGH # HGFE 691 692 movdqu $ABEF,($ctx) 693 movdqu $CDGH,16($ctx) 694___ 695$code.=<<___ if ($win64); 696 movaps -8-5*16(%rax),%xmm6 697 movaps -8-4*16(%rax),%xmm7 698 movaps -8-3*16(%rax),%xmm8 699 movaps -8-2*16(%rax),%xmm9 700 movaps -8-1*16(%rax),%xmm10 701 mov %rax,%rsp 702.Lepilogue_shaext: 703___ 704$code.=<<___; 705 ret 706.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext 707___ 708}}} 709{{{ 710 711my $a4=$T1; 712my ($a,$b,$c,$d,$e,$f,$g,$h); 713 714sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 715{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 716 my $arg = pop; 717 $arg = "\$$arg" if ($arg*1 eq $arg); 718 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 719} 720 721sub body_00_15 () { 722 ( 723 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 724 725 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 726 '&mov ($a,$a1)', 727 '&mov ($a4,$f)', 728 729 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 730 '&xor ($a0,$e)', 731 '&xor ($a4,$g)', # f^g 732 733 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 734 '&xor ($a1,$a)', 735 '&and ($a4,$e)', # (f^g)&e 736 737 '&xor ($a0,$e)', 738 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 739 '&mov ($a2,$a)', 740 741 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 742 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 743 '&xor ($a2,$b)', # a^b, b^c in next round 744 745 '&add ($h,$a4)', # h+=Ch(e,f,g) 746 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 747 '&and ($a3,$a2)', # (b^c)&(a^b) 748 749 '&xor ($a1,$a)', 750 '&add ($h,$a0)', # h+=Sigma1(e) 751 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 752 753 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 754 '&add ($d,$h)', # d+=h 755 '&add ($h,$a3)', # h+=Maj(a,b,c) 756 757 '&mov ($a0,$d)', 758 '&add ($a1,$h);'. 
# h+=Sigma0(a) 759 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 760 ); 761} 762 763###################################################################### 764# SSSE3 code path 765# 766if ($SZ==4) { # SHA256 only 767my @X = map("%xmm$_",(0..3)); 768my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 769 770$code.=<<___; 771.type ${func}_ssse3,\@function,3 772.align 64 773${func}_ssse3: 774.cfi_startproc 775.Lssse3_shortcut: 776 mov %rsp,%rax # copy %rsp 777.cfi_def_cfa_register %rax 778 push %rbx 779.cfi_push %rbx 780 push %rbp 781.cfi_push %rbp 782 push %r12 783.cfi_push %r12 784 push %r13 785.cfi_push %r13 786 push %r14 787.cfi_push %r14 788 push %r15 789.cfi_push %r15 790 shl \$4,%rdx # num*16 791 sub \$`$framesz+$win64*16*4`,%rsp 792 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 793 and \$-64,%rsp # align stack frame 794 mov $ctx,$_ctx # save ctx, 1st arg 795 mov $inp,$_inp # save inp, 2nd arh 796 mov %rdx,$_end # save end pointer, "3rd" arg 797 mov %rax,$_rsp # save copy of %rsp 798.cfi_cfa_expression $_rsp,deref,+8 799___ 800$code.=<<___ if ($win64); 801 movaps %xmm6,16*$SZ+32(%rsp) 802 movaps %xmm7,16*$SZ+48(%rsp) 803 movaps %xmm8,16*$SZ+64(%rsp) 804 movaps %xmm9,16*$SZ+80(%rsp) 805___ 806$code.=<<___; 807.Lprologue_ssse3: 808 809 mov $SZ*0($ctx),$A 810 mov $SZ*1($ctx),$B 811 mov $SZ*2($ctx),$C 812 mov $SZ*3($ctx),$D 813 mov $SZ*4($ctx),$E 814 mov $SZ*5($ctx),$F 815 mov $SZ*6($ctx),$G 816 mov $SZ*7($ctx),$H 817___ 818 819$code.=<<___; 820 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 821 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 822 jmp .Lloop_ssse3 823.align 16 824.Lloop_ssse3: 825 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 826 movdqu 0x00($inp),@X[0] 827 movdqu 0x10($inp),@X[1] 828 movdqu 0x20($inp),@X[2] 829 pshufb $t3,@X[0] 830 movdqu 0x30($inp),@X[3] 831 lea $TABLE(%rip),$Tbl 832 pshufb $t3,@X[1] 833 movdqa 0x00($Tbl),$t0 834 movdqa 0x20($Tbl),$t1 835 pshufb $t3,@X[2] 836 paddd @X[0],$t0 837 movdqa 0x40($Tbl),$t2 838 pshufb $t3,@X[3] 839 movdqa 0x60($Tbl),$t3 840 paddd @X[1],$t1 841 paddd @X[2],$t2 842 paddd @X[3],$t3 843 movdqa $t0,0x00(%rsp) 844 mov $A,$a1 845 movdqa $t1,0x10(%rsp) 846 mov $B,$a3 847 movdqa $t2,0x20(%rsp) 848 xor $C,$a3 # magic 849 movdqa $t3,0x30(%rsp) 850 mov $E,$a0 851 jmp .Lssse3_00_47 852 853.align 16 854.Lssse3_00_47: 855 sub \$`-16*2*$SZ`,$Tbl # size optimization 856___ 857sub Xupdate_256_SSSE3 () { 858 ( 859 '&movdqa ($t0,@X[1]);', 860 '&movdqa ($t3,@X[3])', 861 '&palignr ($t0,@X[0],$SZ)', # X[1..4] 862 '&palignr ($t3,@X[2],$SZ);', # X[9..12] 863 '&movdqa ($t1,$t0)', 864 '&movdqa ($t2,$t0);', 865 '&psrld ($t0,$sigma0[2])', 866 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] 867 '&psrld ($t2,$sigma0[0])', 868 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] 869 '&pslld ($t1,8*$SZ-$sigma0[1]);'. 870 '&pxor ($t0,$t2)', 871 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. 872 '&pxor ($t0,$t1)', 873 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
874 '&pxor ($t0,$t2);', 875 '&movdqa ($t2,$t3)', 876 '&pxor ($t0,$t1);', # sigma0(X[1..4]) 877 '&psrld ($t3,$sigma1[2])', 878 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 879 '&psrlq ($t2,$sigma1[0])', 880 '&pxor ($t3,$t2);', 881 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 882 '&pxor ($t3,$t2)', 883 '&pshufb ($t3,$t4)', # sigma1(X[14..15]) 884 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 885 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] 886 '&movdqa ($t2,$t3);', 887 '&psrld ($t3,$sigma1[2])', 888 '&psrlq ($t2,$sigma1[0])', 889 '&pxor ($t3,$t2);', 890 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 891 '&pxor ($t3,$t2);', 892 '&movdqa ($t2,16*2*$j."($Tbl)")', 893 '&pshufb ($t3,$t5)', 894 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) 895 ); 896} 897 898sub SSSE3_256_00_47 () { 899my $j = shift; 900my $body = shift; 901my @X = @_; 902my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 903 904 if (0) { 905 foreach (Xupdate_256_SSSE3()) { # 36 instructions 906 eval; 907 eval(shift(@insns)); 908 eval(shift(@insns)); 909 eval(shift(@insns)); 910 } 911 } else { # squeeze extra 4% on Westmere and 19% on Atom 912 eval(shift(@insns)); #@ 913 &movdqa ($t0,@X[1]); 914 eval(shift(@insns)); 915 eval(shift(@insns)); 916 &movdqa ($t3,@X[3]); 917 eval(shift(@insns)); #@ 918 eval(shift(@insns)); 919 eval(shift(@insns)); 920 eval(shift(@insns)); #@ 921 eval(shift(@insns)); 922 &palignr ($t0,@X[0],$SZ); # X[1..4] 923 eval(shift(@insns)); 924 eval(shift(@insns)); 925 &palignr ($t3,@X[2],$SZ); # X[9..12] 926 eval(shift(@insns)); 927 eval(shift(@insns)); 928 eval(shift(@insns)); 929 eval(shift(@insns)); #@ 930 &movdqa ($t1,$t0); 931 eval(shift(@insns)); 932 eval(shift(@insns)); 933 &movdqa ($t2,$t0); 934 eval(shift(@insns)); #@ 935 eval(shift(@insns)); 936 &psrld ($t0,$sigma0[2]); 937 eval(shift(@insns)); 938 eval(shift(@insns)); 939 eval(shift(@insns)); 940 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 941 eval(shift(@insns)); #@ 942 eval(shift(@insns)); 943 &psrld ($t2,$sigma0[0]); 944 eval(shift(@insns)); 945 eval(shift(@insns)); 946 &pshufd ($t3,@X[3],0b11111010); # X[4..15] 947 eval(shift(@insns)); 948 eval(shift(@insns)); #@ 949 &pslld ($t1,8*$SZ-$sigma0[1]); 950 eval(shift(@insns)); 951 eval(shift(@insns)); 952 &pxor ($t0,$t2); 953 eval(shift(@insns)); #@ 954 eval(shift(@insns)); 955 eval(shift(@insns)); 956 eval(shift(@insns)); #@ 957 &psrld ($t2,$sigma0[1]-$sigma0[0]); 958 eval(shift(@insns)); 959 &pxor ($t0,$t1); 960 eval(shift(@insns)); 961 eval(shift(@insns)); 962 &pslld ($t1,$sigma0[1]-$sigma0[0]); 963 eval(shift(@insns)); 964 eval(shift(@insns)); 965 &pxor ($t0,$t2); 966 eval(shift(@insns)); 967 eval(shift(@insns)); #@ 968 &movdqa ($t2,$t3); 969 eval(shift(@insns)); 970 eval(shift(@insns)); 971 &pxor ($t0,$t1); # sigma0(X[1..4]) 972 eval(shift(@insns)); #@ 973 eval(shift(@insns)); 974 eval(shift(@insns)); 975 &psrld ($t3,$sigma1[2]); 976 eval(shift(@insns)); 977 eval(shift(@insns)); 978 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 979 eval(shift(@insns)); #@ 980 eval(shift(@insns)); 981 &psrlq ($t2,$sigma1[0]); 982 eval(shift(@insns)); 983 eval(shift(@insns)); 984 eval(shift(@insns)); 985 &pxor ($t3,$t2); 986 eval(shift(@insns)); #@ 987 eval(shift(@insns)); 988 eval(shift(@insns)); 989 eval(shift(@insns)); #@ 990 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 991 eval(shift(@insns)); 992 eval(shift(@insns)); 993 &pxor ($t3,$t2); 994 eval(shift(@insns)); #@ 995 eval(shift(@insns)); 996 eval(shift(@insns)); 997 #&pshufb ($t3,$t4); # sigma1(X[14..15]) 998 &pshufd ($t3,$t3,0b10000000); 999 
eval(shift(@insns)); 1000 eval(shift(@insns)); 1001 eval(shift(@insns)); 1002 &psrldq ($t3,8); 1003 eval(shift(@insns)); 1004 eval(shift(@insns)); #@ 1005 eval(shift(@insns)); 1006 eval(shift(@insns)); 1007 eval(shift(@insns)); #@ 1008 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1009 eval(shift(@insns)); 1010 eval(shift(@insns)); 1011 eval(shift(@insns)); 1012 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 1013 eval(shift(@insns)); 1014 eval(shift(@insns)); #@ 1015 eval(shift(@insns)); 1016 &movdqa ($t2,$t3); 1017 eval(shift(@insns)); 1018 eval(shift(@insns)); 1019 &psrld ($t3,$sigma1[2]); 1020 eval(shift(@insns)); 1021 eval(shift(@insns)); #@ 1022 &psrlq ($t2,$sigma1[0]); 1023 eval(shift(@insns)); 1024 eval(shift(@insns)); 1025 &pxor ($t3,$t2); 1026 eval(shift(@insns)); #@ 1027 eval(shift(@insns)); 1028 eval(shift(@insns)); 1029 eval(shift(@insns)); #@ 1030 eval(shift(@insns)); 1031 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 1032 eval(shift(@insns)); 1033 eval(shift(@insns)); 1034 eval(shift(@insns)); 1035 &pxor ($t3,$t2); 1036 eval(shift(@insns)); 1037 eval(shift(@insns)); 1038 eval(shift(@insns)); #@ 1039 #&pshufb ($t3,$t5); 1040 &pshufd ($t3,$t3,0b00001000); 1041 eval(shift(@insns)); 1042 eval(shift(@insns)); 1043 &movdqa ($t2,16*2*$j."($Tbl)"); 1044 eval(shift(@insns)); #@ 1045 eval(shift(@insns)); 1046 &pslldq ($t3,8); 1047 eval(shift(@insns)); 1048 eval(shift(@insns)); 1049 eval(shift(@insns)); 1050 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1051 eval(shift(@insns)); #@ 1052 eval(shift(@insns)); 1053 eval(shift(@insns)); 1054 } 1055 &paddd ($t2,@X[0]); 1056 foreach (@insns) { eval; } # remaining instructions 1057 &movdqa (16*$j."(%rsp)",$t2); 1058} 1059 1060 for ($i=0,$j=0; $j<4; $j++) { 1061 &SSSE3_256_00_47($j,\&body_00_15,@X); 1062 push(@X,shift(@X)); # rotate(@X) 1063 } 1064 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1065 &jne (".Lssse3_00_47"); 1066 1067 for ($i=0; $i<16; ) { 1068 foreach(body_00_15()) { eval; } 1069 } 1070$code.=<<___; 1071 mov $_ctx,$ctx 1072 mov $a1,$A 1073 1074 add $SZ*0($ctx),$A 1075 lea 16*$SZ($inp),$inp 1076 add $SZ*1($ctx),$B 1077 add $SZ*2($ctx),$C 1078 add $SZ*3($ctx),$D 1079 add $SZ*4($ctx),$E 1080 add $SZ*5($ctx),$F 1081 add $SZ*6($ctx),$G 1082 add $SZ*7($ctx),$H 1083 1084 cmp $_end,$inp 1085 1086 mov $A,$SZ*0($ctx) 1087 mov $B,$SZ*1($ctx) 1088 mov $C,$SZ*2($ctx) 1089 mov $D,$SZ*3($ctx) 1090 mov $E,$SZ*4($ctx) 1091 mov $F,$SZ*5($ctx) 1092 mov $G,$SZ*6($ctx) 1093 mov $H,$SZ*7($ctx) 1094 jb .Lloop_ssse3 1095 1096 mov $_rsp,%rsi 1097.cfi_def_cfa %rsi,8 1098___ 1099$code.=<<___ if ($win64); 1100 movaps 16*$SZ+32(%rsp),%xmm6 1101 movaps 16*$SZ+48(%rsp),%xmm7 1102 movaps 16*$SZ+64(%rsp),%xmm8 1103 movaps 16*$SZ+80(%rsp),%xmm9 1104___ 1105$code.=<<___; 1106 mov -48(%rsi),%r15 1107.cfi_restore %r15 1108 mov -40(%rsi),%r14 1109.cfi_restore %r14 1110 mov -32(%rsi),%r13 1111.cfi_restore %r13 1112 mov -24(%rsi),%r12 1113.cfi_restore %r12 1114 mov -16(%rsi),%rbp 1115.cfi_restore %rbp 1116 mov -8(%rsi),%rbx 1117.cfi_restore %rbx 1118 lea (%rsi),%rsp 1119.cfi_def_cfa_register %rsp 1120.Lepilogue_ssse3: 1121 ret 1122.cfi_endproc 1123.size ${func}_ssse3,.-${func}_ssse3 1124___ 1125} 1126 1127if ($avx) {{ 1128###################################################################### 1129# AVX+shrd code path 1130# 1131local *ror = sub { &shrd(@_[0],@_) }; 1132 1133$code.=<<___; 1134.type ${func}_avx,\@function,3 1135.align 64 1136${func}_avx: 1137.cfi_startproc 1138.Lavx_shortcut: 1139 mov %rsp,%rax # copy %rsp 1140.cfi_def_cfa_register %rax 1141 push %rbx 1142.cfi_push %rbx 
1143 push %rbp 1144.cfi_push %rbp 1145 push %r12 1146.cfi_push %r12 1147 push %r13 1148.cfi_push %r13 1149 push %r14 1150.cfi_push %r14 1151 push %r15 1152.cfi_push %r15 1153 shl \$4,%rdx # num*16 1154 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1155 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1156 and \$-64,%rsp # align stack frame 1157 mov $ctx,$_ctx # save ctx, 1st arg 1158 mov $inp,$_inp # save inp, 2nd arh 1159 mov %rdx,$_end # save end pointer, "3rd" arg 1160 mov %rax,$_rsp # save copy of %rsp 1161.cfi_cfa_expression $_rsp,deref,+8 1162___ 1163$code.=<<___ if ($win64); 1164 movaps %xmm6,16*$SZ+32(%rsp) 1165 movaps %xmm7,16*$SZ+48(%rsp) 1166 movaps %xmm8,16*$SZ+64(%rsp) 1167 movaps %xmm9,16*$SZ+80(%rsp) 1168___ 1169$code.=<<___ if ($win64 && $SZ>4); 1170 movaps %xmm10,16*$SZ+96(%rsp) 1171 movaps %xmm11,16*$SZ+112(%rsp) 1172___ 1173$code.=<<___; 1174.Lprologue_avx: 1175 1176 vzeroupper 1177 mov $SZ*0($ctx),$A 1178 mov $SZ*1($ctx),$B 1179 mov $SZ*2($ctx),$C 1180 mov $SZ*3($ctx),$D 1181 mov $SZ*4($ctx),$E 1182 mov $SZ*5($ctx),$F 1183 mov $SZ*6($ctx),$G 1184 mov $SZ*7($ctx),$H 1185___ 1186 if ($SZ==4) { # SHA256 1187 my @X = map("%xmm$_",(0..3)); 1188 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 1189 1190$code.=<<___; 1191 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1192 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1193 jmp .Lloop_avx 1194.align 16 1195.Lloop_avx: 1196 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1197 vmovdqu 0x00($inp),@X[0] 1198 vmovdqu 0x10($inp),@X[1] 1199 vmovdqu 0x20($inp),@X[2] 1200 vmovdqu 0x30($inp),@X[3] 1201 vpshufb $t3,@X[0],@X[0] 1202 lea $TABLE(%rip),$Tbl 1203 vpshufb $t3,@X[1],@X[1] 1204 vpshufb $t3,@X[2],@X[2] 1205 vpaddd 0x00($Tbl),@X[0],$t0 1206 vpshufb $t3,@X[3],@X[3] 1207 vpaddd 0x20($Tbl),@X[1],$t1 1208 vpaddd 0x40($Tbl),@X[2],$t2 1209 vpaddd 0x60($Tbl),@X[3],$t3 1210 vmovdqa $t0,0x00(%rsp) 1211 mov $A,$a1 1212 vmovdqa $t1,0x10(%rsp) 1213 mov $B,$a3 1214 vmovdqa $t2,0x20(%rsp) 1215 xor $C,$a3 # magic 1216 vmovdqa $t3,0x30(%rsp) 1217 mov $E,$a0 1218 jmp .Lavx_00_47 1219 1220.align 16 1221.Lavx_00_47: 1222 sub \$`-16*2*$SZ`,$Tbl # size optimization 1223___ 1224sub Xupdate_256_AVX () { 1225 ( 1226 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 1227 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 1228 '&vpsrld ($t2,$t0,$sigma0[0]);', 1229 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 1230 '&vpsrld ($t3,$t0,$sigma0[2])', 1231 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 1232 '&vpxor ($t0,$t3,$t2)', 1233 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1234 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1235 '&vpxor ($t0,$t0,$t1)', 1236 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1237 '&vpxor ($t0,$t0,$t2)', 1238 '&vpsrld ($t2,$t3,$sigma1[2]);', 1239 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 1240 '&vpsrlq ($t3,$t3,$sigma1[0]);', 1241 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 1242 '&vpxor ($t2,$t2,$t3);', 1243 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1244 '&vpxor ($t2,$t2,$t3)', 1245 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) 1246 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 1247 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1248 '&vpsrld ($t2,$t3,$sigma1[2])', 1249 '&vpsrlq ($t3,$t3,$sigma1[0])', 1250 '&vpxor ($t2,$t2,$t3);', 1251 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1252 '&vpxor ($t2,$t2,$t3)', 1253 '&vpshufb ($t2,$t2,$t5)', 1254 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 1255 ); 1256} 1257 1258sub AVX_256_00_47 () { 1259my $j = shift; 1260my $body = shift; 1261my @X = @_; 1262my @insns = 
(&$body,&$body,&$body,&$body); # 104 instructions 1263 1264 foreach (Xupdate_256_AVX()) { # 29 instructions 1265 eval; 1266 eval(shift(@insns)); 1267 eval(shift(@insns)); 1268 eval(shift(@insns)); 1269 } 1270 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1271 foreach (@insns) { eval; } # remaining instructions 1272 &vmovdqa (16*$j."(%rsp)",$t2); 1273} 1274 1275 for ($i=0,$j=0; $j<4; $j++) { 1276 &AVX_256_00_47($j,\&body_00_15,@X); 1277 push(@X,shift(@X)); # rotate(@X) 1278 } 1279 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1280 &jne (".Lavx_00_47"); 1281 1282 for ($i=0; $i<16; ) { 1283 foreach(body_00_15()) { eval; } 1284 } 1285 1286 } else { # SHA512 1287 my @X = map("%xmm$_",(0..7)); 1288 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1289 1290$code.=<<___; 1291 jmp .Lloop_avx 1292.align 16 1293.Lloop_avx: 1294 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1295 vmovdqu 0x00($inp),@X[0] 1296 lea $TABLE+0x80(%rip),$Tbl # size optimization 1297 vmovdqu 0x10($inp),@X[1] 1298 vmovdqu 0x20($inp),@X[2] 1299 vpshufb $t3,@X[0],@X[0] 1300 vmovdqu 0x30($inp),@X[3] 1301 vpshufb $t3,@X[1],@X[1] 1302 vmovdqu 0x40($inp),@X[4] 1303 vpshufb $t3,@X[2],@X[2] 1304 vmovdqu 0x50($inp),@X[5] 1305 vpshufb $t3,@X[3],@X[3] 1306 vmovdqu 0x60($inp),@X[6] 1307 vpshufb $t3,@X[4],@X[4] 1308 vmovdqu 0x70($inp),@X[7] 1309 vpshufb $t3,@X[5],@X[5] 1310 vpaddq -0x80($Tbl),@X[0],$t0 1311 vpshufb $t3,@X[6],@X[6] 1312 vpaddq -0x60($Tbl),@X[1],$t1 1313 vpshufb $t3,@X[7],@X[7] 1314 vpaddq -0x40($Tbl),@X[2],$t2 1315 vpaddq -0x20($Tbl),@X[3],$t3 1316 vmovdqa $t0,0x00(%rsp) 1317 vpaddq 0x00($Tbl),@X[4],$t0 1318 vmovdqa $t1,0x10(%rsp) 1319 vpaddq 0x20($Tbl),@X[5],$t1 1320 vmovdqa $t2,0x20(%rsp) 1321 vpaddq 0x40($Tbl),@X[6],$t2 1322 vmovdqa $t3,0x30(%rsp) 1323 vpaddq 0x60($Tbl),@X[7],$t3 1324 vmovdqa $t0,0x40(%rsp) 1325 mov $A,$a1 1326 vmovdqa $t1,0x50(%rsp) 1327 mov $B,$a3 1328 vmovdqa $t2,0x60(%rsp) 1329 xor $C,$a3 # magic 1330 vmovdqa $t3,0x70(%rsp) 1331 mov $E,$a0 1332 jmp .Lavx_00_47 1333 1334.align 16 1335.Lavx_00_47: 1336 add \$`16*2*$SZ`,$Tbl 1337___ 1338sub Xupdate_512_AVX () { 1339 ( 1340 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] 1341 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] 1342 '&vpsrlq ($t2,$t0,$sigma0[0])', 1343 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] 1344 '&vpsrlq ($t3,$t0,$sigma0[2])', 1345 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', 1346 '&vpxor ($t0,$t3,$t2)', 1347 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1348 '&vpxor ($t0,$t0,$t1)', 1349 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1350 '&vpxor ($t0,$t0,$t2)', 1351 '&vpsrlq ($t3,@X[7],$sigma1[2]);', 1352 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) 1353 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', 1354 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) 1355 '&vpsrlq ($t1,@X[7],$sigma1[0]);', 1356 '&vpxor ($t3,$t3,$t2)', 1357 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', 1358 '&vpxor ($t3,$t3,$t1)', 1359 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', 1360 '&vpxor ($t3,$t3,$t2)', 1361 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) 1362 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 1363 ); 1364} 1365 1366sub AVX_512_00_47 () { 1367my $j = shift; 1368my $body = shift; 1369my @X = @_; 1370my @insns = (&$body,&$body); # 52 instructions 1371 1372 foreach (Xupdate_512_AVX()) { # 23 instructions 1373 eval; 1374 eval(shift(@insns)); 1375 eval(shift(@insns)); 1376 } 1377 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1378 foreach (@insns) { eval; } # remaining instructions 1379 &vmovdqa (16*$j."(%rsp)",$t2); 1380} 1381 1382 for ($i=0,$j=0; $j<8; $j++) { 1383 
&AVX_512_00_47($j,\&body_00_15,@X); 1384 push(@X,shift(@X)); # rotate(@X) 1385 } 1386 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1387 &jne (".Lavx_00_47"); 1388 1389 for ($i=0; $i<16; ) { 1390 foreach(body_00_15()) { eval; } 1391 } 1392} 1393$code.=<<___; 1394 mov $_ctx,$ctx 1395 mov $a1,$A 1396 1397 add $SZ*0($ctx),$A 1398 lea 16*$SZ($inp),$inp 1399 add $SZ*1($ctx),$B 1400 add $SZ*2($ctx),$C 1401 add $SZ*3($ctx),$D 1402 add $SZ*4($ctx),$E 1403 add $SZ*5($ctx),$F 1404 add $SZ*6($ctx),$G 1405 add $SZ*7($ctx),$H 1406 1407 cmp $_end,$inp 1408 1409 mov $A,$SZ*0($ctx) 1410 mov $B,$SZ*1($ctx) 1411 mov $C,$SZ*2($ctx) 1412 mov $D,$SZ*3($ctx) 1413 mov $E,$SZ*4($ctx) 1414 mov $F,$SZ*5($ctx) 1415 mov $G,$SZ*6($ctx) 1416 mov $H,$SZ*7($ctx) 1417 jb .Lloop_avx 1418 1419 mov $_rsp,%rsi 1420.cfi_def_cfa %rsi,8 1421 vzeroupper 1422___ 1423$code.=<<___ if ($win64); 1424 movaps 16*$SZ+32(%rsp),%xmm6 1425 movaps 16*$SZ+48(%rsp),%xmm7 1426 movaps 16*$SZ+64(%rsp),%xmm8 1427 movaps 16*$SZ+80(%rsp),%xmm9 1428___ 1429$code.=<<___ if ($win64 && $SZ>4); 1430 movaps 16*$SZ+96(%rsp),%xmm10 1431 movaps 16*$SZ+112(%rsp),%xmm11 1432___ 1433$code.=<<___; 1434 mov -48(%rsi),%r15 1435.cfi_restore %r15 1436 mov -40(%rsi),%r14 1437.cfi_restore %r14 1438 mov -32(%rsi),%r13 1439.cfi_restore %r13 1440 mov -24(%rsi),%r12 1441.cfi_restore %r12 1442 mov -16(%rsi),%rbp 1443.cfi_restore %rbp 1444 mov -8(%rsi),%rbx 1445.cfi_restore %rbx 1446 lea (%rsi),%rsp 1447.cfi_def_cfa_register %rsp 1448.Lepilogue_avx: 1449 ret 1450.cfi_endproc 1451.size ${func}_avx,.-${func}_avx 1452___ 1453 1454if ($avx>1) {{ 1455###################################################################### 1456# AVX2+BMI code path 1457# 1458my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 1459my $PUSH8=8*2*$SZ; 1460use integer; 1461 1462sub bodyx_00_15 () { 1463 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 1464 ( 1465 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 1466 1467 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 1468 '&and ($a4,$e)', # f&e 1469 '&rorx ($a0,$e,$Sigma1[2])', 1470 '&rorx ($a2,$e,$Sigma1[1])', 1471 1472 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 1473 '&lea ($h,"($h,$a4)")', 1474 '&andn ($a4,$e,$g)', # ~e&g 1475 '&xor ($a0,$a2)', 1476 1477 '&rorx ($a1,$e,$Sigma1[0])', 1478 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 1479 '&xor ($a0,$a1)', # Sigma1(e) 1480 '&mov ($a2,$a)', 1481 1482 '&rorx ($a4,$a,$Sigma0[2])', 1483 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 1484 '&xor ($a2,$b)', # a^b, b^c in next round 1485 '&rorx ($a1,$a,$Sigma0[1])', 1486 1487 '&rorx ($a0,$a,$Sigma0[0])', 1488 '&lea ($d,"($d,$h)")', # d+=h 1489 '&and ($a3,$a2)', # (b^c)&(a^b) 1490 '&xor ($a1,$a4)', 1491 1492 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 1493 '&xor ($a1,$a0)', # Sigma0(a) 1494 '&lea ($h,"($h,$a3)");'. 
# h+=Maj(a,b,c) 1495 '&mov ($a4,$e)', # copy of f in future 1496 1497 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 1498 ); 1499 # and at the finish one has to $a+=$a1 1500} 1501 1502$code.=<<___; 1503.type ${func}_avx2,\@function,3 1504.align 64 1505${func}_avx2: 1506.cfi_startproc 1507.Lavx2_shortcut: 1508 mov %rsp,%rax # copy %rsp 1509.cfi_def_cfa_register %rax 1510 push %rbx 1511.cfi_push %rbx 1512 push %rbp 1513.cfi_push %rbp 1514 push %r12 1515.cfi_push %r12 1516 push %r13 1517.cfi_push %r13 1518 push %r14 1519.cfi_push %r14 1520 push %r15 1521.cfi_push %r15 1522 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp 1523 shl \$4,%rdx # num*16 1524 and \$-256*$SZ,%rsp # align stack frame 1525 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1526 add \$`2*$SZ*($rounds-8)`,%rsp 1527 mov $ctx,$_ctx # save ctx, 1st arg 1528 mov $inp,$_inp # save inp, 2nd arh 1529 mov %rdx,$_end # save end pointer, "3rd" arg 1530 mov %rax,$_rsp # save copy of %rsp 1531.cfi_cfa_expression $_rsp,deref,+8 1532___ 1533$code.=<<___ if ($win64); 1534 movaps %xmm6,16*$SZ+32(%rsp) 1535 movaps %xmm7,16*$SZ+48(%rsp) 1536 movaps %xmm8,16*$SZ+64(%rsp) 1537 movaps %xmm9,16*$SZ+80(%rsp) 1538___ 1539$code.=<<___ if ($win64 && $SZ>4); 1540 movaps %xmm10,16*$SZ+96(%rsp) 1541 movaps %xmm11,16*$SZ+112(%rsp) 1542___ 1543$code.=<<___; 1544.Lprologue_avx2: 1545 1546 vzeroupper 1547 sub \$-16*$SZ,$inp # inp++, size optimization 1548 mov $SZ*0($ctx),$A 1549 mov $inp,%r12 # borrow $T1 1550 mov $SZ*1($ctx),$B 1551 cmp %rdx,$inp # $_end 1552 mov $SZ*2($ctx),$C 1553 cmove %rsp,%r12 # next block or random data 1554 mov $SZ*3($ctx),$D 1555 mov $SZ*4($ctx),$E 1556 mov $SZ*5($ctx),$F 1557 mov $SZ*6($ctx),$G 1558 mov $SZ*7($ctx),$H 1559___ 1560 if ($SZ==4) { # SHA256 1561 my @X = map("%ymm$_",(0..3)); 1562 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); 1563 1564$code.=<<___; 1565 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1566 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1567 jmp .Loop_avx2 1568.align 16 1569.Loop_avx2: 1570 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1571 vmovdqu -16*$SZ+0($inp),%xmm0 1572 vmovdqu -16*$SZ+16($inp),%xmm1 1573 vmovdqu -16*$SZ+32($inp),%xmm2 1574 vmovdqu -16*$SZ+48($inp),%xmm3 1575 #mov $inp,$_inp # offload $inp 1576 vinserti128 \$1,(%r12),@X[0],@X[0] 1577 vinserti128 \$1,16(%r12),@X[1],@X[1] 1578 vpshufb $t3,@X[0],@X[0] 1579 vinserti128 \$1,32(%r12),@X[2],@X[2] 1580 vpshufb $t3,@X[1],@X[1] 1581 vinserti128 \$1,48(%r12),@X[3],@X[3] 1582 1583 lea $TABLE(%rip),$Tbl 1584 vpshufb $t3,@X[2],@X[2] 1585 vpaddd 0x00($Tbl),@X[0],$t0 1586 vpshufb $t3,@X[3],@X[3] 1587 vpaddd 0x20($Tbl),@X[1],$t1 1588 vpaddd 0x40($Tbl),@X[2],$t2 1589 vpaddd 0x60($Tbl),@X[3],$t3 1590 vmovdqa $t0,0x00(%rsp) 1591 xor $a1,$a1 1592 vmovdqa $t1,0x20(%rsp) 1593 lea -$PUSH8(%rsp),%rsp 1594 mov $B,$a3 1595 vmovdqa $t2,0x00(%rsp) 1596 xor $C,$a3 # magic 1597 vmovdqa $t3,0x20(%rsp) 1598 mov $F,$a4 1599 sub \$-16*2*$SZ,$Tbl # size optimization 1600 jmp .Lavx2_00_47 1601 1602.align 16 1603.Lavx2_00_47: 1604___ 1605 1606sub AVX2_256_00_47 () { 1607my $j = shift; 1608my $body = shift; 1609my @X = @_; 1610my @insns = (&$body,&$body,&$body,&$body); # 96 instructions 1611my $base = "+2*$PUSH8(%rsp)"; 1612 1613 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); 1614 foreach (Xupdate_256_AVX()) { # 29 instructions 1615 eval; 1616 eval(shift(@insns)); 1617 eval(shift(@insns)); 1618 eval(shift(@insns)); 1619 } 1620 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1621 foreach (@insns) { eval; } # remaining instructions 1622 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 
1623} 1624 1625 for ($i=0,$j=0; $j<4; $j++) { 1626 &AVX2_256_00_47($j,\&bodyx_00_15,@X); 1627 push(@X,shift(@X)); # rotate(@X) 1628 } 1629 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1630 &cmpb (($SZ-1)."($Tbl)",0); 1631 &jne (".Lavx2_00_47"); 1632 1633 for ($i=0; $i<16; ) { 1634 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1635 foreach(bodyx_00_15()) { eval; } 1636 } 1637 } else { # SHA512 1638 my @X = map("%ymm$_",(0..7)); 1639 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); 1640 1641$code.=<<___; 1642 jmp .Loop_avx2 1643.align 16 1644.Loop_avx2: 1645 vmovdqu -16*$SZ($inp),%xmm0 1646 vmovdqu -16*$SZ+16($inp),%xmm1 1647 vmovdqu -16*$SZ+32($inp),%xmm2 1648 lea $TABLE+0x80(%rip),$Tbl # size optimization 1649 vmovdqu -16*$SZ+48($inp),%xmm3 1650 vmovdqu -16*$SZ+64($inp),%xmm4 1651 vmovdqu -16*$SZ+80($inp),%xmm5 1652 vmovdqu -16*$SZ+96($inp),%xmm6 1653 vmovdqu -16*$SZ+112($inp),%xmm7 1654 #mov $inp,$_inp # offload $inp 1655 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 1656 vinserti128 \$1,(%r12),@X[0],@X[0] 1657 vinserti128 \$1,16(%r12),@X[1],@X[1] 1658 vpshufb $t2,@X[0],@X[0] 1659 vinserti128 \$1,32(%r12),@X[2],@X[2] 1660 vpshufb $t2,@X[1],@X[1] 1661 vinserti128 \$1,48(%r12),@X[3],@X[3] 1662 vpshufb $t2,@X[2],@X[2] 1663 vinserti128 \$1,64(%r12),@X[4],@X[4] 1664 vpshufb $t2,@X[3],@X[3] 1665 vinserti128 \$1,80(%r12),@X[5],@X[5] 1666 vpshufb $t2,@X[4],@X[4] 1667 vinserti128 \$1,96(%r12),@X[6],@X[6] 1668 vpshufb $t2,@X[5],@X[5] 1669 vinserti128 \$1,112(%r12),@X[7],@X[7] 1670 1671 vpaddq -0x80($Tbl),@X[0],$t0 1672 vpshufb $t2,@X[6],@X[6] 1673 vpaddq -0x60($Tbl),@X[1],$t1 1674 vpshufb $t2,@X[7],@X[7] 1675 vpaddq -0x40($Tbl),@X[2],$t2 1676 vpaddq -0x20($Tbl),@X[3],$t3 1677 vmovdqa $t0,0x00(%rsp) 1678 vpaddq 0x00($Tbl),@X[4],$t0 1679 vmovdqa $t1,0x20(%rsp) 1680 vpaddq 0x20($Tbl),@X[5],$t1 1681 vmovdqa $t2,0x40(%rsp) 1682 vpaddq 0x40($Tbl),@X[6],$t2 1683 vmovdqa $t3,0x60(%rsp) 1684 lea -$PUSH8(%rsp),%rsp 1685 vpaddq 0x60($Tbl),@X[7],$t3 1686 vmovdqa $t0,0x00(%rsp) 1687 xor $a1,$a1 1688 vmovdqa $t1,0x20(%rsp) 1689 mov $B,$a3 1690 vmovdqa $t2,0x40(%rsp) 1691 xor $C,$a3 # magic 1692 vmovdqa $t3,0x60(%rsp) 1693 mov $F,$a4 1694 add \$16*2*$SZ,$Tbl 1695 jmp .Lavx2_00_47 1696 1697.align 16 1698.Lavx2_00_47: 1699___ 1700 1701sub AVX2_512_00_47 () { 1702my $j = shift; 1703my $body = shift; 1704my @X = @_; 1705my @insns = (&$body,&$body); # 48 instructions 1706my $base = "+2*$PUSH8(%rsp)"; 1707 1708 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0); 1709 foreach (Xupdate_512_AVX()) { # 23 instructions 1710 eval; 1711 if ($_ !~ /\;$/) { 1712 eval(shift(@insns)); 1713 eval(shift(@insns)); 1714 eval(shift(@insns)); 1715 } 1716 } 1717 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1718 foreach (@insns) { eval; } # remaining instructions 1719 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 1720} 1721 1722 for ($i=0,$j=0; $j<8; $j++) { 1723 &AVX2_512_00_47($j,\&bodyx_00_15,@X); 1724 push(@X,shift(@X)); # rotate(@X) 1725 } 1726 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1727 &cmpb (($SZ-1-0x80)."($Tbl)",0); 1728 &jne (".Lavx2_00_47"); 1729 1730 for ($i=0; $i<16; ) { 1731 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1732 foreach(bodyx_00_15()) { eval; } 1733 } 1734} 1735$code.=<<___; 1736 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 1737 add $a1,$A 1738 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 1739 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl 1740 1741 add $SZ*0($ctx),$A 1742 add $SZ*1($ctx),$B 1743 add $SZ*2($ctx),$C 1744 add $SZ*3($ctx),$D 1745 add $SZ*4($ctx),$E 1746 add $SZ*5($ctx),$F 1747 add $SZ*6($ctx),$G 1748 add $SZ*7($ctx),$H 1749 1750 mov $A,$SZ*0($ctx) 1751 mov $B,$SZ*1($ctx) 1752 
mov $C,$SZ*2($ctx) 1753 mov $D,$SZ*3($ctx) 1754 mov $E,$SZ*4($ctx) 1755 mov $F,$SZ*5($ctx) 1756 mov $G,$SZ*6($ctx) 1757 mov $H,$SZ*7($ctx) 1758 1759 cmp `$PUSH8+2*8`($Tbl),$inp # $_end 1760 je .Ldone_avx2 1761 1762 xor $a1,$a1 1763 mov $B,$a3 1764 xor $C,$a3 # magic 1765 mov $F,$a4 1766 jmp .Lower_avx2 1767.align 16 1768.Lower_avx2: 1769___ 1770 for ($i=0; $i<8; ) { 1771 my $base="+16($Tbl)"; 1772 foreach(bodyx_00_15()) { eval; } 1773 } 1774$code.=<<___; 1775 lea -$PUSH8($Tbl),$Tbl 1776 cmp %rsp,$Tbl 1777 jae .Lower_avx2 1778 1779 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 1780 add $a1,$A 1781 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 1782 lea `2*$SZ*($rounds-8)`(%rsp),%rsp 1783 1784 add $SZ*0($ctx),$A 1785 add $SZ*1($ctx),$B 1786 add $SZ*2($ctx),$C 1787 add $SZ*3($ctx),$D 1788 add $SZ*4($ctx),$E 1789 add $SZ*5($ctx),$F 1790 lea `2*16*$SZ`($inp),$inp # inp+=2 1791 add $SZ*6($ctx),$G 1792 mov $inp,%r12 1793 add $SZ*7($ctx),$H 1794 cmp $_end,$inp 1795 1796 mov $A,$SZ*0($ctx) 1797 cmove %rsp,%r12 # next block or stale data 1798 mov $B,$SZ*1($ctx) 1799 mov $C,$SZ*2($ctx) 1800 mov $D,$SZ*3($ctx) 1801 mov $E,$SZ*4($ctx) 1802 mov $F,$SZ*5($ctx) 1803 mov $G,$SZ*6($ctx) 1804 mov $H,$SZ*7($ctx) 1805 1806 jbe .Loop_avx2 1807 lea (%rsp),$Tbl 1808 1809.Ldone_avx2: 1810 lea ($Tbl),%rsp 1811 mov $_rsp,%rsi 1812.cfi_def_cfa %rsi,8 1813 vzeroupper 1814___ 1815$code.=<<___ if ($win64); 1816 movaps 16*$SZ+32(%rsp),%xmm6 1817 movaps 16*$SZ+48(%rsp),%xmm7 1818 movaps 16*$SZ+64(%rsp),%xmm8 1819 movaps 16*$SZ+80(%rsp),%xmm9 1820___ 1821$code.=<<___ if ($win64 && $SZ>4); 1822 movaps 16*$SZ+96(%rsp),%xmm10 1823 movaps 16*$SZ+112(%rsp),%xmm11 1824___ 1825$code.=<<___; 1826 mov -48(%rsi),%r15 1827.cfi_restore %r15 1828 mov -40(%rsi),%r14 1829.cfi_restore %r14 1830 mov -32(%rsi),%r13 1831.cfi_restore %r13 1832 mov -24(%rsi),%r12 1833.cfi_restore %r12 1834 mov -16(%rsi),%rbp 1835.cfi_restore %rbp 1836 mov -8(%rsi),%rbx 1837.cfi_restore %rbx 1838 lea (%rsi),%rsp 1839.cfi_def_cfa_register %rsp 1840.Lepilogue_avx2: 1841 ret 1842.cfi_endproc 1843.size ${func}_avx2,.-${func}_avx2 1844___ 1845}} 1846}}}}} 1847 1848# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1849# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1850if ($win64) { 1851$rec="%rcx"; 1852$frame="%rdx"; 1853$context="%r8"; 1854$disp="%r9"; 1855 1856$code.=<<___; 1857.extern __imp_RtlVirtualUnwind 1858.type se_handler,\@abi-omnipotent 1859.align 16 1860se_handler: 1861 push %rsi 1862 push %rdi 1863 push %rbx 1864 push %rbp 1865 push %r12 1866 push %r13 1867 push %r14 1868 push %r15 1869 pushfq 1870 sub \$64,%rsp 1871 1872 mov 120($context),%rax # pull context->Rax 1873 mov 248($context),%rbx # pull context->Rip 1874 1875 mov 8($disp),%rsi # disp->ImageBase 1876 mov 56($disp),%r11 # disp->HanderlData 1877 1878 mov 0(%r11),%r10d # HandlerData[0] 1879 lea (%rsi,%r10),%r10 # prologue label 1880 cmp %r10,%rbx # context->Rip<prologue label 1881 jb .Lin_prologue 1882 1883 mov 152($context),%rax # pull context->Rsp 1884 1885 mov 4(%r11),%r10d # HandlerData[1] 1886 lea (%rsi,%r10),%r10 # epilogue label 1887 cmp %r10,%rbx # context->Rip>=epilogue label 1888 jae .Lin_prologue 1889___ 1890$code.=<<___ if ($avx>1); 1891 lea .Lavx2_shortcut(%rip),%r10 1892 cmp %r10,%rbx # context->Rip<avx2_shortcut 1893 jb .Lnot_in_avx2 1894 1895 and \$-256*$SZ,%rax 1896 add \$`2*$SZ*($rounds-8)`,%rax 1897.Lnot_in_avx2: 1898___ 1899$code.=<<___; 1900 mov %rax,%rsi # put aside Rsp 1901 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp 1902 1903 mov -8(%rax),%rbx 1904 mov -16(%rax),%rbp 1905 
mov -24(%rax),%r12 1906 mov -32(%rax),%r13 1907 mov -40(%rax),%r14 1908 mov -48(%rax),%r15 1909 mov %rbx,144($context) # restore context->Rbx 1910 mov %rbp,160($context) # restore context->Rbp 1911 mov %r12,216($context) # restore context->R12 1912 mov %r13,224($context) # restore context->R13 1913 mov %r14,232($context) # restore context->R14 1914 mov %r15,240($context) # restore context->R15 1915 1916 lea .Lepilogue(%rip),%r10 1917 cmp %r10,%rbx 1918 jb .Lin_prologue # non-AVX code 1919 1920 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area 1921 lea 512($context),%rdi # &context.Xmm6 1922 mov \$`$SZ==4?8:12`,%ecx 1923 .long 0xa548f3fc # cld; rep movsq 1924 1925.Lin_prologue: 1926 mov 8(%rax),%rdi 1927 mov 16(%rax),%rsi 1928 mov %rax,152($context) # restore context->Rsp 1929 mov %rsi,168($context) # restore context->Rsi 1930 mov %rdi,176($context) # restore context->Rdi 1931 1932 mov 40($disp),%rdi # disp->ContextRecord 1933 mov $context,%rsi # context 1934 mov \$154,%ecx # sizeof(CONTEXT) 1935 .long 0xa548f3fc # cld; rep movsq 1936 1937 mov $disp,%rsi 1938 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1939 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1940 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1941 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1942 mov 40(%rsi),%r10 # disp->ContextRecord 1943 lea 56(%rsi),%r11 # &disp->HandlerData 1944 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1945 mov %r10,32(%rsp) # arg5 1946 mov %r11,40(%rsp) # arg6 1947 mov %r12,48(%rsp) # arg7 1948 mov %rcx,56(%rsp) # arg8, (NULL) 1949 call *__imp_RtlVirtualUnwind(%rip) 1950 1951 mov \$1,%eax # ExceptionContinueSearch 1952 add \$64,%rsp 1953 popfq 1954 pop %r15 1955 pop %r14 1956 pop %r13 1957 pop %r12 1958 pop %rbp 1959 pop %rbx 1960 pop %rdi 1961 pop %rsi 1962 ret 1963.size se_handler,.-se_handler 1964___ 1965 1966$code.=<<___ if ($SZ==4 && $shaext); 1967.type shaext_handler,\@abi-omnipotent 1968.align 16 1969shaext_handler: 1970 push %rsi 1971 push %rdi 1972 push %rbx 1973 push %rbp 1974 push %r12 1975 push %r13 1976 push %r14 1977 push %r15 1978 pushfq 1979 sub \$64,%rsp 1980 1981 mov 120($context),%rax # pull context->Rax 1982 mov 248($context),%rbx # pull context->Rip 1983 1984 lea .Lprologue_shaext(%rip),%r10 1985 cmp %r10,%rbx # context->Rip<.Lprologue 1986 jb .Lin_prologue 1987 1988 lea .Lepilogue_shaext(%rip),%r10 1989 cmp %r10,%rbx # context->Rip>=.Lepilogue 1990 jae .Lin_prologue 1991 1992 lea -8-5*16(%rax),%rsi 1993 lea 512($context),%rdi # &context.Xmm6 1994 mov \$10,%ecx 1995 .long 0xa548f3fc # cld; rep movsq 1996 1997 jmp .Lin_prologue 1998.size shaext_handler,.-shaext_handler 1999___ 2000 2001$code.=<<___; 2002.section .pdata 2003.align 4 2004 .rva .LSEH_begin_$func 2005 .rva .LSEH_end_$func 2006 .rva .LSEH_info_$func 2007___ 2008$code.=<<___ if ($SZ==4 && $shaext); 2009 .rva .LSEH_begin_${func}_shaext 2010 .rva .LSEH_end_${func}_shaext 2011 .rva .LSEH_info_${func}_shaext 2012___ 2013$code.=<<___ if ($SZ==4); 2014 .rva .LSEH_begin_${func}_ssse3 2015 .rva .LSEH_end_${func}_ssse3 2016 .rva .LSEH_info_${func}_ssse3 2017___ 2018$code.=<<___ if ($avx); 2019 .rva .LSEH_begin_${func}_avx 2020 .rva .LSEH_end_${func}_avx 2021 .rva .LSEH_info_${func}_avx 2022___ 2023$code.=<<___ if ($avx>1); 2024 .rva .LSEH_begin_${func}_avx2 2025 .rva .LSEH_end_${func}_avx2 2026 .rva .LSEH_info_${func}_avx2 2027___ 2028$code.=<<___; 2029.section .xdata 2030.align 8 2031.LSEH_info_$func: 2032 .byte 9,0,0,0 2033 .rva se_handler 2034 .rva .Lprologue,.Lepilogue # HandlerData[] 2035___ 2036$code.=<<___ if ($SZ==4 && $shaext); 
.LSEH_info_${func}_shaext:
        .byte   9,0,0,0
        .rva    shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_ssse3,.Lepilogue_ssse3       # HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
                "sha256rnds2" => 0xcb,
                "sha256msg1"  => 0xcc,
                "sha256msg2"  => 0xcd   );

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x38);
        push @opcode,$opcodelet{$instr};
        push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
        return ".byte\t".join(',',@opcode);
    } else {
        return $instr."\t".@_[0];
    }
}

foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

        s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

        print $_,"\n";
}
close STDOUT or die "error closing STDOUT";
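
# Usage sketch (assumed invocation, inferred from the $flavour/$output
# handling at the top of this file): the xlate flavour and the output path
# are the two positional arguments, and the output file name selects the
# hash, e.g.
#
#       perl sha512-x86_64.pl elf sha512-x86_64.S
#       perl sha512-x86_64.pl nasm sha256-x86_64.asm
#
# Any output name matching /512/ generates sha512_block_data_order; anything
# else generates sha256_block_data_order. The loop above folds `...`
# expressions at generation time and rewrites sha256* mnemonics through
# sha256op38 into raw .byte sequences for assemblers without SHA extension
# support. For illustration only, the constant folding applied to a sample
# emitted line; this helper is never called and is not part of the upstream
# script:
sub fold_backticks_demo {
        my $line = "\tror\t\$`25-11`,%r13d";    # as emitted by ROUND_00_15 for SHA-256
        $line =~ s/\`([^\`]*)\`/eval $1/geo;    # becomes "\tror\t\$14,%r13d"
        return $line;
}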