#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just a straight implementation... I really wonder why
# gcc [even armed with inline assembler] fails to generate code this
# fast. The only thing that is cool about this module is that the very
# same instruction sequence is used for both SHA-256 and SHA-512. In
# the former case the instructions operate on 32-bit operands, in the
# latter on 64-bit ones. All I had to do was get one flavor right; the
# other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you an
# asymptotic performance of 64*1000/1005=63.7MBps times the CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], sustains close to 4 instructions per
# clock cycle and runs in 1003 cycles, then 1275 is a very good result
# for the 3-way issue Opteron pipeline with X[16] maintained in memory.
# So *if* there is a way to improve it, the only one would be to
# offload the X[16] updates to the SSE unit, but that would require a
# "deeper" loop unroll, which in turn would naturally cause a size
# blow-up, not to mention increased complexity - and only *if* it is
# actually possible to noticeably improve overall ILP, instruction-
# level parallelism, on the given CPU implementation.
#
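# To spell out the arithmetic above (illustrative only): one 64-byte SHA-256
# block in ~1005 cycles is 64/1005 bytes per cycle, i.e. 64*10^9/1005 bytes
# per second for every GHz of clock, hence the quoted ~63.7MBps per GHz;
# likewise a 128-byte SHA-512 block in ~1275 cycles gives 128*10^9/1275, or
# ~100MBps per GHz.
#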
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are apparently far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates apparently are not atomic instructions, but are
# implemented in microcode.
#
# May 2012.
#
# An optimization including one of Pavel Semjanov's ideas, the
# alternative Maj, resulted in a >=5% improvement on most CPUs, +20%
# for SHA256 and unfortunately -2% for SHA512 on P4 [which nobody
# should care about that much].
#
# June 2012.
#
# Add SIMD code paths; see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious
# exception is VIA Nano, but it has a SHA512 instruction that is faster
# and should be used instead.] For reference, the corresponding
# estimated upper limit for the SSSE3 SHA256 improvement is 28%. The
# fact that higher coefficients are observed on VIA Nano and Bulldozer
# has more to do with the specifics of their architecture [which is a
# topic for a separate discussion].
#
# November 2012.
#
# Add an AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm operands. The side
# effect is an increased stack frame, 448 additional bytes for SHA256
# and 1152 for SHA512, plus a 1.2KB code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#               SHA256  SSSE3       AVX/XOP(*)      SHA512  AVX/XOP(*)
#
# AMD K8        14.9    -           -               9.57    -
# P4            17.3    -           -               30.8    -
# Core 2        15.6    13.8(+13%)  -               9.97    -
# Westmere      14.8    12.3(+19%)  -               9.58    -
# Sandy Bridge  17.4    14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge    12.6    10.5(+20%)  10.3(+22%)      8.17    7.22(+13%)
# Haswell       12.2    9.28(+31%)  7.80(+56%)      7.66    5.40(+42%)
# Skylake       11.4    9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer     21.1    13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen         11.0    9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
# VIA Nano      23.0    16.5(+39%)  -               14.7    -
# Atom          23.0    18.9(+22%)  -               14.7    -
# Silvermont    27.4    20.6(+33%)  -               17.5    -
# Knights L     27.4    21.0(+30%)  19.6(+40%)      17.5    12.8(+37%)
# Goldmont      18.9    14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)   whichever is best applicable, including SHAEXT;
# (**)  the switch from ror to shrd accounts for a fair share of the
#       improvement;
# (***) execution time is fully determined by the remaining integer-only
#       part, body_00_15; reducing the amount of SIMD instructions below
#       a certain limit makes no difference; to conserve space the
#       SHA256 XOP code path is therefore omitted;
#
# Modified from upstream OpenSSL to remove the XOP code.

my ($flavour, $output) = @ARGV;

if ($output =~ /sha512-x86_64/) {
	$func="sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} else {
	$func="sha256_block_data_order";
	$TABLE="K256";
	$SZ=4;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
}
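
# A minimal reference sketch of what the rotation constants above encode for
# the 32-bit SHA-256 flavour (FIPS 180-4): Sigma0/Sigma1 are XORs of three
# rotations, sigma0/sigma1 are two rotations and a shift. The names below
# (rotr32, ref_*) are illustrative only and are never called by the
# generator. ROUND_00_15 below produces the same values with staged partial
# rotates (by Sigma[2]-Sigma[1], then Sigma[1]-Sigma[0], then Sigma[0]),
# which works because rotation distributes over XOR.
sub rotr32     { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff; }	# assumes a 32-bit $x
sub ref_Sigma0 { my $x=shift; rotr32($x, 2)^rotr32($x,13)^rotr32($x,22); }
sub ref_Sigma1 { my $x=shift; rotr32($x, 6)^rotr32($x,11)^rotr32($x,25); }
sub ref_sigma0 { my $x=shift; rotr32($x, 7)^rotr32($x,18)^($x>> 3); }
sub ref_sigma1 { my $x=shift; rotr32($x,17)^rotr32($x,19)^($x>>10); }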

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# This file also has an AVX2 implementation, controlled by setting $avx to 2.
# For now, we intentionally disable it. While it gives a 13-16% perf boost,
# the CFI annotations are wrong. It allocates stack in a loop and should be
# rewritten to avoid this.
$avx = 1;
$shaext = 1;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";


sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  my $STRIDE=$SZ;
     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	mov	$f,$a2

	xor	$e,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$g,$a2			# f^g

	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	xor	$a,$a1
	and	$e,$a2			# (f^g)&e

	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	add	$h,$T1			# T1+=h
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g

	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	xor	$e,$a0
	add	$a2,$T1			# T1+=Ch(e,f,g)

	mov	$a,$a2
	add	($Tbl),$T1		# T1+=K[round]
	xor	$a,$a1

	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	mov	$b,$h

	and	$a2,$a3
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)

	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	add	$T1,$d			# d+=T1
	add	$T1,$h			# h+=T1

	lea	$STRIDE($Tbl),$Tbl	# round++
___
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
___
	($a2,$a3) = ($a3,$a2);
}

sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2

	mov	$a0,$T1
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	mov	$a2,$a1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2

	xor	$T1,$a0
	shr	\$$sigma0[2],$T1
	ror	\$$sigma0[0],$a0
	xor	$a1,$a2
	shr	\$$sigma1[2],$a1

	ror	\$$sigma1[0],$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1

	add	`$SZ*($i&0xf)`(%rsp),$T1
	mov	$e,$a0
	add	$a2,$T1
	mov	$a,$a1
___
	&ROUND_00_15(@_);
}

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@function,3
.align	16
$func:
.cfi_startproc
	_CET_ENDBR
___
$code.=<<___ if ($SZ==4 || $avx);
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	mov	0(%r11),%r9d
	mov	4(%r11),%r10d
	mov	8(%r11),%r11d
___
$code.=<<___ if ($SZ==4 && $shaext);
	test	\$`1<<29`,%r11d		# check for SHA
	jnz	.Lshaext_shortcut
___
	# XOP codepath removed.
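
# A note on the "alternative Maj" mentioned in the May 2012 comment above:
# ROUND_00_15 computes Maj(a,b,c) as Ch(a^b,c,b) = b^((a^b)&(b^c)). The
# identity holds because Maj returns b whenever a==b and c otherwise. This is
# why each round computes a^b into $a2 and the ($a2,$a3) swap at the end of
# the sub hands it to the next round as that round's b^c, folding Maj into a
# couple of instructions. Minimal reference check (the sub name is
# illustrative only and it is never called by the generator):
sub ref_maj_identity_ok {
	for my $a (0,1) { for my $b (0,1) { for my $c (0,1) {
		my $maj = ($a&$b)^($a&$c)^($b&$c);	# textbook Maj
		my $alt = $b^(($a^$b)&($b^$c));		# form used in ROUND_00_15
		return 0 if ($maj != $alt);
	}}}
	return 1;
}
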
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	.Lavx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
	or	%r9d,%r10d
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
	je	.Lavx_shortcut
___
$code.=<<___ if ($SZ==4);
	test	\$`1<<9`,%r10d
	jnz	.Lssse3_shortcut
___
$code.=<<___;
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$$framesz,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
.Lprologue:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
	jmp	.Lloop

.align	16
.Lloop:
	mov	$B,$a3
	lea	$TABLE(%rip),$Tbl
	xor	$C,$a3			# magic
___
	for($i=0;$i<16;$i++) {
		$code.="	mov	$SZ*$i($inp),$T1\n";
		$code.="	mov	@ROT[4],$a0\n";
		$code.="	mov	@ROT[0],$a1\n";
		$code.="	bswap	$T1\n";
		&ROUND_00_15($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}
$code.=<<___;
	jmp	.Lrounds_16_xx
.align	16
.Lrounds_16_xx:
___
	for(;$i<32;$i++) {
		&ROUND_16_XX($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}

$code.=<<___;
	cmpb	\$0,`$SZ-1`($Tbl)
	jnz	.Lrounds_16_xx

	mov	$_ctx,$ctx
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	$func,.-$func
___

if ($SZ==4) {
$code.=<<___;
.section .rodata
.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long
0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 423 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 424 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 425 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 426 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 427 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 428 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 429 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 430 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 431 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 432 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 433 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 434 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 435 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 436 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 437 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 438 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 439 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 440 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 441 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 442 443 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 444 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 445 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 446 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 447 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 448 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 449 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 450.text 451___ 452} else { 453$code.=<<___; 454.section .rodata 455.align 64 456.type $TABLE,\@object 457$TABLE: 458 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 459 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 460 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 461 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 462 .quad 0x3956c25bf348b538,0x59f111f1b605d019 463 .quad 0x3956c25bf348b538,0x59f111f1b605d019 464 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 465 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 466 .quad 0xd807aa98a3030242,0x12835b0145706fbe 467 .quad 0xd807aa98a3030242,0x12835b0145706fbe 468 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 469 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 470 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 471 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 472 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 473 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 474 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 475 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 476 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 477 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 478 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 479 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 480 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 481 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 482 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 483 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 484 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 485 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 486 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 487 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 488 .quad 0x06ca6351e003826f,0x142929670a0e6e70 489 .quad 0x06ca6351e003826f,0x142929670a0e6e70 490 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 491 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 492 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 493 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 494 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 495 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 496 .quad 0x81c2c92e47edaee6,0x92722c851482353b 497 .quad 0x81c2c92e47edaee6,0x92722c851482353b 498 .quad 
0xa2bfe8a14cf10364,0xa81a664bbc423001 499 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 500 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 501 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 502 .quad 0xd192e819d6ef5218,0xd69906245565a910 503 .quad 0xd192e819d6ef5218,0xd69906245565a910 504 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 505 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 506 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 507 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 508 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 509 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 510 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 511 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 512 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 513 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 514 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 515 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 516 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 517 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 518 .quad 0x90befffa23631e28,0xa4506cebde82bde9 519 .quad 0x90befffa23631e28,0xa4506cebde82bde9 520 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 521 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 522 .quad 0xca273eceea26619c,0xd186b8c721c0c207 523 .quad 0xca273eceea26619c,0xd186b8c721c0c207 524 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 525 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 526 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 527 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 528 .quad 0x113f9804bef90dae,0x1b710b35131c471b 529 .quad 0x113f9804bef90dae,0x1b710b35131c471b 530 .quad 0x28db77f523047d84,0x32caab7b40c72493 531 .quad 0x28db77f523047d84,0x32caab7b40c72493 532 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 533 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 534 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 535 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 536 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 537 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 538 539 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 540 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 541 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 542.text 543___ 544} 545 546###################################################################### 547# SIMD code paths 548# 549if ($SZ==4 && $shaext) {{{ 550###################################################################### 551# Intel SHA Extensions implementation of SHA256 update function. 
552# 553my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); 554 555my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); 556my @MSG=map("%xmm$_",(3..6)); 557 558$code.=<<___; 559.type sha256_block_data_order_shaext,\@function,3 560.align 64 561sha256_block_data_order_shaext: 562.cfi_startproc 563.Lshaext_shortcut: 564___ 565$code.=<<___ if ($win64); 566 lea `-8-5*16`(%rsp),%rsp 567 movaps %xmm6,-8-5*16(%rax) 568 movaps %xmm7,-8-4*16(%rax) 569 movaps %xmm8,-8-3*16(%rax) 570 movaps %xmm9,-8-2*16(%rax) 571 movaps %xmm10,-8-1*16(%rax) 572.Lprologue_shaext: 573___ 574$code.=<<___; 575 lea K256+0x80(%rip),$Tbl 576 movdqu ($ctx),$ABEF # DCBA 577 movdqu 16($ctx),$CDGH # HGFE 578 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 579 580 pshufd \$0x1b,$ABEF,$Wi # ABCD 581 pshufd \$0xb1,$ABEF,$ABEF # CDAB 582 pshufd \$0x1b,$CDGH,$CDGH # EFGH 583 movdqa $TMP,$BSWAP # offload 584 palignr \$8,$CDGH,$ABEF # ABEF 585 punpcklqdq $Wi,$CDGH # CDGH 586 jmp .Loop_shaext 587 588.align 16 589.Loop_shaext: 590 movdqu ($inp),@MSG[0] 591 movdqu 0x10($inp),@MSG[1] 592 movdqu 0x20($inp),@MSG[2] 593 pshufb $TMP,@MSG[0] 594 movdqu 0x30($inp),@MSG[3] 595 596 movdqa 0*32-0x80($Tbl),$Wi 597 paddd @MSG[0],$Wi 598 pshufb $TMP,@MSG[1] 599 movdqa $CDGH,$CDGH_SAVE # offload 600 sha256rnds2 $ABEF,$CDGH # 0-3 601 pshufd \$0x0e,$Wi,$Wi 602 nop 603 movdqa $ABEF,$ABEF_SAVE # offload 604 sha256rnds2 $CDGH,$ABEF 605 606 movdqa 1*32-0x80($Tbl),$Wi 607 paddd @MSG[1],$Wi 608 pshufb $TMP,@MSG[2] 609 sha256rnds2 $ABEF,$CDGH # 4-7 610 pshufd \$0x0e,$Wi,$Wi 611 lea 0x40($inp),$inp 612 sha256msg1 @MSG[1],@MSG[0] 613 sha256rnds2 $CDGH,$ABEF 614 615 movdqa 2*32-0x80($Tbl),$Wi 616 paddd @MSG[2],$Wi 617 pshufb $TMP,@MSG[3] 618 sha256rnds2 $ABEF,$CDGH # 8-11 619 pshufd \$0x0e,$Wi,$Wi 620 movdqa @MSG[3],$TMP 621 palignr \$4,@MSG[2],$TMP 622 nop 623 paddd $TMP,@MSG[0] 624 sha256msg1 @MSG[2],@MSG[1] 625 sha256rnds2 $CDGH,$ABEF 626 627 movdqa 3*32-0x80($Tbl),$Wi 628 paddd @MSG[3],$Wi 629 sha256msg2 @MSG[3],@MSG[0] 630 sha256rnds2 $ABEF,$CDGH # 12-15 631 pshufd \$0x0e,$Wi,$Wi 632 movdqa @MSG[0],$TMP 633 palignr \$4,@MSG[3],$TMP 634 nop 635 paddd $TMP,@MSG[1] 636 sha256msg1 @MSG[3],@MSG[2] 637 sha256rnds2 $CDGH,$ABEF 638___ 639for($i=4;$i<16-3;$i++) { 640$code.=<<___; 641 movdqa $i*32-0x80($Tbl),$Wi 642 paddd @MSG[0],$Wi 643 sha256msg2 @MSG[0],@MSG[1] 644 sha256rnds2 $ABEF,$CDGH # 16-19... 
645 pshufd \$0x0e,$Wi,$Wi 646 movdqa @MSG[1],$TMP 647 palignr \$4,@MSG[0],$TMP 648 nop 649 paddd $TMP,@MSG[2] 650 sha256msg1 @MSG[0],@MSG[3] 651 sha256rnds2 $CDGH,$ABEF 652___ 653 push(@MSG,shift(@MSG)); 654} 655$code.=<<___; 656 movdqa 13*32-0x80($Tbl),$Wi 657 paddd @MSG[0],$Wi 658 sha256msg2 @MSG[0],@MSG[1] 659 sha256rnds2 $ABEF,$CDGH # 52-55 660 pshufd \$0x0e,$Wi,$Wi 661 movdqa @MSG[1],$TMP 662 palignr \$4,@MSG[0],$TMP 663 sha256rnds2 $CDGH,$ABEF 664 paddd $TMP,@MSG[2] 665 666 movdqa 14*32-0x80($Tbl),$Wi 667 paddd @MSG[1],$Wi 668 sha256rnds2 $ABEF,$CDGH # 56-59 669 pshufd \$0x0e,$Wi,$Wi 670 sha256msg2 @MSG[1],@MSG[2] 671 movdqa $BSWAP,$TMP 672 sha256rnds2 $CDGH,$ABEF 673 674 movdqa 15*32-0x80($Tbl),$Wi 675 paddd @MSG[2],$Wi 676 nop 677 sha256rnds2 $ABEF,$CDGH # 60-63 678 pshufd \$0x0e,$Wi,$Wi 679 dec $num 680 nop 681 sha256rnds2 $CDGH,$ABEF 682 683 paddd $CDGH_SAVE,$CDGH 684 paddd $ABEF_SAVE,$ABEF 685 jnz .Loop_shaext 686 687 pshufd \$0xb1,$CDGH,$CDGH # DCHG 688 pshufd \$0x1b,$ABEF,$TMP # FEBA 689 pshufd \$0xb1,$ABEF,$ABEF # BAFE 690 punpckhqdq $CDGH,$ABEF # DCBA 691 palignr \$8,$TMP,$CDGH # HGFE 692 693 movdqu $ABEF,($ctx) 694 movdqu $CDGH,16($ctx) 695___ 696$code.=<<___ if ($win64); 697 movaps -8-5*16(%rax),%xmm6 698 movaps -8-4*16(%rax),%xmm7 699 movaps -8-3*16(%rax),%xmm8 700 movaps -8-2*16(%rax),%xmm9 701 movaps -8-1*16(%rax),%xmm10 702 mov %rax,%rsp 703.Lepilogue_shaext: 704___ 705$code.=<<___; 706 ret 707.cfi_endproc 708.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext 709___ 710}}} 711{{{ 712 713my $a4=$T1; 714my ($a,$b,$c,$d,$e,$f,$g,$h); 715 716sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 717{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 718 my $arg = pop; 719 $arg = "\$$arg" if ($arg*1 eq $arg); 720 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 721} 722 723sub body_00_15 () { 724 ( 725 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 726 727 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 728 '&mov ($a,$a1)', 729 '&mov ($a4,$f)', 730 731 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 732 '&xor ($a0,$e)', 733 '&xor ($a4,$g)', # f^g 734 735 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 736 '&xor ($a1,$a)', 737 '&and ($a4,$e)', # (f^g)&e 738 739 '&xor ($a0,$e)', 740 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 741 '&mov ($a2,$a)', 742 743 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 744 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 745 '&xor ($a2,$b)', # a^b, b^c in next round 746 747 '&add ($h,$a4)', # h+=Ch(e,f,g) 748 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 749 '&and ($a3,$a2)', # (b^c)&(a^b) 750 751 '&xor ($a1,$a)', 752 '&add ($h,$a0)', # h+=Sigma1(e) 753 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 754 755 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 756 '&add ($d,$h)', # d+=h 757 '&add ($h,$a3)', # h+=Maj(a,b,c) 758 759 '&mov ($a0,$d)', 760 '&add ($a1,$h);'. 
# h+=Sigma0(a) 761 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 762 ); 763} 764 765###################################################################### 766# SSSE3 code path 767# 768if ($SZ==4) { # SHA256 only 769my @X = map("%xmm$_",(0..3)); 770my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 771 772$code.=<<___; 773.type ${func}_ssse3,\@function,3 774.align 64 775${func}_ssse3: 776.cfi_startproc 777.Lssse3_shortcut: 778 mov %rsp,%rax # copy %rsp 779.cfi_def_cfa_register %rax 780 push %rbx 781.cfi_push %rbx 782 push %rbp 783.cfi_push %rbp 784 push %r12 785.cfi_push %r12 786 push %r13 787.cfi_push %r13 788 push %r14 789.cfi_push %r14 790 push %r15 791.cfi_push %r15 792 shl \$4,%rdx # num*16 793 sub \$`$framesz+$win64*16*4`,%rsp 794 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 795 and \$-64,%rsp # align stack frame 796 mov $ctx,$_ctx # save ctx, 1st arg 797 mov $inp,$_inp # save inp, 2nd arh 798 mov %rdx,$_end # save end pointer, "3rd" arg 799 mov %rax,$_rsp # save copy of %rsp 800.cfi_cfa_expression $_rsp,deref,+8 801___ 802$code.=<<___ if ($win64); 803 movaps %xmm6,16*$SZ+32(%rsp) 804 movaps %xmm7,16*$SZ+48(%rsp) 805 movaps %xmm8,16*$SZ+64(%rsp) 806 movaps %xmm9,16*$SZ+80(%rsp) 807___ 808$code.=<<___; 809.Lprologue_ssse3: 810 811 mov $SZ*0($ctx),$A 812 mov $SZ*1($ctx),$B 813 mov $SZ*2($ctx),$C 814 mov $SZ*3($ctx),$D 815 mov $SZ*4($ctx),$E 816 mov $SZ*5($ctx),$F 817 mov $SZ*6($ctx),$G 818 mov $SZ*7($ctx),$H 819___ 820 821$code.=<<___; 822 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 823 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 824 jmp .Lloop_ssse3 825.align 16 826.Lloop_ssse3: 827 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 828 movdqu 0x00($inp),@X[0] 829 movdqu 0x10($inp),@X[1] 830 movdqu 0x20($inp),@X[2] 831 pshufb $t3,@X[0] 832 movdqu 0x30($inp),@X[3] 833 lea $TABLE(%rip),$Tbl 834 pshufb $t3,@X[1] 835 movdqa 0x00($Tbl),$t0 836 movdqa 0x20($Tbl),$t1 837 pshufb $t3,@X[2] 838 paddd @X[0],$t0 839 movdqa 0x40($Tbl),$t2 840 pshufb $t3,@X[3] 841 movdqa 0x60($Tbl),$t3 842 paddd @X[1],$t1 843 paddd @X[2],$t2 844 paddd @X[3],$t3 845 movdqa $t0,0x00(%rsp) 846 mov $A,$a1 847 movdqa $t1,0x10(%rsp) 848 mov $B,$a3 849 movdqa $t2,0x20(%rsp) 850 xor $C,$a3 # magic 851 movdqa $t3,0x30(%rsp) 852 mov $E,$a0 853 jmp .Lssse3_00_47 854 855.align 16 856.Lssse3_00_47: 857 sub \$`-16*2*$SZ`,$Tbl # size optimization 858___ 859sub Xupdate_256_SSSE3 () { 860 ( 861 '&movdqa ($t0,@X[1]);', 862 '&movdqa ($t3,@X[3])', 863 '&palignr ($t0,@X[0],$SZ)', # X[1..4] 864 '&palignr ($t3,@X[2],$SZ);', # X[9..12] 865 '&movdqa ($t1,$t0)', 866 '&movdqa ($t2,$t0);', 867 '&psrld ($t0,$sigma0[2])', 868 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] 869 '&psrld ($t2,$sigma0[0])', 870 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] 871 '&pslld ($t1,8*$SZ-$sigma0[1]);'. 872 '&pxor ($t0,$t2)', 873 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. 874 '&pxor ($t0,$t1)', 875 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
876 '&pxor ($t0,$t2);', 877 '&movdqa ($t2,$t3)', 878 '&pxor ($t0,$t1);', # sigma0(X[1..4]) 879 '&psrld ($t3,$sigma1[2])', 880 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 881 '&psrlq ($t2,$sigma1[0])', 882 '&pxor ($t3,$t2);', 883 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 884 '&pxor ($t3,$t2)', 885 '&pshufb ($t3,$t4)', # sigma1(X[14..15]) 886 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 887 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] 888 '&movdqa ($t2,$t3);', 889 '&psrld ($t3,$sigma1[2])', 890 '&psrlq ($t2,$sigma1[0])', 891 '&pxor ($t3,$t2);', 892 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 893 '&pxor ($t3,$t2);', 894 '&movdqa ($t2,16*2*$j."($Tbl)")', 895 '&pshufb ($t3,$t5)', 896 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) 897 ); 898} 899 900sub SSSE3_256_00_47 () { 901my $j = shift; 902my $body = shift; 903my @X = @_; 904my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 905 906 if (0) { 907 foreach (Xupdate_256_SSSE3()) { # 36 instructions 908 eval; 909 eval(shift(@insns)); 910 eval(shift(@insns)); 911 eval(shift(@insns)); 912 } 913 } else { # squeeze extra 4% on Westmere and 19% on Atom 914 eval(shift(@insns)); #@ 915 &movdqa ($t0,@X[1]); 916 eval(shift(@insns)); 917 eval(shift(@insns)); 918 &movdqa ($t3,@X[3]); 919 eval(shift(@insns)); #@ 920 eval(shift(@insns)); 921 eval(shift(@insns)); 922 eval(shift(@insns)); #@ 923 eval(shift(@insns)); 924 &palignr ($t0,@X[0],$SZ); # X[1..4] 925 eval(shift(@insns)); 926 eval(shift(@insns)); 927 &palignr ($t3,@X[2],$SZ); # X[9..12] 928 eval(shift(@insns)); 929 eval(shift(@insns)); 930 eval(shift(@insns)); 931 eval(shift(@insns)); #@ 932 &movdqa ($t1,$t0); 933 eval(shift(@insns)); 934 eval(shift(@insns)); 935 &movdqa ($t2,$t0); 936 eval(shift(@insns)); #@ 937 eval(shift(@insns)); 938 &psrld ($t0,$sigma0[2]); 939 eval(shift(@insns)); 940 eval(shift(@insns)); 941 eval(shift(@insns)); 942 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 943 eval(shift(@insns)); #@ 944 eval(shift(@insns)); 945 &psrld ($t2,$sigma0[0]); 946 eval(shift(@insns)); 947 eval(shift(@insns)); 948 &pshufd ($t3,@X[3],0b11111010); # X[4..15] 949 eval(shift(@insns)); 950 eval(shift(@insns)); #@ 951 &pslld ($t1,8*$SZ-$sigma0[1]); 952 eval(shift(@insns)); 953 eval(shift(@insns)); 954 &pxor ($t0,$t2); 955 eval(shift(@insns)); #@ 956 eval(shift(@insns)); 957 eval(shift(@insns)); 958 eval(shift(@insns)); #@ 959 &psrld ($t2,$sigma0[1]-$sigma0[0]); 960 eval(shift(@insns)); 961 &pxor ($t0,$t1); 962 eval(shift(@insns)); 963 eval(shift(@insns)); 964 &pslld ($t1,$sigma0[1]-$sigma0[0]); 965 eval(shift(@insns)); 966 eval(shift(@insns)); 967 &pxor ($t0,$t2); 968 eval(shift(@insns)); 969 eval(shift(@insns)); #@ 970 &movdqa ($t2,$t3); 971 eval(shift(@insns)); 972 eval(shift(@insns)); 973 &pxor ($t0,$t1); # sigma0(X[1..4]) 974 eval(shift(@insns)); #@ 975 eval(shift(@insns)); 976 eval(shift(@insns)); 977 &psrld ($t3,$sigma1[2]); 978 eval(shift(@insns)); 979 eval(shift(@insns)); 980 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 981 eval(shift(@insns)); #@ 982 eval(shift(@insns)); 983 &psrlq ($t2,$sigma1[0]); 984 eval(shift(@insns)); 985 eval(shift(@insns)); 986 eval(shift(@insns)); 987 &pxor ($t3,$t2); 988 eval(shift(@insns)); #@ 989 eval(shift(@insns)); 990 eval(shift(@insns)); 991 eval(shift(@insns)); #@ 992 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 993 eval(shift(@insns)); 994 eval(shift(@insns)); 995 &pxor ($t3,$t2); 996 eval(shift(@insns)); #@ 997 eval(shift(@insns)); 998 eval(shift(@insns)); 999 #&pshufb ($t3,$t4); # sigma1(X[14..15]) 1000 &pshufd ($t3,$t3,0b10000000); 1001 
eval(shift(@insns)); 1002 eval(shift(@insns)); 1003 eval(shift(@insns)); 1004 &psrldq ($t3,8); 1005 eval(shift(@insns)); 1006 eval(shift(@insns)); #@ 1007 eval(shift(@insns)); 1008 eval(shift(@insns)); 1009 eval(shift(@insns)); #@ 1010 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1011 eval(shift(@insns)); 1012 eval(shift(@insns)); 1013 eval(shift(@insns)); 1014 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 1015 eval(shift(@insns)); 1016 eval(shift(@insns)); #@ 1017 eval(shift(@insns)); 1018 &movdqa ($t2,$t3); 1019 eval(shift(@insns)); 1020 eval(shift(@insns)); 1021 &psrld ($t3,$sigma1[2]); 1022 eval(shift(@insns)); 1023 eval(shift(@insns)); #@ 1024 &psrlq ($t2,$sigma1[0]); 1025 eval(shift(@insns)); 1026 eval(shift(@insns)); 1027 &pxor ($t3,$t2); 1028 eval(shift(@insns)); #@ 1029 eval(shift(@insns)); 1030 eval(shift(@insns)); 1031 eval(shift(@insns)); #@ 1032 eval(shift(@insns)); 1033 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 1034 eval(shift(@insns)); 1035 eval(shift(@insns)); 1036 eval(shift(@insns)); 1037 &pxor ($t3,$t2); 1038 eval(shift(@insns)); 1039 eval(shift(@insns)); 1040 eval(shift(@insns)); #@ 1041 #&pshufb ($t3,$t5); 1042 &pshufd ($t3,$t3,0b00001000); 1043 eval(shift(@insns)); 1044 eval(shift(@insns)); 1045 &movdqa ($t2,16*2*$j."($Tbl)"); 1046 eval(shift(@insns)); #@ 1047 eval(shift(@insns)); 1048 &pslldq ($t3,8); 1049 eval(shift(@insns)); 1050 eval(shift(@insns)); 1051 eval(shift(@insns)); 1052 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1053 eval(shift(@insns)); #@ 1054 eval(shift(@insns)); 1055 eval(shift(@insns)); 1056 } 1057 &paddd ($t2,@X[0]); 1058 foreach (@insns) { eval; } # remaining instructions 1059 &movdqa (16*$j."(%rsp)",$t2); 1060} 1061 1062 for ($i=0,$j=0; $j<4; $j++) { 1063 &SSSE3_256_00_47($j,\&body_00_15,@X); 1064 push(@X,shift(@X)); # rotate(@X) 1065 } 1066 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1067 &jne (".Lssse3_00_47"); 1068 1069 for ($i=0; $i<16; ) { 1070 foreach(body_00_15()) { eval; } 1071 } 1072$code.=<<___; 1073 mov $_ctx,$ctx 1074 mov $a1,$A 1075 1076 add $SZ*0($ctx),$A 1077 lea 16*$SZ($inp),$inp 1078 add $SZ*1($ctx),$B 1079 add $SZ*2($ctx),$C 1080 add $SZ*3($ctx),$D 1081 add $SZ*4($ctx),$E 1082 add $SZ*5($ctx),$F 1083 add $SZ*6($ctx),$G 1084 add $SZ*7($ctx),$H 1085 1086 cmp $_end,$inp 1087 1088 mov $A,$SZ*0($ctx) 1089 mov $B,$SZ*1($ctx) 1090 mov $C,$SZ*2($ctx) 1091 mov $D,$SZ*3($ctx) 1092 mov $E,$SZ*4($ctx) 1093 mov $F,$SZ*5($ctx) 1094 mov $G,$SZ*6($ctx) 1095 mov $H,$SZ*7($ctx) 1096 jb .Lloop_ssse3 1097 1098 mov $_rsp,%rsi 1099.cfi_def_cfa %rsi,8 1100___ 1101$code.=<<___ if ($win64); 1102 movaps 16*$SZ+32(%rsp),%xmm6 1103 movaps 16*$SZ+48(%rsp),%xmm7 1104 movaps 16*$SZ+64(%rsp),%xmm8 1105 movaps 16*$SZ+80(%rsp),%xmm9 1106___ 1107$code.=<<___; 1108 mov -48(%rsi),%r15 1109.cfi_restore %r15 1110 mov -40(%rsi),%r14 1111.cfi_restore %r14 1112 mov -32(%rsi),%r13 1113.cfi_restore %r13 1114 mov -24(%rsi),%r12 1115.cfi_restore %r12 1116 mov -16(%rsi),%rbp 1117.cfi_restore %rbp 1118 mov -8(%rsi),%rbx 1119.cfi_restore %rbx 1120 lea (%rsi),%rsp 1121.cfi_def_cfa_register %rsp 1122.Lepilogue_ssse3: 1123 ret 1124.cfi_endproc 1125.size ${func}_ssse3,.-${func}_ssse3 1126___ 1127} 1128 1129if ($avx) {{ 1130###################################################################### 1131# AVX+shrd code path 1132# 1133local *ror = sub { &shrd(@_[0],@_) }; 1134 1135$code.=<<___; 1136.type ${func}_avx,\@function,3 1137.align 64 1138${func}_avx: 1139.cfi_startproc 1140.Lavx_shortcut: 1141 mov %rsp,%rax # copy %rsp 1142.cfi_def_cfa_register %rax 1143 push %rbx 1144.cfi_push %rbx 
1145 push %rbp 1146.cfi_push %rbp 1147 push %r12 1148.cfi_push %r12 1149 push %r13 1150.cfi_push %r13 1151 push %r14 1152.cfi_push %r14 1153 push %r15 1154.cfi_push %r15 1155 shl \$4,%rdx # num*16 1156 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1157 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1158 and \$-64,%rsp # align stack frame 1159 mov $ctx,$_ctx # save ctx, 1st arg 1160 mov $inp,$_inp # save inp, 2nd arh 1161 mov %rdx,$_end # save end pointer, "3rd" arg 1162 mov %rax,$_rsp # save copy of %rsp 1163.cfi_cfa_expression $_rsp,deref,+8 1164___ 1165$code.=<<___ if ($win64); 1166 movaps %xmm6,16*$SZ+32(%rsp) 1167 movaps %xmm7,16*$SZ+48(%rsp) 1168 movaps %xmm8,16*$SZ+64(%rsp) 1169 movaps %xmm9,16*$SZ+80(%rsp) 1170___ 1171$code.=<<___ if ($win64 && $SZ>4); 1172 movaps %xmm10,16*$SZ+96(%rsp) 1173 movaps %xmm11,16*$SZ+112(%rsp) 1174___ 1175$code.=<<___; 1176.Lprologue_avx: 1177 1178 vzeroupper 1179 mov $SZ*0($ctx),$A 1180 mov $SZ*1($ctx),$B 1181 mov $SZ*2($ctx),$C 1182 mov $SZ*3($ctx),$D 1183 mov $SZ*4($ctx),$E 1184 mov $SZ*5($ctx),$F 1185 mov $SZ*6($ctx),$G 1186 mov $SZ*7($ctx),$H 1187___ 1188 if ($SZ==4) { # SHA256 1189 my @X = map("%xmm$_",(0..3)); 1190 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 1191 1192$code.=<<___; 1193 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1194 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1195 jmp .Lloop_avx 1196.align 16 1197.Lloop_avx: 1198 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1199 vmovdqu 0x00($inp),@X[0] 1200 vmovdqu 0x10($inp),@X[1] 1201 vmovdqu 0x20($inp),@X[2] 1202 vmovdqu 0x30($inp),@X[3] 1203 vpshufb $t3,@X[0],@X[0] 1204 lea $TABLE(%rip),$Tbl 1205 vpshufb $t3,@X[1],@X[1] 1206 vpshufb $t3,@X[2],@X[2] 1207 vpaddd 0x00($Tbl),@X[0],$t0 1208 vpshufb $t3,@X[3],@X[3] 1209 vpaddd 0x20($Tbl),@X[1],$t1 1210 vpaddd 0x40($Tbl),@X[2],$t2 1211 vpaddd 0x60($Tbl),@X[3],$t3 1212 vmovdqa $t0,0x00(%rsp) 1213 mov $A,$a1 1214 vmovdqa $t1,0x10(%rsp) 1215 mov $B,$a3 1216 vmovdqa $t2,0x20(%rsp) 1217 xor $C,$a3 # magic 1218 vmovdqa $t3,0x30(%rsp) 1219 mov $E,$a0 1220 jmp .Lavx_00_47 1221 1222.align 16 1223.Lavx_00_47: 1224 sub \$`-16*2*$SZ`,$Tbl # size optimization 1225___ 1226sub Xupdate_256_AVX () { 1227 ( 1228 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 1229 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 1230 '&vpsrld ($t2,$t0,$sigma0[0]);', 1231 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 1232 '&vpsrld ($t3,$t0,$sigma0[2])', 1233 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 1234 '&vpxor ($t0,$t3,$t2)', 1235 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1236 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1237 '&vpxor ($t0,$t0,$t1)', 1238 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1239 '&vpxor ($t0,$t0,$t2)', 1240 '&vpsrld ($t2,$t3,$sigma1[2]);', 1241 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 1242 '&vpsrlq ($t3,$t3,$sigma1[0]);', 1243 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 1244 '&vpxor ($t2,$t2,$t3);', 1245 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1246 '&vpxor ($t2,$t2,$t3)', 1247 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) 1248 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 1249 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1250 '&vpsrld ($t2,$t3,$sigma1[2])', 1251 '&vpsrlq ($t3,$t3,$sigma1[0])', 1252 '&vpxor ($t2,$t2,$t3);', 1253 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1254 '&vpxor ($t2,$t2,$t3)', 1255 '&vpshufb ($t2,$t2,$t5)', 1256 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 1257 ); 1258} 1259 1260sub AVX_256_00_47 () { 1261my $j = shift; 1262my $body = shift; 1263my @X = @_; 1264my @insns = 
(&$body,&$body,&$body,&$body); # 104 instructions 1265 1266 foreach (Xupdate_256_AVX()) { # 29 instructions 1267 eval; 1268 eval(shift(@insns)); 1269 eval(shift(@insns)); 1270 eval(shift(@insns)); 1271 } 1272 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1273 foreach (@insns) { eval; } # remaining instructions 1274 &vmovdqa (16*$j."(%rsp)",$t2); 1275} 1276 1277 for ($i=0,$j=0; $j<4; $j++) { 1278 &AVX_256_00_47($j,\&body_00_15,@X); 1279 push(@X,shift(@X)); # rotate(@X) 1280 } 1281 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1282 &jne (".Lavx_00_47"); 1283 1284 for ($i=0; $i<16; ) { 1285 foreach(body_00_15()) { eval; } 1286 } 1287 1288 } else { # SHA512 1289 my @X = map("%xmm$_",(0..7)); 1290 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1291 1292$code.=<<___; 1293 jmp .Lloop_avx 1294.align 16 1295.Lloop_avx: 1296 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1297 vmovdqu 0x00($inp),@X[0] 1298 lea $TABLE+0x80(%rip),$Tbl # size optimization 1299 vmovdqu 0x10($inp),@X[1] 1300 vmovdqu 0x20($inp),@X[2] 1301 vpshufb $t3,@X[0],@X[0] 1302 vmovdqu 0x30($inp),@X[3] 1303 vpshufb $t3,@X[1],@X[1] 1304 vmovdqu 0x40($inp),@X[4] 1305 vpshufb $t3,@X[2],@X[2] 1306 vmovdqu 0x50($inp),@X[5] 1307 vpshufb $t3,@X[3],@X[3] 1308 vmovdqu 0x60($inp),@X[6] 1309 vpshufb $t3,@X[4],@X[4] 1310 vmovdqu 0x70($inp),@X[7] 1311 vpshufb $t3,@X[5],@X[5] 1312 vpaddq -0x80($Tbl),@X[0],$t0 1313 vpshufb $t3,@X[6],@X[6] 1314 vpaddq -0x60($Tbl),@X[1],$t1 1315 vpshufb $t3,@X[7],@X[7] 1316 vpaddq -0x40($Tbl),@X[2],$t2 1317 vpaddq -0x20($Tbl),@X[3],$t3 1318 vmovdqa $t0,0x00(%rsp) 1319 vpaddq 0x00($Tbl),@X[4],$t0 1320 vmovdqa $t1,0x10(%rsp) 1321 vpaddq 0x20($Tbl),@X[5],$t1 1322 vmovdqa $t2,0x20(%rsp) 1323 vpaddq 0x40($Tbl),@X[6],$t2 1324 vmovdqa $t3,0x30(%rsp) 1325 vpaddq 0x60($Tbl),@X[7],$t3 1326 vmovdqa $t0,0x40(%rsp) 1327 mov $A,$a1 1328 vmovdqa $t1,0x50(%rsp) 1329 mov $B,$a3 1330 vmovdqa $t2,0x60(%rsp) 1331 xor $C,$a3 # magic 1332 vmovdqa $t3,0x70(%rsp) 1333 mov $E,$a0 1334 jmp .Lavx_00_47 1335 1336.align 16 1337.Lavx_00_47: 1338 add \$`16*2*$SZ`,$Tbl 1339___ 1340sub Xupdate_512_AVX () { 1341 ( 1342 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] 1343 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] 1344 '&vpsrlq ($t2,$t0,$sigma0[0])', 1345 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] 1346 '&vpsrlq ($t3,$t0,$sigma0[2])', 1347 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', 1348 '&vpxor ($t0,$t3,$t2)', 1349 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1350 '&vpxor ($t0,$t0,$t1)', 1351 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1352 '&vpxor ($t0,$t0,$t2)', 1353 '&vpsrlq ($t3,@X[7],$sigma1[2]);', 1354 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) 1355 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', 1356 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) 1357 '&vpsrlq ($t1,@X[7],$sigma1[0]);', 1358 '&vpxor ($t3,$t3,$t2)', 1359 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', 1360 '&vpxor ($t3,$t3,$t1)', 1361 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', 1362 '&vpxor ($t3,$t3,$t2)', 1363 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) 1364 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 1365 ); 1366} 1367 1368sub AVX_512_00_47 () { 1369my $j = shift; 1370my $body = shift; 1371my @X = @_; 1372my @insns = (&$body,&$body); # 52 instructions 1373 1374 foreach (Xupdate_512_AVX()) { # 23 instructions 1375 eval; 1376 eval(shift(@insns)); 1377 eval(shift(@insns)); 1378 } 1379 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1380 foreach (@insns) { eval; } # remaining instructions 1381 &vmovdqa (16*$j."(%rsp)",$t2); 1382} 1383 1384 for ($i=0,$j=0; $j<8; $j++) { 1385 
&AVX_512_00_47($j,\&body_00_15,@X); 1386 push(@X,shift(@X)); # rotate(@X) 1387 } 1388 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1389 &jne (".Lavx_00_47"); 1390 1391 for ($i=0; $i<16; ) { 1392 foreach(body_00_15()) { eval; } 1393 } 1394} 1395$code.=<<___; 1396 mov $_ctx,$ctx 1397 mov $a1,$A 1398 1399 add $SZ*0($ctx),$A 1400 lea 16*$SZ($inp),$inp 1401 add $SZ*1($ctx),$B 1402 add $SZ*2($ctx),$C 1403 add $SZ*3($ctx),$D 1404 add $SZ*4($ctx),$E 1405 add $SZ*5($ctx),$F 1406 add $SZ*6($ctx),$G 1407 add $SZ*7($ctx),$H 1408 1409 cmp $_end,$inp 1410 1411 mov $A,$SZ*0($ctx) 1412 mov $B,$SZ*1($ctx) 1413 mov $C,$SZ*2($ctx) 1414 mov $D,$SZ*3($ctx) 1415 mov $E,$SZ*4($ctx) 1416 mov $F,$SZ*5($ctx) 1417 mov $G,$SZ*6($ctx) 1418 mov $H,$SZ*7($ctx) 1419 jb .Lloop_avx 1420 1421 mov $_rsp,%rsi 1422.cfi_def_cfa %rsi,8 1423 vzeroupper 1424___ 1425$code.=<<___ if ($win64); 1426 movaps 16*$SZ+32(%rsp),%xmm6 1427 movaps 16*$SZ+48(%rsp),%xmm7 1428 movaps 16*$SZ+64(%rsp),%xmm8 1429 movaps 16*$SZ+80(%rsp),%xmm9 1430___ 1431$code.=<<___ if ($win64 && $SZ>4); 1432 movaps 16*$SZ+96(%rsp),%xmm10 1433 movaps 16*$SZ+112(%rsp),%xmm11 1434___ 1435$code.=<<___; 1436 mov -48(%rsi),%r15 1437.cfi_restore %r15 1438 mov -40(%rsi),%r14 1439.cfi_restore %r14 1440 mov -32(%rsi),%r13 1441.cfi_restore %r13 1442 mov -24(%rsi),%r12 1443.cfi_restore %r12 1444 mov -16(%rsi),%rbp 1445.cfi_restore %rbp 1446 mov -8(%rsi),%rbx 1447.cfi_restore %rbx 1448 lea (%rsi),%rsp 1449.cfi_def_cfa_register %rsp 1450.Lepilogue_avx: 1451 ret 1452.cfi_endproc 1453.size ${func}_avx,.-${func}_avx 1454___ 1455 1456}}}}} 1457 1458# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1459# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1460if ($win64) { 1461$rec="%rcx"; 1462$frame="%rdx"; 1463$context="%r8"; 1464$disp="%r9"; 1465 1466$code.=<<___; 1467.extern __imp_RtlVirtualUnwind 1468.type se_handler,\@abi-omnipotent 1469.align 16 1470se_handler: 1471 push %rsi 1472 push %rdi 1473 push %rbx 1474 push %rbp 1475 push %r12 1476 push %r13 1477 push %r14 1478 push %r15 1479 pushfq 1480 sub \$64,%rsp 1481 1482 mov 120($context),%rax # pull context->Rax 1483 mov 248($context),%rbx # pull context->Rip 1484 1485 mov 8($disp),%rsi # disp->ImageBase 1486 mov 56($disp),%r11 # disp->HanderlData 1487 1488 mov 0(%r11),%r10d # HandlerData[0] 1489 lea (%rsi,%r10),%r10 # prologue label 1490 cmp %r10,%rbx # context->Rip<prologue label 1491 jb .Lin_prologue 1492 1493 mov 152($context),%rax # pull context->Rsp 1494 1495 mov 4(%r11),%r10d # HandlerData[1] 1496 lea (%rsi,%r10),%r10 # epilogue label 1497 cmp %r10,%rbx # context->Rip>=epilogue label 1498 jae .Lin_prologue 1499___ 1500$code.=<<___; 1501 mov %rax,%rsi # put aside Rsp 1502 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp 1503 1504 mov -8(%rax),%rbx 1505 mov -16(%rax),%rbp 1506 mov -24(%rax),%r12 1507 mov -32(%rax),%r13 1508 mov -40(%rax),%r14 1509 mov -48(%rax),%r15 1510 mov %rbx,144($context) # restore context->Rbx 1511 mov %rbp,160($context) # restore context->Rbp 1512 mov %r12,216($context) # restore context->R12 1513 mov %r13,224($context) # restore context->R13 1514 mov %r14,232($context) # restore context->R14 1515 mov %r15,240($context) # restore context->R15 1516 1517 lea .Lepilogue(%rip),%r10 1518 cmp %r10,%rbx 1519 jb .Lin_prologue # non-AVX code 1520 1521 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area 1522 lea 512($context),%rdi # &context.Xmm6 1523 mov \$`$SZ==4?8:12`,%ecx 1524 .long 0xa548f3fc # cld; rep movsq 1525 1526.Lin_prologue: 1527 mov 8(%rax),%rdi 1528 mov 16(%rax),%rsi 1529 mov %rax,152($context) # restore 
context->Rsp 1530 mov %rsi,168($context) # restore context->Rsi 1531 mov %rdi,176($context) # restore context->Rdi 1532 1533 mov 40($disp),%rdi # disp->ContextRecord 1534 mov $context,%rsi # context 1535 mov \$154,%ecx # sizeof(CONTEXT) 1536 .long 0xa548f3fc # cld; rep movsq 1537 1538 mov $disp,%rsi 1539 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1540 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1541 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1542 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1543 mov 40(%rsi),%r10 # disp->ContextRecord 1544 lea 56(%rsi),%r11 # &disp->HandlerData 1545 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1546 mov %r10,32(%rsp) # arg5 1547 mov %r11,40(%rsp) # arg6 1548 mov %r12,48(%rsp) # arg7 1549 mov %rcx,56(%rsp) # arg8, (NULL) 1550 call *__imp_RtlVirtualUnwind(%rip) 1551 1552 mov \$1,%eax # ExceptionContinueSearch 1553 add \$64,%rsp 1554 popfq 1555 pop %r15 1556 pop %r14 1557 pop %r13 1558 pop %r12 1559 pop %rbp 1560 pop %rbx 1561 pop %rdi 1562 pop %rsi 1563 ret 1564.size se_handler,.-se_handler 1565___ 1566 1567$code.=<<___ if ($SZ==4 && $shaext); 1568.type shaext_handler,\@abi-omnipotent 1569.align 16 1570shaext_handler: 1571 push %rsi 1572 push %rdi 1573 push %rbx 1574 push %rbp 1575 push %r12 1576 push %r13 1577 push %r14 1578 push %r15 1579 pushfq 1580 sub \$64,%rsp 1581 1582 mov 120($context),%rax # pull context->Rax 1583 mov 248($context),%rbx # pull context->Rip 1584 1585 lea .Lprologue_shaext(%rip),%r10 1586 cmp %r10,%rbx # context->Rip<.Lprologue 1587 jb .Lin_prologue 1588 1589 lea .Lepilogue_shaext(%rip),%r10 1590 cmp %r10,%rbx # context->Rip>=.Lepilogue 1591 jae .Lin_prologue 1592 1593 lea -8-5*16(%rax),%rsi 1594 lea 512($context),%rdi # &context.Xmm6 1595 mov \$10,%ecx 1596 .long 0xa548f3fc # cld; rep movsq 1597 1598 jmp .Lin_prologue 1599.size shaext_handler,.-shaext_handler 1600___ 1601 1602$code.=<<___; 1603.section .pdata 1604.align 4 1605 .rva .LSEH_begin_$func 1606 .rva .LSEH_end_$func 1607 .rva .LSEH_info_$func 1608___ 1609$code.=<<___ if ($SZ==4 && $shaext); 1610 .rva .LSEH_begin_${func}_shaext 1611 .rva .LSEH_end_${func}_shaext 1612 .rva .LSEH_info_${func}_shaext 1613___ 1614$code.=<<___ if ($SZ==4); 1615 .rva .LSEH_begin_${func}_ssse3 1616 .rva .LSEH_end_${func}_ssse3 1617 .rva .LSEH_info_${func}_ssse3 1618___ 1619$code.=<<___ if ($avx); 1620 .rva .LSEH_begin_${func}_avx 1621 .rva .LSEH_end_${func}_avx 1622 .rva .LSEH_info_${func}_avx 1623___ 1624$code.=<<___; 1625.section .xdata 1626.align 8 1627.LSEH_info_$func: 1628 .byte 9,0,0,0 1629 .rva se_handler 1630 .rva .Lprologue,.Lepilogue # HandlerData[] 1631___ 1632$code.=<<___ if ($SZ==4 && $shaext); 1633.LSEH_info_${func}_shaext: 1634 .byte 9,0,0,0 1635 .rva shaext_handler 1636___ 1637$code.=<<___ if ($SZ==4); 1638.LSEH_info_${func}_ssse3: 1639 .byte 9,0,0,0 1640 .rva se_handler 1641 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] 1642___ 1643$code.=<<___ if ($avx); 1644.LSEH_info_${func}_avx: 1645 .byte 9,0,0,0 1646 .rva se_handler 1647 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] 1648___ 1649} 1650 1651sub sha256op38 { 1652 my $instr = shift; 1653 my %opcodelet = ( 1654 "sha256rnds2" => 0xcb, 1655 "sha256msg1" => 0xcc, 1656 "sha256msg2" => 0xcd ); 1657 1658 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { 1659 my @opcode=(0x0f,0x38); 1660 push @opcode,$opcodelet{$instr}; 1661 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M 1662 return ".byte\t".join(',',@opcode); 1663 } else { 1664 return $instr."\t".@_[0]; 1665 } 1666} 1667 1668foreach (split("\n",$code)) { 
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";
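
# Typical invocation, for illustration only (the exact flavour and output
# file names come from the build system):
#
#	perl sha512-x86_64.pl elf sha256-x86_64.S
#
# The first argument is the perlasm flavour forwarded to x86_64-xlate.pl; the
# output file name selects the variant, since anything matching
# /sha512-x86_64/ generates sha512_block_data_order and everything else
# generates sha256_block_data_order (see the @ARGV handling at the top).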