1#! /usr/bin/env perl 2# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. Rights for redistribution and usage in source and binary 13# forms are granted according to the OpenSSL license. 14# ==================================================================== 15# 16# sha256/512_block procedure for x86_64. 17# 18# 40% improvement over compiler-generated code on Opteron. On EM64T 19# sha256 was observed to run >80% faster and sha512 - >40%. No magical 20# tricks, just straight implementation... I really wonder why gcc 21# [being armed with inline assembler] fails to generate as fast code. 22# The only thing which is cool about this module is that it's very 23# same instruction sequence used for both SHA-256 and SHA-512. In 24# former case the instructions operate on 32-bit operands, while in 25# latter - on 64-bit ones. All I had to do is to get one flavor right, 26# the other one passed the test right away:-) 27# 28# sha256_block runs in ~1005 cycles on Opteron, which gives you 29# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock 30# frequency in GHz. sha512_block runs in ~1275 cycles, which results 31# in 128*1000/1275=100MBps per GHz. Is there room for improvement? 32# Well, if you compare it to IA-64 implementation, which maintains 33# X[16] in register bank[!], tends to 4 instructions per CPU clock 34# cycle and runs in 1003 cycles, 1275 is very good result for 3-way 35# issue Opteron pipeline and X[16] maintained in memory. So that *if* 36# there is a way to improve it, *then* the only way would be to try to 37# offload X[16] updates to SSE unit, but that would require "deeper" 38# loop unroll, which in turn would naturally cause size blow-up, not 39# to mention increased complexity! And once again, only *if* it's 40# actually possible to noticeably improve overall ILP, instruction 41# level parallelism, on a given CPU implementation in this case. 42# 43# Special note on Intel EM64T. While Opteron CPU exhibits perfect 44# performance ratio of 1.5 between 64- and 32-bit flavors [see above], 45# [currently available] EM64T CPUs apparently are far from it. On the 46# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit 47# sha256_block:-( This is presumably because 64-bit shifts/rotates 48# apparently are not atomic instructions, but implemented in microcode. 49# 50# May 2012. 51# 52# Optimization including one of Pavel Semjanov's ideas, alternative 53# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and 54# unfortunately -2% SHA512 on P4 [which nobody should care about 55# that much]. 56# 57# June 2012. 58# 59# Add SIMD code paths, see below for improvement coefficients. SSSE3 60# code path was not attempted for SHA512, because improvement is not 61# estimated to be high enough, noticeably less than 9%, to justify 62# the effort, not on pre-AVX processors. [Obviously with exclusion 63# for VIA Nano, but it has SHA512 instruction that is faster and 64# should be used instead.] For reference, corresponding estimated 65# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that 66# higher coefficients are observed on VIA Nano and Bulldozer has more 67# to do with specifics of their architecture [which is topic for 68# separate discussion]. 69# 70# November 2012. 71# 72# Add AVX2 code path. Two consecutive input blocks are loaded to 73# 256-bit %ymm registers, with data from first block to least 74# significant 128-bit halves and data from second to most significant. 75# The data is then processed with same SIMD instruction sequence as 76# for AVX, but with %ymm as operands. Side effect is increased stack 77# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB 78# code size increase. 79# 80# March 2014. 81# 82# Add support for Intel SHA Extensions. 83 84###################################################################### 85# Current performance in cycles per processed byte (less is better): 86# 87# SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*) 88# 89# AMD K8 14.9 - - 9.57 - 90# P4 17.3 - - 30.8 - 91# Core 2 15.6 13.8(+13%) - 9.97 - 92# Westmere 14.8 12.3(+19%) - 9.58 - 93# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) 94# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) 95# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) 96# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%) 97# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) 98# Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%) 99# VIA Nano 23.0 16.5(+39%) - 14.7 - 100# Atom 23.0 18.9(+22%) - 14.7 - 101# Silvermont 27.4 20.6(+33%) - 17.5 - 102# Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%) 103# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - 104# 105# (*) whichever best applicable, including SHAEXT; 106# (**) switch from ror to shrd stands for fair share of improvement; 107# (***) execution time is fully determined by remaining integer-only 108# part, body_00_15; reducing the amount of SIMD instructions 109# below certain limit makes no difference/sense; to conserve 110# space SHA256 XOP code path is therefore omitted; 111# 112# Modified from upstream OpenSSL to remove the XOP code. 113 114my ($flavour, $hash, $output) = @ARGV; 115 116if ($hash eq "sha512") { 117 $func="sha512_block_data_order"; 118 $TABLE="K512"; 119 $SZ=8; 120 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", 121 "%r8", "%r9", "%r10","%r11"); 122 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi"); 123 @Sigma0=(28,34,39); 124 @Sigma1=(14,18,41); 125 @sigma0=(1, 8, 7); 126 @sigma1=(19,61, 6); 127 $rounds=80; 128} elsif ($hash eq "sha256") { 129 $func="sha256_block_data_order"; 130 $TABLE="K256"; 131 $SZ=4; 132 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", 133 "%r8d","%r9d","%r10d","%r11d"); 134 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); 135 @Sigma0=( 2,13,22); 136 @Sigma1=( 6,11,25); 137 @sigma0=( 7,18, 3); 138 @sigma1=(17,19,10); 139 $rounds=64; 140} else { 141 die "unknown hash: $hash"; 142} 143 144$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 145 146$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 147( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 148( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or 149die "can't locate x86_64-xlate.pl"; 150 151# In upstream, this is controlled by shelling out to the compiler to check 152# versions, but BoringSSL is intended to be used with pre-generated perlasm 153# output, so this isn't useful anyway. 154# 155# This file also has an AVX2 implementation, controlled by setting $avx to 2. 156# For now, we intentionally disable it. While it gives a 13-16% perf boost, the 157# CFI annotations are wrong. It allocates stack in a loop and should be 158# rewritten to avoid this. 159$avx = 1; 160$shaext = 1; 161 162open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 163*STDOUT=*OUT; 164 165$ctx="%rdi"; # 1st arg, zapped by $a3 166$inp="%rsi"; # 2nd arg 167$Tbl="%rbp"; 168 169$_ctx="16*$SZ+0*8(%rsp)"; 170$_inp="16*$SZ+1*8(%rsp)"; 171$_end="16*$SZ+2*8(%rsp)"; 172$_rsp="`16*$SZ+3*8`(%rsp)"; 173$framesz="16*$SZ+4*8"; 174 175 176sub ROUND_00_15() 177{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 178 my $STRIDE=$SZ; 179 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); 180 181$code.=<<___; 182 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 183 mov $f,$a2 184 185 xor $e,$a0 186 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 187 xor $g,$a2 # f^g 188 189 mov $T1,`$SZ*($i&0xf)`(%rsp) 190 xor $a,$a1 191 and $e,$a2 # (f^g)&e 192 193 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 194 add $h,$T1 # T1+=h 195 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g 196 197 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 198 xor $e,$a0 199 add $a2,$T1 # T1+=Ch(e,f,g) 200 201 mov $a,$a2 202 add ($Tbl),$T1 # T1+=K[round] 203 xor $a,$a1 204 205 xor $b,$a2 # a^b, b^c in next round 206 ror \$$Sigma1[0],$a0 # Sigma1(e) 207 mov $b,$h 208 209 and $a2,$a3 210 ror \$$Sigma0[0],$a1 # Sigma0(a) 211 add $a0,$T1 # T1+=Sigma1(e) 212 213 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) 214 add $T1,$d # d+=T1 215 add $T1,$h # h+=T1 216 217 lea $STRIDE($Tbl),$Tbl # round++ 218___ 219$code.=<<___ if ($i<15); 220 add $a1,$h # h+=Sigma0(a) 221___ 222 ($a2,$a3) = ($a3,$a2); 223} 224 225sub ROUND_16_XX() 226{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 227 228$code.=<<___; 229 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 230 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 231 232 mov $a0,$T1 233 ror \$`$sigma0[1]-$sigma0[0]`,$a0 234 add $a1,$a # modulo-scheduled h+=Sigma0(a) 235 mov $a2,$a1 236 ror \$`$sigma1[1]-$sigma1[0]`,$a2 237 238 xor $T1,$a0 239 shr \$$sigma0[2],$T1 240 ror \$$sigma0[0],$a0 241 xor $a1,$a2 242 shr \$$sigma1[2],$a1 243 244 ror \$$sigma1[0],$a2 245 xor $a0,$T1 # sigma0(X[(i+1)&0xf]) 246 xor $a1,$a2 # sigma1(X[(i+14)&0xf]) 247 add `$SZ*(($i+9)&0xf)`(%rsp),$T1 248 249 add `$SZ*($i&0xf)`(%rsp),$T1 250 mov $e,$a0 251 add $a2,$T1 252 mov $a,$a1 253___ 254 &ROUND_00_15(@_); 255} 256 257$code=<<___; 258.text 259 260.globl ${func}_nohw 261.type ${func}_nohw,\@function,3 262.align 16 263${func}_nohw: 264.cfi_startproc 265 _CET_ENDBR 266 mov %rsp,%rax # copy %rsp 267.cfi_def_cfa_register %rax 268 push %rbx 269.cfi_push %rbx 270 push %rbp 271.cfi_push %rbp 272 push %r12 273.cfi_push %r12 274 push %r13 275.cfi_push %r13 276 push %r14 277.cfi_push %r14 278 push %r15 279.cfi_push %r15 280 shl \$4,%rdx # num*16 281 sub \$$framesz,%rsp 282 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 283 and \$-64,%rsp # align stack frame 284 mov $ctx,$_ctx # save ctx, 1st arg 285 mov $inp,$_inp # save inp, 2nd arh 286 mov %rdx,$_end # save end pointer, "3rd" arg 287 mov %rax,$_rsp # save copy of %rsp 288.cfi_cfa_expression $_rsp,deref,+8 289.Lprologue: 290 291 mov $SZ*0($ctx),$A 292 mov $SZ*1($ctx),$B 293 mov $SZ*2($ctx),$C 294 mov $SZ*3($ctx),$D 295 mov $SZ*4($ctx),$E 296 mov $SZ*5($ctx),$F 297 mov $SZ*6($ctx),$G 298 mov $SZ*7($ctx),$H 299 jmp .Lloop 300 301.align 16 302.Lloop: 303 mov $B,$a3 304 lea $TABLE(%rip),$Tbl 305 xor $C,$a3 # magic 306___ 307 for($i=0;$i<16;$i++) { 308 $code.=" mov $SZ*$i($inp),$T1\n"; 309 $code.=" mov @ROT[4],$a0\n"; 310 $code.=" mov @ROT[0],$a1\n"; 311 $code.=" bswap $T1\n"; 312 &ROUND_00_15($i,@ROT); 313 unshift(@ROT,pop(@ROT)); 314 } 315$code.=<<___; 316 jmp .Lrounds_16_xx 317.align 16 318.Lrounds_16_xx: 319___ 320 for(;$i<32;$i++) { 321 &ROUND_16_XX($i,@ROT); 322 unshift(@ROT,pop(@ROT)); 323 } 324 325$code.=<<___; 326 cmpb \$0,`$SZ-1`($Tbl) 327 jnz .Lrounds_16_xx 328 329 mov $_ctx,$ctx 330 add $a1,$A # modulo-scheduled h+=Sigma0(a) 331 lea 16*$SZ($inp),$inp 332 333 add $SZ*0($ctx),$A 334 add $SZ*1($ctx),$B 335 add $SZ*2($ctx),$C 336 add $SZ*3($ctx),$D 337 add $SZ*4($ctx),$E 338 add $SZ*5($ctx),$F 339 add $SZ*6($ctx),$G 340 add $SZ*7($ctx),$H 341 342 cmp $_end,$inp 343 344 mov $A,$SZ*0($ctx) 345 mov $B,$SZ*1($ctx) 346 mov $C,$SZ*2($ctx) 347 mov $D,$SZ*3($ctx) 348 mov $E,$SZ*4($ctx) 349 mov $F,$SZ*5($ctx) 350 mov $G,$SZ*6($ctx) 351 mov $H,$SZ*7($ctx) 352 jb .Lloop 353 354 mov $_rsp,%rsi 355.cfi_def_cfa %rsi,8 356 mov -48(%rsi),%r15 357.cfi_restore %r15 358 mov -40(%rsi),%r14 359.cfi_restore %r14 360 mov -32(%rsi),%r13 361.cfi_restore %r13 362 mov -24(%rsi),%r12 363.cfi_restore %r12 364 mov -16(%rsi),%rbp 365.cfi_restore %rbp 366 mov -8(%rsi),%rbx 367.cfi_restore %rbx 368 lea (%rsi),%rsp 369.cfi_def_cfa_register %rsp 370.Lepilogue: 371 ret 372.cfi_endproc 373.size ${func}_nohw,.-${func}_nohw 374___ 375 376if ($SZ==4) { 377$code.=<<___; 378.section .rodata 379.align 64 380.type $TABLE,\@object 381$TABLE: 382 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 383 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 384 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 385 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 386 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 387 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 388 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 389 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 390 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 391 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 392 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 393 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 394 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 395 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 396 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 397 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 398 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 399 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 400 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 401 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 402 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 403 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 404 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 405 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 406 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 407 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 408 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 409 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 410 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 411 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 412 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 413 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 414 415 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 416 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 417 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 418 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 419 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 420 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 421 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 422.text 423___ 424} else { 425$code.=<<___; 426.section .rodata 427.align 64 428.type $TABLE,\@object 429$TABLE: 430 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 431 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 432 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 433 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 434 .quad 0x3956c25bf348b538,0x59f111f1b605d019 435 .quad 0x3956c25bf348b538,0x59f111f1b605d019 436 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 437 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 438 .quad 0xd807aa98a3030242,0x12835b0145706fbe 439 .quad 0xd807aa98a3030242,0x12835b0145706fbe 440 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 441 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 442 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 443 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 444 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 445 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 446 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 447 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 448 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 449 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 450 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 451 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 452 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 453 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 454 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 455 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 456 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 457 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 458 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 459 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 460 .quad 0x06ca6351e003826f,0x142929670a0e6e70 461 .quad 0x06ca6351e003826f,0x142929670a0e6e70 462 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 463 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 464 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 465 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 466 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 467 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 468 .quad 0x81c2c92e47edaee6,0x92722c851482353b 469 .quad 0x81c2c92e47edaee6,0x92722c851482353b 470 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 471 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 472 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 473 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 474 .quad 0xd192e819d6ef5218,0xd69906245565a910 475 .quad 0xd192e819d6ef5218,0xd69906245565a910 476 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 477 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 478 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 479 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 480 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 481 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 482 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 483 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 484 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 485 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 486 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 487 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 488 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 489 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 490 .quad 0x90befffa23631e28,0xa4506cebde82bde9 491 .quad 0x90befffa23631e28,0xa4506cebde82bde9 492 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 493 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 494 .quad 0xca273eceea26619c,0xd186b8c721c0c207 495 .quad 0xca273eceea26619c,0xd186b8c721c0c207 496 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 497 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 498 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 499 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 500 .quad 0x113f9804bef90dae,0x1b710b35131c471b 501 .quad 0x113f9804bef90dae,0x1b710b35131c471b 502 .quad 0x28db77f523047d84,0x32caab7b40c72493 503 .quad 0x28db77f523047d84,0x32caab7b40c72493 504 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 505 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 506 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 507 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 508 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 509 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 510 511 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 512 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 513 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 514.text 515___ 516} 517 518###################################################################### 519# SIMD code paths 520# 521if ($SZ==4 && $shaext) {{{ 522###################################################################### 523# Intel SHA Extensions implementation of SHA256 update function. 524# 525my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); 526 527my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); 528my @MSG=map("%xmm$_",(3..6)); 529 530$code.=<<___; 531.globl sha256_block_data_order_hw 532.type sha256_block_data_order_hw,\@function,3 533.align 64 534sha256_block_data_order_hw: 535.cfi_startproc 536 _CET_ENDBR 537___ 538$code.=<<___ if ($win64); 539 lea `-8-5*16`(%rsp),%rsp 540 movaps %xmm6,-8-5*16(%rax) 541 movaps %xmm7,-8-4*16(%rax) 542 movaps %xmm8,-8-3*16(%rax) 543 movaps %xmm9,-8-2*16(%rax) 544 movaps %xmm10,-8-1*16(%rax) 545.Lprologue_shaext: 546___ 547$code.=<<___; 548 lea K256+0x80(%rip),$Tbl 549 movdqu ($ctx),$ABEF # DCBA 550 movdqu 16($ctx),$CDGH # HGFE 551 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 552 553 pshufd \$0x1b,$ABEF,$Wi # ABCD 554 pshufd \$0xb1,$ABEF,$ABEF # CDAB 555 pshufd \$0x1b,$CDGH,$CDGH # EFGH 556 movdqa $TMP,$BSWAP # offload 557 palignr \$8,$CDGH,$ABEF # ABEF 558 punpcklqdq $Wi,$CDGH # CDGH 559 jmp .Loop_shaext 560 561.align 16 562.Loop_shaext: 563 movdqu ($inp),@MSG[0] 564 movdqu 0x10($inp),@MSG[1] 565 movdqu 0x20($inp),@MSG[2] 566 pshufb $TMP,@MSG[0] 567 movdqu 0x30($inp),@MSG[3] 568 569 movdqa 0*32-0x80($Tbl),$Wi 570 paddd @MSG[0],$Wi 571 pshufb $TMP,@MSG[1] 572 movdqa $CDGH,$CDGH_SAVE # offload 573 sha256rnds2 $ABEF,$CDGH # 0-3 574 pshufd \$0x0e,$Wi,$Wi 575 nop 576 movdqa $ABEF,$ABEF_SAVE # offload 577 sha256rnds2 $CDGH,$ABEF 578 579 movdqa 1*32-0x80($Tbl),$Wi 580 paddd @MSG[1],$Wi 581 pshufb $TMP,@MSG[2] 582 sha256rnds2 $ABEF,$CDGH # 4-7 583 pshufd \$0x0e,$Wi,$Wi 584 lea 0x40($inp),$inp 585 sha256msg1 @MSG[1],@MSG[0] 586 sha256rnds2 $CDGH,$ABEF 587 588 movdqa 2*32-0x80($Tbl),$Wi 589 paddd @MSG[2],$Wi 590 pshufb $TMP,@MSG[3] 591 sha256rnds2 $ABEF,$CDGH # 8-11 592 pshufd \$0x0e,$Wi,$Wi 593 movdqa @MSG[3],$TMP 594 palignr \$4,@MSG[2],$TMP 595 nop 596 paddd $TMP,@MSG[0] 597 sha256msg1 @MSG[2],@MSG[1] 598 sha256rnds2 $CDGH,$ABEF 599 600 movdqa 3*32-0x80($Tbl),$Wi 601 paddd @MSG[3],$Wi 602 sha256msg2 @MSG[3],@MSG[0] 603 sha256rnds2 $ABEF,$CDGH # 12-15 604 pshufd \$0x0e,$Wi,$Wi 605 movdqa @MSG[0],$TMP 606 palignr \$4,@MSG[3],$TMP 607 nop 608 paddd $TMP,@MSG[1] 609 sha256msg1 @MSG[3],@MSG[2] 610 sha256rnds2 $CDGH,$ABEF 611___ 612for($i=4;$i<16-3;$i++) { 613$code.=<<___; 614 movdqa $i*32-0x80($Tbl),$Wi 615 paddd @MSG[0],$Wi 616 sha256msg2 @MSG[0],@MSG[1] 617 sha256rnds2 $ABEF,$CDGH # 16-19... 618 pshufd \$0x0e,$Wi,$Wi 619 movdqa @MSG[1],$TMP 620 palignr \$4,@MSG[0],$TMP 621 nop 622 paddd $TMP,@MSG[2] 623 sha256msg1 @MSG[0],@MSG[3] 624 sha256rnds2 $CDGH,$ABEF 625___ 626 push(@MSG,shift(@MSG)); 627} 628$code.=<<___; 629 movdqa 13*32-0x80($Tbl),$Wi 630 paddd @MSG[0],$Wi 631 sha256msg2 @MSG[0],@MSG[1] 632 sha256rnds2 $ABEF,$CDGH # 52-55 633 pshufd \$0x0e,$Wi,$Wi 634 movdqa @MSG[1],$TMP 635 palignr \$4,@MSG[0],$TMP 636 sha256rnds2 $CDGH,$ABEF 637 paddd $TMP,@MSG[2] 638 639 movdqa 14*32-0x80($Tbl),$Wi 640 paddd @MSG[1],$Wi 641 sha256rnds2 $ABEF,$CDGH # 56-59 642 pshufd \$0x0e,$Wi,$Wi 643 sha256msg2 @MSG[1],@MSG[2] 644 movdqa $BSWAP,$TMP 645 sha256rnds2 $CDGH,$ABEF 646 647 movdqa 15*32-0x80($Tbl),$Wi 648 paddd @MSG[2],$Wi 649 nop 650 sha256rnds2 $ABEF,$CDGH # 60-63 651 pshufd \$0x0e,$Wi,$Wi 652 dec $num 653 nop 654 sha256rnds2 $CDGH,$ABEF 655 656 paddd $CDGH_SAVE,$CDGH 657 paddd $ABEF_SAVE,$ABEF 658 jnz .Loop_shaext 659 660 pshufd \$0xb1,$CDGH,$CDGH # DCHG 661 pshufd \$0x1b,$ABEF,$TMP # FEBA 662 pshufd \$0xb1,$ABEF,$ABEF # BAFE 663 punpckhqdq $CDGH,$ABEF # DCBA 664 palignr \$8,$TMP,$CDGH # HGFE 665 666 movdqu $ABEF,($ctx) 667 movdqu $CDGH,16($ctx) 668___ 669$code.=<<___ if ($win64); 670 movaps -8-5*16(%rax),%xmm6 671 movaps -8-4*16(%rax),%xmm7 672 movaps -8-3*16(%rax),%xmm8 673 movaps -8-2*16(%rax),%xmm9 674 movaps -8-1*16(%rax),%xmm10 675 mov %rax,%rsp 676.Lepilogue_shaext: 677___ 678$code.=<<___; 679 ret 680.cfi_endproc 681.size sha256_block_data_order_hw,.-sha256_block_data_order_hw 682___ 683}}} 684{{{ 685 686my $a4=$T1; 687my ($a,$b,$c,$d,$e,$f,$g,$h); 688 689sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 690{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 691 my $arg = pop; 692 $arg = "\$$arg" if ($arg*1 eq $arg); 693 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 694} 695 696sub body_00_15 () { 697 ( 698 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 699 700 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 701 '&mov ($a,$a1)', 702 '&mov ($a4,$f)', 703 704 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 705 '&xor ($a0,$e)', 706 '&xor ($a4,$g)', # f^g 707 708 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 709 '&xor ($a1,$a)', 710 '&and ($a4,$e)', # (f^g)&e 711 712 '&xor ($a0,$e)', 713 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 714 '&mov ($a2,$a)', 715 716 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 717 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 718 '&xor ($a2,$b)', # a^b, b^c in next round 719 720 '&add ($h,$a4)', # h+=Ch(e,f,g) 721 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 722 '&and ($a3,$a2)', # (b^c)&(a^b) 723 724 '&xor ($a1,$a)', 725 '&add ($h,$a0)', # h+=Sigma1(e) 726 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 727 728 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 729 '&add ($d,$h)', # d+=h 730 '&add ($h,$a3)', # h+=Maj(a,b,c) 731 732 '&mov ($a0,$d)', 733 '&add ($a1,$h);'. # h+=Sigma0(a) 734 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 735 ); 736} 737 738###################################################################### 739# SSSE3 code path 740# 741if ($SZ==4) { # SHA256 only 742my @X = map("%xmm$_",(0..3)); 743my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 744 745$code.=<<___; 746.globl ${func}_ssse3 747.type ${func}_ssse3,\@function,3 748.align 64 749${func}_ssse3: 750.cfi_startproc 751 _CET_ENDBR 752 mov %rsp,%rax # copy %rsp 753.cfi_def_cfa_register %rax 754 push %rbx 755.cfi_push %rbx 756 push %rbp 757.cfi_push %rbp 758 push %r12 759.cfi_push %r12 760 push %r13 761.cfi_push %r13 762 push %r14 763.cfi_push %r14 764 push %r15 765.cfi_push %r15 766 shl \$4,%rdx # num*16 767 sub \$`$framesz+$win64*16*4`,%rsp 768 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 769 and \$-64,%rsp # align stack frame 770 mov $ctx,$_ctx # save ctx, 1st arg 771 mov $inp,$_inp # save inp, 2nd arh 772 mov %rdx,$_end # save end pointer, "3rd" arg 773 mov %rax,$_rsp # save copy of %rsp 774.cfi_cfa_expression $_rsp,deref,+8 775___ 776$code.=<<___ if ($win64); 777 movaps %xmm6,16*$SZ+32(%rsp) 778 movaps %xmm7,16*$SZ+48(%rsp) 779 movaps %xmm8,16*$SZ+64(%rsp) 780 movaps %xmm9,16*$SZ+80(%rsp) 781___ 782$code.=<<___; 783.Lprologue_ssse3: 784 785 mov $SZ*0($ctx),$A 786 mov $SZ*1($ctx),$B 787 mov $SZ*2($ctx),$C 788 mov $SZ*3($ctx),$D 789 mov $SZ*4($ctx),$E 790 mov $SZ*5($ctx),$F 791 mov $SZ*6($ctx),$G 792 mov $SZ*7($ctx),$H 793___ 794 795$code.=<<___; 796 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 797 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 798 jmp .Lloop_ssse3 799.align 16 800.Lloop_ssse3: 801 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 802 movdqu 0x00($inp),@X[0] 803 movdqu 0x10($inp),@X[1] 804 movdqu 0x20($inp),@X[2] 805 pshufb $t3,@X[0] 806 movdqu 0x30($inp),@X[3] 807 lea $TABLE(%rip),$Tbl 808 pshufb $t3,@X[1] 809 movdqa 0x00($Tbl),$t0 810 movdqa 0x20($Tbl),$t1 811 pshufb $t3,@X[2] 812 paddd @X[0],$t0 813 movdqa 0x40($Tbl),$t2 814 pshufb $t3,@X[3] 815 movdqa 0x60($Tbl),$t3 816 paddd @X[1],$t1 817 paddd @X[2],$t2 818 paddd @X[3],$t3 819 movdqa $t0,0x00(%rsp) 820 mov $A,$a1 821 movdqa $t1,0x10(%rsp) 822 mov $B,$a3 823 movdqa $t2,0x20(%rsp) 824 xor $C,$a3 # magic 825 movdqa $t3,0x30(%rsp) 826 mov $E,$a0 827 jmp .Lssse3_00_47 828 829.align 16 830.Lssse3_00_47: 831 sub \$`-16*2*$SZ`,$Tbl # size optimization 832___ 833sub Xupdate_256_SSSE3 () { 834 ( 835 '&movdqa ($t0,@X[1]);', 836 '&movdqa ($t3,@X[3])', 837 '&palignr ($t0,@X[0],$SZ)', # X[1..4] 838 '&palignr ($t3,@X[2],$SZ);', # X[9..12] 839 '&movdqa ($t1,$t0)', 840 '&movdqa ($t2,$t0);', 841 '&psrld ($t0,$sigma0[2])', 842 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] 843 '&psrld ($t2,$sigma0[0])', 844 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] 845 '&pslld ($t1,8*$SZ-$sigma0[1]);'. 846 '&pxor ($t0,$t2)', 847 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. 848 '&pxor ($t0,$t1)', 849 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 850 '&pxor ($t0,$t2);', 851 '&movdqa ($t2,$t3)', 852 '&pxor ($t0,$t1);', # sigma0(X[1..4]) 853 '&psrld ($t3,$sigma1[2])', 854 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 855 '&psrlq ($t2,$sigma1[0])', 856 '&pxor ($t3,$t2);', 857 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 858 '&pxor ($t3,$t2)', 859 '&pshufb ($t3,$t4)', # sigma1(X[14..15]) 860 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 861 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] 862 '&movdqa ($t2,$t3);', 863 '&psrld ($t3,$sigma1[2])', 864 '&psrlq ($t2,$sigma1[0])', 865 '&pxor ($t3,$t2);', 866 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 867 '&pxor ($t3,$t2);', 868 '&movdqa ($t2,16*2*$j."($Tbl)")', 869 '&pshufb ($t3,$t5)', 870 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) 871 ); 872} 873 874sub SSSE3_256_00_47 () { 875my $j = shift; 876my $body = shift; 877my @X = @_; 878my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 879 880 if (0) { 881 foreach (Xupdate_256_SSSE3()) { # 36 instructions 882 eval; 883 eval(shift(@insns)); 884 eval(shift(@insns)); 885 eval(shift(@insns)); 886 } 887 } else { # squeeze extra 4% on Westmere and 19% on Atom 888 eval(shift(@insns)); #@ 889 &movdqa ($t0,@X[1]); 890 eval(shift(@insns)); 891 eval(shift(@insns)); 892 &movdqa ($t3,@X[3]); 893 eval(shift(@insns)); #@ 894 eval(shift(@insns)); 895 eval(shift(@insns)); 896 eval(shift(@insns)); #@ 897 eval(shift(@insns)); 898 &palignr ($t0,@X[0],$SZ); # X[1..4] 899 eval(shift(@insns)); 900 eval(shift(@insns)); 901 &palignr ($t3,@X[2],$SZ); # X[9..12] 902 eval(shift(@insns)); 903 eval(shift(@insns)); 904 eval(shift(@insns)); 905 eval(shift(@insns)); #@ 906 &movdqa ($t1,$t0); 907 eval(shift(@insns)); 908 eval(shift(@insns)); 909 &movdqa ($t2,$t0); 910 eval(shift(@insns)); #@ 911 eval(shift(@insns)); 912 &psrld ($t0,$sigma0[2]); 913 eval(shift(@insns)); 914 eval(shift(@insns)); 915 eval(shift(@insns)); 916 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 917 eval(shift(@insns)); #@ 918 eval(shift(@insns)); 919 &psrld ($t2,$sigma0[0]); 920 eval(shift(@insns)); 921 eval(shift(@insns)); 922 &pshufd ($t3,@X[3],0b11111010); # X[4..15] 923 eval(shift(@insns)); 924 eval(shift(@insns)); #@ 925 &pslld ($t1,8*$SZ-$sigma0[1]); 926 eval(shift(@insns)); 927 eval(shift(@insns)); 928 &pxor ($t0,$t2); 929 eval(shift(@insns)); #@ 930 eval(shift(@insns)); 931 eval(shift(@insns)); 932 eval(shift(@insns)); #@ 933 &psrld ($t2,$sigma0[1]-$sigma0[0]); 934 eval(shift(@insns)); 935 &pxor ($t0,$t1); 936 eval(shift(@insns)); 937 eval(shift(@insns)); 938 &pslld ($t1,$sigma0[1]-$sigma0[0]); 939 eval(shift(@insns)); 940 eval(shift(@insns)); 941 &pxor ($t0,$t2); 942 eval(shift(@insns)); 943 eval(shift(@insns)); #@ 944 &movdqa ($t2,$t3); 945 eval(shift(@insns)); 946 eval(shift(@insns)); 947 &pxor ($t0,$t1); # sigma0(X[1..4]) 948 eval(shift(@insns)); #@ 949 eval(shift(@insns)); 950 eval(shift(@insns)); 951 &psrld ($t3,$sigma1[2]); 952 eval(shift(@insns)); 953 eval(shift(@insns)); 954 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 955 eval(shift(@insns)); #@ 956 eval(shift(@insns)); 957 &psrlq ($t2,$sigma1[0]); 958 eval(shift(@insns)); 959 eval(shift(@insns)); 960 eval(shift(@insns)); 961 &pxor ($t3,$t2); 962 eval(shift(@insns)); #@ 963 eval(shift(@insns)); 964 eval(shift(@insns)); 965 eval(shift(@insns)); #@ 966 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 967 eval(shift(@insns)); 968 eval(shift(@insns)); 969 &pxor ($t3,$t2); 970 eval(shift(@insns)); #@ 971 eval(shift(@insns)); 972 eval(shift(@insns)); 973 #&pshufb ($t3,$t4); # sigma1(X[14..15]) 974 &pshufd ($t3,$t3,0b10000000); 975 eval(shift(@insns)); 976 eval(shift(@insns)); 977 eval(shift(@insns)); 978 &psrldq ($t3,8); 979 eval(shift(@insns)); 980 eval(shift(@insns)); #@ 981 eval(shift(@insns)); 982 eval(shift(@insns)); 983 eval(shift(@insns)); #@ 984 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 985 eval(shift(@insns)); 986 eval(shift(@insns)); 987 eval(shift(@insns)); 988 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 989 eval(shift(@insns)); 990 eval(shift(@insns)); #@ 991 eval(shift(@insns)); 992 &movdqa ($t2,$t3); 993 eval(shift(@insns)); 994 eval(shift(@insns)); 995 &psrld ($t3,$sigma1[2]); 996 eval(shift(@insns)); 997 eval(shift(@insns)); #@ 998 &psrlq ($t2,$sigma1[0]); 999 eval(shift(@insns)); 1000 eval(shift(@insns)); 1001 &pxor ($t3,$t2); 1002 eval(shift(@insns)); #@ 1003 eval(shift(@insns)); 1004 eval(shift(@insns)); 1005 eval(shift(@insns)); #@ 1006 eval(shift(@insns)); 1007 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 1008 eval(shift(@insns)); 1009 eval(shift(@insns)); 1010 eval(shift(@insns)); 1011 &pxor ($t3,$t2); 1012 eval(shift(@insns)); 1013 eval(shift(@insns)); 1014 eval(shift(@insns)); #@ 1015 #&pshufb ($t3,$t5); 1016 &pshufd ($t3,$t3,0b00001000); 1017 eval(shift(@insns)); 1018 eval(shift(@insns)); 1019 &movdqa ($t2,16*2*$j."($Tbl)"); 1020 eval(shift(@insns)); #@ 1021 eval(shift(@insns)); 1022 &pslldq ($t3,8); 1023 eval(shift(@insns)); 1024 eval(shift(@insns)); 1025 eval(shift(@insns)); 1026 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1027 eval(shift(@insns)); #@ 1028 eval(shift(@insns)); 1029 eval(shift(@insns)); 1030 } 1031 &paddd ($t2,@X[0]); 1032 foreach (@insns) { eval; } # remaining instructions 1033 &movdqa (16*$j."(%rsp)",$t2); 1034} 1035 1036 for ($i=0,$j=0; $j<4; $j++) { 1037 &SSSE3_256_00_47($j,\&body_00_15,@X); 1038 push(@X,shift(@X)); # rotate(@X) 1039 } 1040 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1041 &jne (".Lssse3_00_47"); 1042 1043 for ($i=0; $i<16; ) { 1044 foreach(body_00_15()) { eval; } 1045 } 1046$code.=<<___; 1047 mov $_ctx,$ctx 1048 mov $a1,$A 1049 1050 add $SZ*0($ctx),$A 1051 lea 16*$SZ($inp),$inp 1052 add $SZ*1($ctx),$B 1053 add $SZ*2($ctx),$C 1054 add $SZ*3($ctx),$D 1055 add $SZ*4($ctx),$E 1056 add $SZ*5($ctx),$F 1057 add $SZ*6($ctx),$G 1058 add $SZ*7($ctx),$H 1059 1060 cmp $_end,$inp 1061 1062 mov $A,$SZ*0($ctx) 1063 mov $B,$SZ*1($ctx) 1064 mov $C,$SZ*2($ctx) 1065 mov $D,$SZ*3($ctx) 1066 mov $E,$SZ*4($ctx) 1067 mov $F,$SZ*5($ctx) 1068 mov $G,$SZ*6($ctx) 1069 mov $H,$SZ*7($ctx) 1070 jb .Lloop_ssse3 1071 1072 mov $_rsp,%rsi 1073.cfi_def_cfa %rsi,8 1074___ 1075$code.=<<___ if ($win64); 1076 movaps 16*$SZ+32(%rsp),%xmm6 1077 movaps 16*$SZ+48(%rsp),%xmm7 1078 movaps 16*$SZ+64(%rsp),%xmm8 1079 movaps 16*$SZ+80(%rsp),%xmm9 1080___ 1081$code.=<<___; 1082 mov -48(%rsi),%r15 1083.cfi_restore %r15 1084 mov -40(%rsi),%r14 1085.cfi_restore %r14 1086 mov -32(%rsi),%r13 1087.cfi_restore %r13 1088 mov -24(%rsi),%r12 1089.cfi_restore %r12 1090 mov -16(%rsi),%rbp 1091.cfi_restore %rbp 1092 mov -8(%rsi),%rbx 1093.cfi_restore %rbx 1094 lea (%rsi),%rsp 1095.cfi_def_cfa_register %rsp 1096.Lepilogue_ssse3: 1097 ret 1098.cfi_endproc 1099.size ${func}_ssse3,.-${func}_ssse3 1100___ 1101} 1102 1103if ($avx) {{ 1104###################################################################### 1105# AVX+shrd code path 1106# 1107local *ror = sub { &shrd(@_[0],@_) }; 1108 1109$code.=<<___; 1110.globl ${func}_avx 1111.type ${func}_avx,\@function,3 1112.align 64 1113${func}_avx: 1114.cfi_startproc 1115 _CET_ENDBR 1116 mov %rsp,%rax # copy %rsp 1117.cfi_def_cfa_register %rax 1118 push %rbx 1119.cfi_push %rbx 1120 push %rbp 1121.cfi_push %rbp 1122 push %r12 1123.cfi_push %r12 1124 push %r13 1125.cfi_push %r13 1126 push %r14 1127.cfi_push %r14 1128 push %r15 1129.cfi_push %r15 1130 shl \$4,%rdx # num*16 1131 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1132 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1133 and \$-64,%rsp # align stack frame 1134 mov $ctx,$_ctx # save ctx, 1st arg 1135 mov $inp,$_inp # save inp, 2nd arh 1136 mov %rdx,$_end # save end pointer, "3rd" arg 1137 mov %rax,$_rsp # save copy of %rsp 1138.cfi_cfa_expression $_rsp,deref,+8 1139___ 1140$code.=<<___ if ($win64); 1141 movaps %xmm6,16*$SZ+32(%rsp) 1142 movaps %xmm7,16*$SZ+48(%rsp) 1143 movaps %xmm8,16*$SZ+64(%rsp) 1144 movaps %xmm9,16*$SZ+80(%rsp) 1145___ 1146$code.=<<___ if ($win64 && $SZ>4); 1147 movaps %xmm10,16*$SZ+96(%rsp) 1148 movaps %xmm11,16*$SZ+112(%rsp) 1149___ 1150$code.=<<___; 1151.Lprologue_avx: 1152 1153 vzeroupper 1154 mov $SZ*0($ctx),$A 1155 mov $SZ*1($ctx),$B 1156 mov $SZ*2($ctx),$C 1157 mov $SZ*3($ctx),$D 1158 mov $SZ*4($ctx),$E 1159 mov $SZ*5($ctx),$F 1160 mov $SZ*6($ctx),$G 1161 mov $SZ*7($ctx),$H 1162___ 1163 if ($SZ==4) { # SHA256 1164 my @X = map("%xmm$_",(0..3)); 1165 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 1166 1167$code.=<<___; 1168 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1169 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1170 jmp .Lloop_avx 1171.align 16 1172.Lloop_avx: 1173 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1174 vmovdqu 0x00($inp),@X[0] 1175 vmovdqu 0x10($inp),@X[1] 1176 vmovdqu 0x20($inp),@X[2] 1177 vmovdqu 0x30($inp),@X[3] 1178 vpshufb $t3,@X[0],@X[0] 1179 lea $TABLE(%rip),$Tbl 1180 vpshufb $t3,@X[1],@X[1] 1181 vpshufb $t3,@X[2],@X[2] 1182 vpaddd 0x00($Tbl),@X[0],$t0 1183 vpshufb $t3,@X[3],@X[3] 1184 vpaddd 0x20($Tbl),@X[1],$t1 1185 vpaddd 0x40($Tbl),@X[2],$t2 1186 vpaddd 0x60($Tbl),@X[3],$t3 1187 vmovdqa $t0,0x00(%rsp) 1188 mov $A,$a1 1189 vmovdqa $t1,0x10(%rsp) 1190 mov $B,$a3 1191 vmovdqa $t2,0x20(%rsp) 1192 xor $C,$a3 # magic 1193 vmovdqa $t3,0x30(%rsp) 1194 mov $E,$a0 1195 jmp .Lavx_00_47 1196 1197.align 16 1198.Lavx_00_47: 1199 sub \$`-16*2*$SZ`,$Tbl # size optimization 1200___ 1201sub Xupdate_256_AVX () { 1202 ( 1203 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 1204 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 1205 '&vpsrld ($t2,$t0,$sigma0[0]);', 1206 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 1207 '&vpsrld ($t3,$t0,$sigma0[2])', 1208 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 1209 '&vpxor ($t0,$t3,$t2)', 1210 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1211 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1212 '&vpxor ($t0,$t0,$t1)', 1213 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1214 '&vpxor ($t0,$t0,$t2)', 1215 '&vpsrld ($t2,$t3,$sigma1[2]);', 1216 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 1217 '&vpsrlq ($t3,$t3,$sigma1[0]);', 1218 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 1219 '&vpxor ($t2,$t2,$t3);', 1220 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1221 '&vpxor ($t2,$t2,$t3)', 1222 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) 1223 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 1224 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1225 '&vpsrld ($t2,$t3,$sigma1[2])', 1226 '&vpsrlq ($t3,$t3,$sigma1[0])', 1227 '&vpxor ($t2,$t2,$t3);', 1228 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1229 '&vpxor ($t2,$t2,$t3)', 1230 '&vpshufb ($t2,$t2,$t5)', 1231 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 1232 ); 1233} 1234 1235sub AVX_256_00_47 () { 1236my $j = shift; 1237my $body = shift; 1238my @X = @_; 1239my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1240 1241 foreach (Xupdate_256_AVX()) { # 29 instructions 1242 eval; 1243 eval(shift(@insns)); 1244 eval(shift(@insns)); 1245 eval(shift(@insns)); 1246 } 1247 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1248 foreach (@insns) { eval; } # remaining instructions 1249 &vmovdqa (16*$j."(%rsp)",$t2); 1250} 1251 1252 for ($i=0,$j=0; $j<4; $j++) { 1253 &AVX_256_00_47($j,\&body_00_15,@X); 1254 push(@X,shift(@X)); # rotate(@X) 1255 } 1256 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1257 &jne (".Lavx_00_47"); 1258 1259 for ($i=0; $i<16; ) { 1260 foreach(body_00_15()) { eval; } 1261 } 1262 1263 } else { # SHA512 1264 my @X = map("%xmm$_",(0..7)); 1265 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1266 1267$code.=<<___; 1268 jmp .Lloop_avx 1269.align 16 1270.Lloop_avx: 1271 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1272 vmovdqu 0x00($inp),@X[0] 1273 lea $TABLE+0x80(%rip),$Tbl # size optimization 1274 vmovdqu 0x10($inp),@X[1] 1275 vmovdqu 0x20($inp),@X[2] 1276 vpshufb $t3,@X[0],@X[0] 1277 vmovdqu 0x30($inp),@X[3] 1278 vpshufb $t3,@X[1],@X[1] 1279 vmovdqu 0x40($inp),@X[4] 1280 vpshufb $t3,@X[2],@X[2] 1281 vmovdqu 0x50($inp),@X[5] 1282 vpshufb $t3,@X[3],@X[3] 1283 vmovdqu 0x60($inp),@X[6] 1284 vpshufb $t3,@X[4],@X[4] 1285 vmovdqu 0x70($inp),@X[7] 1286 vpshufb $t3,@X[5],@X[5] 1287 vpaddq -0x80($Tbl),@X[0],$t0 1288 vpshufb $t3,@X[6],@X[6] 1289 vpaddq -0x60($Tbl),@X[1],$t1 1290 vpshufb $t3,@X[7],@X[7] 1291 vpaddq -0x40($Tbl),@X[2],$t2 1292 vpaddq -0x20($Tbl),@X[3],$t3 1293 vmovdqa $t0,0x00(%rsp) 1294 vpaddq 0x00($Tbl),@X[4],$t0 1295 vmovdqa $t1,0x10(%rsp) 1296 vpaddq 0x20($Tbl),@X[5],$t1 1297 vmovdqa $t2,0x20(%rsp) 1298 vpaddq 0x40($Tbl),@X[6],$t2 1299 vmovdqa $t3,0x30(%rsp) 1300 vpaddq 0x60($Tbl),@X[7],$t3 1301 vmovdqa $t0,0x40(%rsp) 1302 mov $A,$a1 1303 vmovdqa $t1,0x50(%rsp) 1304 mov $B,$a3 1305 vmovdqa $t2,0x60(%rsp) 1306 xor $C,$a3 # magic 1307 vmovdqa $t3,0x70(%rsp) 1308 mov $E,$a0 1309 jmp .Lavx_00_47 1310 1311.align 16 1312.Lavx_00_47: 1313 add \$`16*2*$SZ`,$Tbl 1314___ 1315sub Xupdate_512_AVX () { 1316 ( 1317 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] 1318 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] 1319 '&vpsrlq ($t2,$t0,$sigma0[0])', 1320 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] 1321 '&vpsrlq ($t3,$t0,$sigma0[2])', 1322 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', 1323 '&vpxor ($t0,$t3,$t2)', 1324 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1325 '&vpxor ($t0,$t0,$t1)', 1326 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1327 '&vpxor ($t0,$t0,$t2)', 1328 '&vpsrlq ($t3,@X[7],$sigma1[2]);', 1329 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) 1330 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', 1331 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) 1332 '&vpsrlq ($t1,@X[7],$sigma1[0]);', 1333 '&vpxor ($t3,$t3,$t2)', 1334 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', 1335 '&vpxor ($t3,$t3,$t1)', 1336 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', 1337 '&vpxor ($t3,$t3,$t2)', 1338 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) 1339 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 1340 ); 1341} 1342 1343sub AVX_512_00_47 () { 1344my $j = shift; 1345my $body = shift; 1346my @X = @_; 1347my @insns = (&$body,&$body); # 52 instructions 1348 1349 foreach (Xupdate_512_AVX()) { # 23 instructions 1350 eval; 1351 eval(shift(@insns)); 1352 eval(shift(@insns)); 1353 } 1354 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1355 foreach (@insns) { eval; } # remaining instructions 1356 &vmovdqa (16*$j."(%rsp)",$t2); 1357} 1358 1359 for ($i=0,$j=0; $j<8; $j++) { 1360 &AVX_512_00_47($j,\&body_00_15,@X); 1361 push(@X,shift(@X)); # rotate(@X) 1362 } 1363 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1364 &jne (".Lavx_00_47"); 1365 1366 for ($i=0; $i<16; ) { 1367 foreach(body_00_15()) { eval; } 1368 } 1369} 1370$code.=<<___; 1371 mov $_ctx,$ctx 1372 mov $a1,$A 1373 1374 add $SZ*0($ctx),$A 1375 lea 16*$SZ($inp),$inp 1376 add $SZ*1($ctx),$B 1377 add $SZ*2($ctx),$C 1378 add $SZ*3($ctx),$D 1379 add $SZ*4($ctx),$E 1380 add $SZ*5($ctx),$F 1381 add $SZ*6($ctx),$G 1382 add $SZ*7($ctx),$H 1383 1384 cmp $_end,$inp 1385 1386 mov $A,$SZ*0($ctx) 1387 mov $B,$SZ*1($ctx) 1388 mov $C,$SZ*2($ctx) 1389 mov $D,$SZ*3($ctx) 1390 mov $E,$SZ*4($ctx) 1391 mov $F,$SZ*5($ctx) 1392 mov $G,$SZ*6($ctx) 1393 mov $H,$SZ*7($ctx) 1394 jb .Lloop_avx 1395 1396 mov $_rsp,%rsi 1397.cfi_def_cfa %rsi,8 1398 vzeroupper 1399___ 1400$code.=<<___ if ($win64); 1401 movaps 16*$SZ+32(%rsp),%xmm6 1402 movaps 16*$SZ+48(%rsp),%xmm7 1403 movaps 16*$SZ+64(%rsp),%xmm8 1404 movaps 16*$SZ+80(%rsp),%xmm9 1405___ 1406$code.=<<___ if ($win64 && $SZ>4); 1407 movaps 16*$SZ+96(%rsp),%xmm10 1408 movaps 16*$SZ+112(%rsp),%xmm11 1409___ 1410$code.=<<___; 1411 mov -48(%rsi),%r15 1412.cfi_restore %r15 1413 mov -40(%rsi),%r14 1414.cfi_restore %r14 1415 mov -32(%rsi),%r13 1416.cfi_restore %r13 1417 mov -24(%rsi),%r12 1418.cfi_restore %r12 1419 mov -16(%rsi),%rbp 1420.cfi_restore %rbp 1421 mov -8(%rsi),%rbx 1422.cfi_restore %rbx 1423 lea (%rsi),%rsp 1424.cfi_def_cfa_register %rsp 1425.Lepilogue_avx: 1426 ret 1427.cfi_endproc 1428.size ${func}_avx,.-${func}_avx 1429___ 1430 1431if ($avx>1) {{ 1432###################################################################### 1433# AVX2+BMI code path 1434# 1435my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 1436my $PUSH8=8*2*$SZ; 1437use integer; 1438 1439sub bodyx_00_15 () { 1440 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 1441 ( 1442 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 1443 1444 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 1445 '&and ($a4,$e)', # f&e 1446 '&rorx ($a0,$e,$Sigma1[2])', 1447 '&rorx ($a2,$e,$Sigma1[1])', 1448 1449 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 1450 '&lea ($h,"($h,$a4)")', 1451 '&andn ($a4,$e,$g)', # ~e&g 1452 '&xor ($a0,$a2)', 1453 1454 '&rorx ($a1,$e,$Sigma1[0])', 1455 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 1456 '&xor ($a0,$a1)', # Sigma1(e) 1457 '&mov ($a2,$a)', 1458 1459 '&rorx ($a4,$a,$Sigma0[2])', 1460 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 1461 '&xor ($a2,$b)', # a^b, b^c in next round 1462 '&rorx ($a1,$a,$Sigma0[1])', 1463 1464 '&rorx ($a0,$a,$Sigma0[0])', 1465 '&lea ($d,"($d,$h)")', # d+=h 1466 '&and ($a3,$a2)', # (b^c)&(a^b) 1467 '&xor ($a1,$a4)', 1468 1469 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 1470 '&xor ($a1,$a0)', # Sigma0(a) 1471 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) 1472 '&mov ($a4,$e)', # copy of f in future 1473 1474 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 1475 ); 1476 # and at the finish one has to $a+=$a1 1477} 1478 1479$code.=<<___; 1480.type ${func}_avx2,\@function,3 1481.align 64 1482${func}_avx2: 1483.cfi_startproc 1484.Lavx2_shortcut: 1485 mov %rsp,%rax # copy %rsp 1486.cfi_def_cfa_register %rax 1487 push %rbx 1488.cfi_push %rbx 1489 push %rbp 1490.cfi_push %rbp 1491 push %r12 1492.cfi_push %r12 1493 push %r13 1494.cfi_push %r13 1495 push %r14 1496.cfi_push %r14 1497 push %r15 1498.cfi_push %r15 1499 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp 1500 shl \$4,%rdx # num*16 1501 and \$-256*$SZ,%rsp # align stack frame 1502 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1503 add \$`2*$SZ*($rounds-8)`,%rsp 1504 mov $ctx,$_ctx # save ctx, 1st arg 1505 mov $inp,$_inp # save inp, 2nd arh 1506 mov %rdx,$_end # save end pointer, "3rd" arg 1507 mov %rax,$_rsp # save copy of %rsp 1508.cfi_cfa_expression $_rsp,deref,+8 1509___ 1510$code.=<<___ if ($win64); 1511 movaps %xmm6,16*$SZ+32(%rsp) 1512 movaps %xmm7,16*$SZ+48(%rsp) 1513 movaps %xmm8,16*$SZ+64(%rsp) 1514 movaps %xmm9,16*$SZ+80(%rsp) 1515___ 1516$code.=<<___ if ($win64 && $SZ>4); 1517 movaps %xmm10,16*$SZ+96(%rsp) 1518 movaps %xmm11,16*$SZ+112(%rsp) 1519___ 1520$code.=<<___; 1521.Lprologue_avx2: 1522 1523 vzeroupper 1524 sub \$-16*$SZ,$inp # inp++, size optimization 1525 mov $SZ*0($ctx),$A 1526 mov $inp,%r12 # borrow $T1 1527 mov $SZ*1($ctx),$B 1528 cmp %rdx,$inp # $_end 1529 mov $SZ*2($ctx),$C 1530 cmove %rsp,%r12 # next block or random data 1531 mov $SZ*3($ctx),$D 1532 mov $SZ*4($ctx),$E 1533 mov $SZ*5($ctx),$F 1534 mov $SZ*6($ctx),$G 1535 mov $SZ*7($ctx),$H 1536___ 1537 if ($SZ==4) { # SHA256 1538 my @X = map("%ymm$_",(0..3)); 1539 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); 1540 1541$code.=<<___; 1542 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1543 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1544 jmp .Loop_avx2 1545.align 16 1546.Loop_avx2: 1547 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1548 vmovdqu -16*$SZ+0($inp),%xmm0 1549 vmovdqu -16*$SZ+16($inp),%xmm1 1550 vmovdqu -16*$SZ+32($inp),%xmm2 1551 vmovdqu -16*$SZ+48($inp),%xmm3 1552 #mov $inp,$_inp # offload $inp 1553 vinserti128 \$1,(%r12),@X[0],@X[0] 1554 vinserti128 \$1,16(%r12),@X[1],@X[1] 1555 vpshufb $t3,@X[0],@X[0] 1556 vinserti128 \$1,32(%r12),@X[2],@X[2] 1557 vpshufb $t3,@X[1],@X[1] 1558 vinserti128 \$1,48(%r12),@X[3],@X[3] 1559 1560 lea $TABLE(%rip),$Tbl 1561 vpshufb $t3,@X[2],@X[2] 1562 vpaddd 0x00($Tbl),@X[0],$t0 1563 vpshufb $t3,@X[3],@X[3] 1564 vpaddd 0x20($Tbl),@X[1],$t1 1565 vpaddd 0x40($Tbl),@X[2],$t2 1566 vpaddd 0x60($Tbl),@X[3],$t3 1567 vmovdqa $t0,0x00(%rsp) 1568 xor $a1,$a1 1569 vmovdqa $t1,0x20(%rsp) 1570 lea -$PUSH8(%rsp),%rsp 1571 mov $B,$a3 1572 vmovdqa $t2,0x00(%rsp) 1573 xor $C,$a3 # magic 1574 vmovdqa $t3,0x20(%rsp) 1575 mov $F,$a4 1576 sub \$-16*2*$SZ,$Tbl # size optimization 1577 jmp .Lavx2_00_47 1578 1579.align 16 1580.Lavx2_00_47: 1581___ 1582 1583sub AVX2_256_00_47 () { 1584my $j = shift; 1585my $body = shift; 1586my @X = @_; 1587my @insns = (&$body,&$body,&$body,&$body); # 96 instructions 1588my $base = "+2*$PUSH8(%rsp)"; 1589 1590 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); 1591 foreach (Xupdate_256_AVX()) { # 29 instructions 1592 eval; 1593 eval(shift(@insns)); 1594 eval(shift(@insns)); 1595 eval(shift(@insns)); 1596 } 1597 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1598 foreach (@insns) { eval; } # remaining instructions 1599 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 1600} 1601 1602 for ($i=0,$j=0; $j<4; $j++) { 1603 &AVX2_256_00_47($j,\&bodyx_00_15,@X); 1604 push(@X,shift(@X)); # rotate(@X) 1605 } 1606 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1607 &cmpb (($SZ-1)."($Tbl)",0); 1608 &jne (".Lavx2_00_47"); 1609 1610 for ($i=0; $i<16; ) { 1611 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1612 foreach(bodyx_00_15()) { eval; } 1613 } 1614 } else { # SHA512 1615 my @X = map("%ymm$_",(0..7)); 1616 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); 1617 1618$code.=<<___; 1619 jmp .Loop_avx2 1620.align 16 1621.Loop_avx2: 1622 vmovdqu -16*$SZ($inp),%xmm0 1623 vmovdqu -16*$SZ+16($inp),%xmm1 1624 vmovdqu -16*$SZ+32($inp),%xmm2 1625 lea $TABLE+0x80(%rip),$Tbl # size optimization 1626 vmovdqu -16*$SZ+48($inp),%xmm3 1627 vmovdqu -16*$SZ+64($inp),%xmm4 1628 vmovdqu -16*$SZ+80($inp),%xmm5 1629 vmovdqu -16*$SZ+96($inp),%xmm6 1630 vmovdqu -16*$SZ+112($inp),%xmm7 1631 #mov $inp,$_inp # offload $inp 1632 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 1633 vinserti128 \$1,(%r12),@X[0],@X[0] 1634 vinserti128 \$1,16(%r12),@X[1],@X[1] 1635 vpshufb $t2,@X[0],@X[0] 1636 vinserti128 \$1,32(%r12),@X[2],@X[2] 1637 vpshufb $t2,@X[1],@X[1] 1638 vinserti128 \$1,48(%r12),@X[3],@X[3] 1639 vpshufb $t2,@X[2],@X[2] 1640 vinserti128 \$1,64(%r12),@X[4],@X[4] 1641 vpshufb $t2,@X[3],@X[3] 1642 vinserti128 \$1,80(%r12),@X[5],@X[5] 1643 vpshufb $t2,@X[4],@X[4] 1644 vinserti128 \$1,96(%r12),@X[6],@X[6] 1645 vpshufb $t2,@X[5],@X[5] 1646 vinserti128 \$1,112(%r12),@X[7],@X[7] 1647 1648 vpaddq -0x80($Tbl),@X[0],$t0 1649 vpshufb $t2,@X[6],@X[6] 1650 vpaddq -0x60($Tbl),@X[1],$t1 1651 vpshufb $t2,@X[7],@X[7] 1652 vpaddq -0x40($Tbl),@X[2],$t2 1653 vpaddq -0x20($Tbl),@X[3],$t3 1654 vmovdqa $t0,0x00(%rsp) 1655 vpaddq 0x00($Tbl),@X[4],$t0 1656 vmovdqa $t1,0x20(%rsp) 1657 vpaddq 0x20($Tbl),@X[5],$t1 1658 vmovdqa $t2,0x40(%rsp) 1659 vpaddq 0x40($Tbl),@X[6],$t2 1660 vmovdqa $t3,0x60(%rsp) 1661 lea -$PUSH8(%rsp),%rsp 1662 vpaddq 0x60($Tbl),@X[7],$t3 1663 vmovdqa $t0,0x00(%rsp) 1664 xor $a1,$a1 1665 vmovdqa $t1,0x20(%rsp) 1666 mov $B,$a3 1667 vmovdqa $t2,0x40(%rsp) 1668 xor $C,$a3 # magic 1669 vmovdqa $t3,0x60(%rsp) 1670 mov $F,$a4 1671 add \$16*2*$SZ,$Tbl 1672 jmp .Lavx2_00_47 1673 1674.align 16 1675.Lavx2_00_47: 1676___ 1677 1678sub AVX2_512_00_47 () { 1679my $j = shift; 1680my $body = shift; 1681my @X = @_; 1682my @insns = (&$body,&$body); # 48 instructions 1683my $base = "+2*$PUSH8(%rsp)"; 1684 1685 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0); 1686 foreach (Xupdate_512_AVX()) { # 23 instructions 1687 eval; 1688 if ($_ !~ /\;$/) { 1689 eval(shift(@insns)); 1690 eval(shift(@insns)); 1691 eval(shift(@insns)); 1692 } 1693 } 1694 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1695 foreach (@insns) { eval; } # remaining instructions 1696 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 1697} 1698 1699 for ($i=0,$j=0; $j<8; $j++) { 1700 &AVX2_512_00_47($j,\&bodyx_00_15,@X); 1701 push(@X,shift(@X)); # rotate(@X) 1702 } 1703 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1704 &cmpb (($SZ-1-0x80)."($Tbl)",0); 1705 &jne (".Lavx2_00_47"); 1706 1707 for ($i=0; $i<16; ) { 1708 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1709 foreach(bodyx_00_15()) { eval; } 1710 } 1711} 1712$code.=<<___; 1713 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 1714 add $a1,$A 1715 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 1716 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl 1717 1718 add $SZ*0($ctx),$A 1719 add $SZ*1($ctx),$B 1720 add $SZ*2($ctx),$C 1721 add $SZ*3($ctx),$D 1722 add $SZ*4($ctx),$E 1723 add $SZ*5($ctx),$F 1724 add $SZ*6($ctx),$G 1725 add $SZ*7($ctx),$H 1726 1727 mov $A,$SZ*0($ctx) 1728 mov $B,$SZ*1($ctx) 1729 mov $C,$SZ*2($ctx) 1730 mov $D,$SZ*3($ctx) 1731 mov $E,$SZ*4($ctx) 1732 mov $F,$SZ*5($ctx) 1733 mov $G,$SZ*6($ctx) 1734 mov $H,$SZ*7($ctx) 1735 1736 cmp `$PUSH8+2*8`($Tbl),$inp # $_end 1737 je .Ldone_avx2 1738 1739 xor $a1,$a1 1740 mov $B,$a3 1741 xor $C,$a3 # magic 1742 mov $F,$a4 1743 jmp .Lower_avx2 1744.align 16 1745.Lower_avx2: 1746___ 1747 for ($i=0; $i<8; ) { 1748 my $base="+16($Tbl)"; 1749 foreach(bodyx_00_15()) { eval; } 1750 } 1751$code.=<<___; 1752 lea -$PUSH8($Tbl),$Tbl 1753 cmp %rsp,$Tbl 1754 jae .Lower_avx2 1755 1756 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 1757 add $a1,$A 1758 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 1759 lea `2*$SZ*($rounds-8)`(%rsp),%rsp 1760 1761 add $SZ*0($ctx),$A 1762 add $SZ*1($ctx),$B 1763 add $SZ*2($ctx),$C 1764 add $SZ*3($ctx),$D 1765 add $SZ*4($ctx),$E 1766 add $SZ*5($ctx),$F 1767 lea `2*16*$SZ`($inp),$inp # inp+=2 1768 add $SZ*6($ctx),$G 1769 mov $inp,%r12 1770 add $SZ*7($ctx),$H 1771 cmp $_end,$inp 1772 1773 mov $A,$SZ*0($ctx) 1774 cmove %rsp,%r12 # next block or stale data 1775 mov $B,$SZ*1($ctx) 1776 mov $C,$SZ*2($ctx) 1777 mov $D,$SZ*3($ctx) 1778 mov $E,$SZ*4($ctx) 1779 mov $F,$SZ*5($ctx) 1780 mov $G,$SZ*6($ctx) 1781 mov $H,$SZ*7($ctx) 1782 1783 jbe .Loop_avx2 1784 lea (%rsp),$Tbl 1785 1786.Ldone_avx2: 1787 lea ($Tbl),%rsp 1788 mov $_rsp,%rsi 1789.cfi_def_cfa %rsi,8 1790 vzeroupper 1791___ 1792$code.=<<___ if ($win64); 1793 movaps 16*$SZ+32(%rsp),%xmm6 1794 movaps 16*$SZ+48(%rsp),%xmm7 1795 movaps 16*$SZ+64(%rsp),%xmm8 1796 movaps 16*$SZ+80(%rsp),%xmm9 1797___ 1798$code.=<<___ if ($win64 && $SZ>4); 1799 movaps 16*$SZ+96(%rsp),%xmm10 1800 movaps 16*$SZ+112(%rsp),%xmm11 1801___ 1802$code.=<<___; 1803 mov -48(%rsi),%r15 1804.cfi_restore %r15 1805 mov -40(%rsi),%r14 1806.cfi_restore %r14 1807 mov -32(%rsi),%r13 1808.cfi_restore %r13 1809 mov -24(%rsi),%r12 1810.cfi_restore %r12 1811 mov -16(%rsi),%rbp 1812.cfi_restore %rbp 1813 mov -8(%rsi),%rbx 1814.cfi_restore %rbx 1815 lea (%rsi),%rsp 1816.cfi_def_cfa_register %rsp 1817.Lepilogue_avx2: 1818 ret 1819.cfi_endproc 1820.size ${func}_avx2,.-${func}_avx2 1821___ 1822}} 1823}}}}} 1824 1825# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1826# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1827if ($win64) { 1828$rec="%rcx"; 1829$frame="%rdx"; 1830$context="%r8"; 1831$disp="%r9"; 1832 1833$code.=<<___; 1834.extern __imp_RtlVirtualUnwind 1835.type se_handler,\@abi-omnipotent 1836.align 16 1837se_handler: 1838 push %rsi 1839 push %rdi 1840 push %rbx 1841 push %rbp 1842 push %r12 1843 push %r13 1844 push %r14 1845 push %r15 1846 pushfq 1847 sub \$64,%rsp 1848 1849 mov 120($context),%rax # pull context->Rax 1850 mov 248($context),%rbx # pull context->Rip 1851 1852 mov 8($disp),%rsi # disp->ImageBase 1853 mov 56($disp),%r11 # disp->HanderlData 1854 1855 mov 0(%r11),%r10d # HandlerData[0] 1856 lea (%rsi,%r10),%r10 # prologue label 1857 cmp %r10,%rbx # context->Rip<prologue label 1858 jb .Lin_prologue 1859 1860 mov 152($context),%rax # pull context->Rsp 1861 1862 mov 4(%r11),%r10d # HandlerData[1] 1863 lea (%rsi,%r10),%r10 # epilogue label 1864 cmp %r10,%rbx # context->Rip>=epilogue label 1865 jae .Lin_prologue 1866___ 1867$code.=<<___ if ($avx>1); 1868 lea .Lavx2_shortcut(%rip),%r10 1869 cmp %r10,%rbx # context->Rip<avx2_shortcut 1870 jb .Lnot_in_avx2 1871 1872 and \$-256*$SZ,%rax 1873 add \$`2*$SZ*($rounds-8)`,%rax 1874.Lnot_in_avx2: 1875___ 1876$code.=<<___; 1877 mov %rax,%rsi # put aside Rsp 1878 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp 1879 1880 mov -8(%rax),%rbx 1881 mov -16(%rax),%rbp 1882 mov -24(%rax),%r12 1883 mov -32(%rax),%r13 1884 mov -40(%rax),%r14 1885 mov -48(%rax),%r15 1886 mov %rbx,144($context) # restore context->Rbx 1887 mov %rbp,160($context) # restore context->Rbp 1888 mov %r12,216($context) # restore context->R12 1889 mov %r13,224($context) # restore context->R13 1890 mov %r14,232($context) # restore context->R14 1891 mov %r15,240($context) # restore context->R15 1892 1893 lea .Lepilogue(%rip),%r10 1894 cmp %r10,%rbx 1895 jb .Lin_prologue # non-AVX code 1896 1897 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area 1898 lea 512($context),%rdi # &context.Xmm6 1899 mov \$`$SZ==4?8:12`,%ecx 1900 .long 0xa548f3fc # cld; rep movsq 1901 1902.Lin_prologue: 1903 mov 8(%rax),%rdi 1904 mov 16(%rax),%rsi 1905 mov %rax,152($context) # restore context->Rsp 1906 mov %rsi,168($context) # restore context->Rsi 1907 mov %rdi,176($context) # restore context->Rdi 1908 1909 mov 40($disp),%rdi # disp->ContextRecord 1910 mov $context,%rsi # context 1911 mov \$154,%ecx # sizeof(CONTEXT) 1912 .long 0xa548f3fc # cld; rep movsq 1913 1914 mov $disp,%rsi 1915 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1916 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1917 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1918 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1919 mov 40(%rsi),%r10 # disp->ContextRecord 1920 lea 56(%rsi),%r11 # &disp->HandlerData 1921 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1922 mov %r10,32(%rsp) # arg5 1923 mov %r11,40(%rsp) # arg6 1924 mov %r12,48(%rsp) # arg7 1925 mov %rcx,56(%rsp) # arg8, (NULL) 1926 call *__imp_RtlVirtualUnwind(%rip) 1927 1928 mov \$1,%eax # ExceptionContinueSearch 1929 add \$64,%rsp 1930 popfq 1931 pop %r15 1932 pop %r14 1933 pop %r13 1934 pop %r12 1935 pop %rbp 1936 pop %rbx 1937 pop %rdi 1938 pop %rsi 1939 ret 1940.size se_handler,.-se_handler 1941___ 1942 1943$code.=<<___ if ($SZ==4 && $shaext); 1944.type shaext_handler,\@abi-omnipotent 1945.align 16 1946shaext_handler: 1947 push %rsi 1948 push %rdi 1949 push %rbx 1950 push %rbp 1951 push %r12 1952 push %r13 1953 push %r14 1954 push %r15 1955 pushfq 1956 sub \$64,%rsp 1957 1958 mov 120($context),%rax # pull context->Rax 1959 mov 248($context),%rbx # pull context->Rip 1960 1961 lea .Lprologue_shaext(%rip),%r10 1962 cmp %r10,%rbx # context->Rip<.Lprologue 1963 jb .Lin_prologue 1964 1965 lea .Lepilogue_shaext(%rip),%r10 1966 cmp %r10,%rbx # context->Rip>=.Lepilogue 1967 jae .Lin_prologue 1968 1969 lea -8-5*16(%rax),%rsi 1970 lea 512($context),%rdi # &context.Xmm6 1971 mov \$10,%ecx 1972 .long 0xa548f3fc # cld; rep movsq 1973 1974 jmp .Lin_prologue 1975.size shaext_handler,.-shaext_handler 1976___ 1977 1978$code.=<<___; 1979.section .pdata 1980.align 4 1981 .rva .LSEH_begin_${func}_nohw 1982 .rva .LSEH_end_${func}_nohw 1983 .rva .LSEH_info_${func}_nohw 1984___ 1985$code.=<<___ if ($SZ==4 && $shaext); 1986 .rva .LSEH_begin_${func}_hw 1987 .rva .LSEH_end_${func}_hw 1988 .rva .LSEH_info_${func}_hw 1989___ 1990$code.=<<___ if ($SZ==4); 1991 .rva .LSEH_begin_${func}_ssse3 1992 .rva .LSEH_end_${func}_ssse3 1993 .rva .LSEH_info_${func}_ssse3 1994___ 1995$code.=<<___ if ($avx); 1996 .rva .LSEH_begin_${func}_avx 1997 .rva .LSEH_end_${func}_avx 1998 .rva .LSEH_info_${func}_avx 1999___ 2000$code.=<<___ if ($avx>1); 2001 .rva .LSEH_begin_${func}_avx2 2002 .rva .LSEH_end_${func}_avx2 2003 .rva .LSEH_info_${func}_avx2 2004___ 2005$code.=<<___; 2006.section .xdata 2007.align 8 2008.LSEH_info_${func}_nohw: 2009 .byte 9,0,0,0 2010 .rva se_handler 2011 .rva .Lprologue,.Lepilogue # HandlerData[] 2012___ 2013$code.=<<___ if ($SZ==4 && $shaext); 2014.LSEH_info_${func}_hw: 2015 .byte 9,0,0,0 2016 .rva shaext_handler 2017___ 2018$code.=<<___ if ($SZ==4); 2019.LSEH_info_${func}_ssse3: 2020 .byte 9,0,0,0 2021 .rva se_handler 2022 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] 2023___ 2024$code.=<<___ if ($avx); 2025.LSEH_info_${func}_avx: 2026 .byte 9,0,0,0 2027 .rva se_handler 2028 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] 2029___ 2030$code.=<<___ if ($avx>1); 2031.LSEH_info_${func}_avx2: 2032 .byte 9,0,0,0 2033 .rva se_handler 2034 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] 2035___ 2036} 2037 2038sub sha256op38 { 2039 my $instr = shift; 2040 my %opcodelet = ( 2041 "sha256rnds2" => 0xcb, 2042 "sha256msg1" => 0xcc, 2043 "sha256msg2" => 0xcd ); 2044 2045 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { 2046 my @opcode=(0x0f,0x38); 2047 push @opcode,$opcodelet{$instr}; 2048 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M 2049 return ".byte\t".join(',',@opcode); 2050 } else { 2051 return $instr."\t".@_[0]; 2052 } 2053} 2054 2055foreach (split("\n",$code)) { 2056 s/\`([^\`]*)\`/eval $1/geo; 2057 2058 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; 2059 2060 print $_,"\n"; 2061} 2062close STDOUT or die "error closing STDOUT: $!"; 2063