1#! /usr/bin/env perl 2# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# November 2014 18# 19# ChaCha20 for x86_64. 20# 21# December 2016 22# 23# Add AVX512F code path. 24# 25# Performance in cycles per byte out of large buffer. 26# 27# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 NxAVX(v) 28# 29# P4 9.48/+99% -/22.7(ii) - 30# Core2 7.83/+55% 7.90/8.08 4.35 31# Westmere 7.19/+50% 5.60/6.70 3.00 32# Sandy Bridge 8.31/+42% 5.45/6.76 2.72 33# Ivy Bridge 6.71/+46% 5.40/6.49 2.41 34# Haswell 5.92/+43% 5.20/6.45 2.42 1.23 35# Skylake[-X] 5.87/+39% 4.70/- 2.31 1.19[0.57] 36# Silvermont 12.0/+33% 7.75/7.40 7.03(iii) 37# Knights L 11.7/- - 9.60(iii) 0.80 38# Goldmont 10.6/+17% 5.10/- 3.28 39# Sledgehammer 7.28/+52% -/14.2(ii) - 40# Bulldozer 9.66/+28% 9.85/11.1 3.06(iv) 41# Ryzen 5.96/+50% 5.19/- 2.40 2.09 42# VIA Nano 10.5/+46% 6.72/8.60 6.05 43# 44# (i) compared to older gcc 3.x one can observe >2x improvement on 45# most platforms; 46# (ii) as it can be seen, SSE2 performance is too low on legacy 47# processors; NxSSE2 results are naturally better, but not 48# impressively better than IALU ones, which is why you won't 49# find SSE2 code below; 50# (iii) this is not optimal result for Atom because of MSROM 51# limitations, SSE2 can do better, but gain is considered too 52# low to justify the [maintenance] effort; 53# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20; 54# 55# Modified from upstream OpenSSL to remove the XOP code. 
56 57$flavour = shift; 58$output = shift; 59if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 60 61$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 62 63$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 64( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 65( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 66die "can't locate x86_64-xlate.pl"; 67 68$avx = 2; 69 70open OUT,"| \"$^X\" $xlate $flavour $output"; 71*STDOUT=*OUT; 72 73# input parameter block 74($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8"); 75 76$code.=<<___; 77.text 78 79.extern OPENSSL_ia32cap_P 80 81.align 64 82.Lzero: 83.long 0,0,0,0 84.Lone: 85.long 1,0,0,0 86.Linc: 87.long 0,1,2,3 88.Lfour: 89.long 4,4,4,4 90.Lincy: 91.long 0,2,4,6,1,3,5,7 92.Leight: 93.long 8,8,8,8,8,8,8,8 94.Lrot16: 95.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd 96.Lrot24: 97.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe 98.Lsigma: 99.asciz "expand 32-byte k" 100.align 64 101.Lzeroz: 102.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 103.Lfourz: 104.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 105.Lincz: 106.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 107.Lsixteen: 108.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 109.asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 110___ 111 112sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 113{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 114 my $arg = pop; 115 $arg = "\$$arg" if ($arg*1 eq $arg); 116 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 117} 118 119@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)), 120 "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15))); 121@t=("%esi","%edi"); 122 123sub ROUND { # critical path is 24 cycles per round 124my ($a0,$b0,$c0,$d0)=@_; 125my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 126my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 127my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 128my ($xc,$xc_)=map("\"$_\"",@t); 129my @x=map("\"$_\"",@x); 130 131 # Consider order in which variables are addressed by their 132 # index: 133 # 134 # a b c d 135 # 136 # 0 4 8 12 < even round 137 # 1 5 9 13 138 # 2 6 10 14 139 # 3 7 11 15 140 # 0 5 10 15 < odd round 141 # 1 6 11 12 142 # 2 7 8 13 143 # 3 4 9 14 144 # 145 # 'a', 'b' and 'd's are permanently allocated in registers, 146 # @x[0..7,12..15], while 'c's are maintained in memory. If 147 # you observe 'c' column, you'll notice that pair of 'c's is 148 # invariant between rounds. This means that we have to reload 149 # them once per round, in the middle. This is why you'll see 150 # bunch of 'c' stores and loads in the middle, but none in 151 # the beginning or end. 152 153 # Normally instructions would be interleaved to favour in-order 154 # execution. Generally out-of-order cores manage it gracefully, 155 # but not this time for some reason. As in-order execution 156 # cores are dying breed, old Atom is the only one around, 157 # instructions are left uninterleaved. Besides, Atom is better 158 # off executing 1xSSSE3 code anyway... 
159 160 ( 161 "&add (@x[$a0],@x[$b0])", # Q1 162 "&xor (@x[$d0],@x[$a0])", 163 "&rol (@x[$d0],16)", 164 "&add (@x[$a1],@x[$b1])", # Q2 165 "&xor (@x[$d1],@x[$a1])", 166 "&rol (@x[$d1],16)", 167 168 "&add ($xc,@x[$d0])", 169 "&xor (@x[$b0],$xc)", 170 "&rol (@x[$b0],12)", 171 "&add ($xc_,@x[$d1])", 172 "&xor (@x[$b1],$xc_)", 173 "&rol (@x[$b1],12)", 174 175 "&add (@x[$a0],@x[$b0])", 176 "&xor (@x[$d0],@x[$a0])", 177 "&rol (@x[$d0],8)", 178 "&add (@x[$a1],@x[$b1])", 179 "&xor (@x[$d1],@x[$a1])", 180 "&rol (@x[$d1],8)", 181 182 "&add ($xc,@x[$d0])", 183 "&xor (@x[$b0],$xc)", 184 "&rol (@x[$b0],7)", 185 "&add ($xc_,@x[$d1])", 186 "&xor (@x[$b1],$xc_)", 187 "&rol (@x[$b1],7)", 188 189 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's 190 "&mov (\"4*$c1(%rsp)\",$xc_)", 191 "&mov ($xc,\"4*$c2(%rsp)\")", 192 "&mov ($xc_,\"4*$c3(%rsp)\")", 193 194 "&add (@x[$a2],@x[$b2])", # Q3 195 "&xor (@x[$d2],@x[$a2])", 196 "&rol (@x[$d2],16)", 197 "&add (@x[$a3],@x[$b3])", # Q4 198 "&xor (@x[$d3],@x[$a3])", 199 "&rol (@x[$d3],16)", 200 201 "&add ($xc,@x[$d2])", 202 "&xor (@x[$b2],$xc)", 203 "&rol (@x[$b2],12)", 204 "&add ($xc_,@x[$d3])", 205 "&xor (@x[$b3],$xc_)", 206 "&rol (@x[$b3],12)", 207 208 "&add (@x[$a2],@x[$b2])", 209 "&xor (@x[$d2],@x[$a2])", 210 "&rol (@x[$d2],8)", 211 "&add (@x[$a3],@x[$b3])", 212 "&xor (@x[$d3],@x[$a3])", 213 "&rol (@x[$d3],8)", 214 215 "&add ($xc,@x[$d2])", 216 "&xor (@x[$b2],$xc)", 217 "&rol (@x[$b2],7)", 218 "&add ($xc_,@x[$d3])", 219 "&xor (@x[$b3],$xc_)", 220 "&rol (@x[$b3],7)" 221 ); 222} 223 224######################################################################## 225# Generic code path that handles all lengths on pre-SSSE3 processors. 226$code.=<<___; 227.globl ChaCha20_ctr32 228.type ChaCha20_ctr32,\@function,5 229.align 64 230ChaCha20_ctr32: 231.cfi_startproc 232 cmp \$0,$len 233 je .Lno_data 234 mov OPENSSL_ia32cap_P+4(%rip),%r10 235___ 236$code.=<<___ if ($avx>2); 237 bt \$48,%r10 # check for AVX512F 238 jc .LChaCha20_avx512 239___ 240$code.=<<___; 241 test \$`1<<(41-32)`,%r10d 242 jnz .LChaCha20_ssse3 243 244 push %rbx 245.cfi_push rbx 246 push %rbp 247.cfi_push rbp 248 push %r12 249.cfi_push r12 250 push %r13 251.cfi_push r13 252 push %r14 253.cfi_push r14 254 push %r15 255.cfi_push r15 256 sub \$64+24,%rsp 257.cfi_adjust_cfa_offset `64+24` 258.Lctr32_body: 259 260 #movdqa .Lsigma(%rip),%xmm0 261 movdqu ($key),%xmm1 262 movdqu 16($key),%xmm2 263 movdqu ($counter),%xmm3 264 movdqa .Lone(%rip),%xmm4 265 266 #movdqa %xmm0,4*0(%rsp) # key[0] 267 movdqa %xmm1,4*4(%rsp) # key[1] 268 movdqa %xmm2,4*8(%rsp) # key[2] 269 movdqa %xmm3,4*12(%rsp) # key[3] 270 mov $len,%rbp # reassign $len 271 jmp .Loop_outer 272 273.align 32 274.Loop_outer: 275 mov \$0x61707865,@x[0] # 'expa' 276 mov \$0x3320646e,@x[1] # 'nd 3' 277 mov \$0x79622d32,@x[2] # '2-by' 278 mov \$0x6b206574,@x[3] # 'te k' 279 mov 4*4(%rsp),@x[4] 280 mov 4*5(%rsp),@x[5] 281 mov 4*6(%rsp),@x[6] 282 mov 4*7(%rsp),@x[7] 283 movd %xmm3,@x[12] 284 mov 4*13(%rsp),@x[13] 285 mov 4*14(%rsp),@x[14] 286 mov 4*15(%rsp),@x[15] 287 288 mov %rbp,64+0(%rsp) # save len 289 mov \$10,%ebp 290 mov $inp,64+8(%rsp) # save inp 291 movq %xmm2,%rsi # "@x[8]" 292 mov $out,64+16(%rsp) # save out 293 mov %rsi,%rdi 294 shr \$32,%rdi # "@x[9]" 295 jmp .Loop 296 297.align 32 298.Loop: 299___ 300 foreach (&ROUND (0, 4, 8,12)) { eval; } 301 foreach (&ROUND (0, 5,10,15)) { eval; } 302 &dec ("%ebp"); 303 &jnz (".Loop"); 304 305$code.=<<___; 306 mov @t[1],4*9(%rsp) # modulo-scheduled 307 mov @t[0],4*8(%rsp) 308 mov 64(%rsp),%rbp # load len 309 movdqa 
%xmm2,%xmm1 310 mov 64+8(%rsp),$inp # load inp 311 paddd %xmm4,%xmm3 # increment counter 312 mov 64+16(%rsp),$out # load out 313 314 add \$0x61707865,@x[0] # 'expa' 315 add \$0x3320646e,@x[1] # 'nd 3' 316 add \$0x79622d32,@x[2] # '2-by' 317 add \$0x6b206574,@x[3] # 'te k' 318 add 4*4(%rsp),@x[4] 319 add 4*5(%rsp),@x[5] 320 add 4*6(%rsp),@x[6] 321 add 4*7(%rsp),@x[7] 322 add 4*12(%rsp),@x[12] 323 add 4*13(%rsp),@x[13] 324 add 4*14(%rsp),@x[14] 325 add 4*15(%rsp),@x[15] 326 paddd 4*8(%rsp),%xmm1 327 328 cmp \$64,%rbp 329 jb .Ltail 330 331 xor 4*0($inp),@x[0] # xor with input 332 xor 4*1($inp),@x[1] 333 xor 4*2($inp),@x[2] 334 xor 4*3($inp),@x[3] 335 xor 4*4($inp),@x[4] 336 xor 4*5($inp),@x[5] 337 xor 4*6($inp),@x[6] 338 xor 4*7($inp),@x[7] 339 movdqu 4*8($inp),%xmm0 340 xor 4*12($inp),@x[12] 341 xor 4*13($inp),@x[13] 342 xor 4*14($inp),@x[14] 343 xor 4*15($inp),@x[15] 344 lea 4*16($inp),$inp # inp+=64 345 pxor %xmm1,%xmm0 346 347 movdqa %xmm2,4*8(%rsp) 348 movd %xmm3,4*12(%rsp) 349 350 mov @x[0],4*0($out) # write output 351 mov @x[1],4*1($out) 352 mov @x[2],4*2($out) 353 mov @x[3],4*3($out) 354 mov @x[4],4*4($out) 355 mov @x[5],4*5($out) 356 mov @x[6],4*6($out) 357 mov @x[7],4*7($out) 358 movdqu %xmm0,4*8($out) 359 mov @x[12],4*12($out) 360 mov @x[13],4*13($out) 361 mov @x[14],4*14($out) 362 mov @x[15],4*15($out) 363 lea 4*16($out),$out # out+=64 364 365 sub \$64,%rbp 366 jnz .Loop_outer 367 368 jmp .Ldone 369 370.align 16 371.Ltail: 372 mov @x[0],4*0(%rsp) 373 mov @x[1],4*1(%rsp) 374 xor %rbx,%rbx 375 mov @x[2],4*2(%rsp) 376 mov @x[3],4*3(%rsp) 377 mov @x[4],4*4(%rsp) 378 mov @x[5],4*5(%rsp) 379 mov @x[6],4*6(%rsp) 380 mov @x[7],4*7(%rsp) 381 movdqa %xmm1,4*8(%rsp) 382 mov @x[12],4*12(%rsp) 383 mov @x[13],4*13(%rsp) 384 mov @x[14],4*14(%rsp) 385 mov @x[15],4*15(%rsp) 386 387.Loop_tail: 388 movzb ($inp,%rbx),%eax 389 movzb (%rsp,%rbx),%edx 390 lea 1(%rbx),%rbx 391 xor %edx,%eax 392 mov %al,-1($out,%rbx) 393 dec %rbp 394 jnz .Loop_tail 395 396.Ldone: 397 lea 64+24+48(%rsp),%rsi 398 mov -48(%rsi),%r15 399.cfi_restore r15 400 mov -40(%rsi),%r14 401.cfi_restore r14 402 mov -32(%rsi),%r13 403.cfi_restore r13 404 mov -24(%rsi),%r12 405.cfi_restore r12 406 mov -16(%rsi),%rbp 407.cfi_restore rbp 408 mov -8(%rsi),%rbx 409.cfi_restore rbx 410 lea (%rsi),%rsp 411.cfi_adjust_cfa_offset `-64-24-48` 412.Lno_data: 413 ret 414.cfi_endproc 415.size ChaCha20_ctr32,.-ChaCha20_ctr32 416___ 417 418######################################################################## 419# SSSE3 code path that handles shorter lengths 420{ 421my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7)); 422 423sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round 424 &paddd ($a,$b); 425 &pxor ($d,$a); 426 &pshufb ($d,$rot16); 427 428 &paddd ($c,$d); 429 &pxor ($b,$c); 430 &movdqa ($t,$b); 431 &psrld ($b,20); 432 &pslld ($t,12); 433 &por ($b,$t); 434 435 &paddd ($a,$b); 436 &pxor ($d,$a); 437 &pshufb ($d,$rot24); 438 439 &paddd ($c,$d); 440 &pxor ($b,$c); 441 &movdqa ($t,$b); 442 &psrld ($b,25); 443 &pslld ($t,7); 444 &por ($b,$t); 445} 446 447my $xframe = $win64 ? 
32+8 : 8; 448 449$code.=<<___; 450.type ChaCha20_ssse3,\@function,5 451.align 32 452ChaCha20_ssse3: 453.LChaCha20_ssse3: 454.cfi_startproc 455 mov %rsp,%r9 # frame pointer 456.cfi_def_cfa_register r9 457___ 458$code.=<<___; 459 cmp \$128,$len # we might throw away some data, 460 ja .LChaCha20_4x # but overall it won't be slower 461 462.Ldo_sse3_after_all: 463 sub \$64+$xframe,%rsp 464___ 465$code.=<<___ if ($win64); 466 movaps %xmm6,-0x28(%r9) 467 movaps %xmm7,-0x18(%r9) 468.Lssse3_body: 469___ 470$code.=<<___; 471 movdqa .Lsigma(%rip),$a 472 movdqu ($key),$b 473 movdqu 16($key),$c 474 movdqu ($counter),$d 475 movdqa .Lrot16(%rip),$rot16 476 movdqa .Lrot24(%rip),$rot24 477 478 movdqa $a,0x00(%rsp) 479 movdqa $b,0x10(%rsp) 480 movdqa $c,0x20(%rsp) 481 movdqa $d,0x30(%rsp) 482 mov \$10,$counter # reuse $counter 483 jmp .Loop_ssse3 484 485.align 32 486.Loop_outer_ssse3: 487 movdqa .Lone(%rip),$d 488 movdqa 0x00(%rsp),$a 489 movdqa 0x10(%rsp),$b 490 movdqa 0x20(%rsp),$c 491 paddd 0x30(%rsp),$d 492 mov \$10,$counter 493 movdqa $d,0x30(%rsp) 494 jmp .Loop_ssse3 495 496.align 32 497.Loop_ssse3: 498___ 499 &SSSE3ROUND(); 500 &pshufd ($c,$c,0b01001110); 501 &pshufd ($b,$b,0b00111001); 502 &pshufd ($d,$d,0b10010011); 503 &nop (); 504 505 &SSSE3ROUND(); 506 &pshufd ($c,$c,0b01001110); 507 &pshufd ($b,$b,0b10010011); 508 &pshufd ($d,$d,0b00111001); 509 510 &dec ($counter); 511 &jnz (".Loop_ssse3"); 512 513$code.=<<___; 514 paddd 0x00(%rsp),$a 515 paddd 0x10(%rsp),$b 516 paddd 0x20(%rsp),$c 517 paddd 0x30(%rsp),$d 518 519 cmp \$64,$len 520 jb .Ltail_ssse3 521 522 movdqu 0x00($inp),$t 523 movdqu 0x10($inp),$t1 524 pxor $t,$a # xor with input 525 movdqu 0x20($inp),$t 526 pxor $t1,$b 527 movdqu 0x30($inp),$t1 528 lea 0x40($inp),$inp # inp+=64 529 pxor $t,$c 530 pxor $t1,$d 531 532 movdqu $a,0x00($out) # write output 533 movdqu $b,0x10($out) 534 movdqu $c,0x20($out) 535 movdqu $d,0x30($out) 536 lea 0x40($out),$out # out+=64 537 538 sub \$64,$len 539 jnz .Loop_outer_ssse3 540 541 jmp .Ldone_ssse3 542 543.align 16 544.Ltail_ssse3: 545 movdqa $a,0x00(%rsp) 546 movdqa $b,0x10(%rsp) 547 movdqa $c,0x20(%rsp) 548 movdqa $d,0x30(%rsp) 549 xor $counter,$counter 550 551.Loop_tail_ssse3: 552 movzb ($inp,$counter),%eax 553 movzb (%rsp,$counter),%ecx 554 lea 1($counter),$counter 555 xor %ecx,%eax 556 mov %al,-1($out,$counter) 557 dec $len 558 jnz .Loop_tail_ssse3 559 560.Ldone_ssse3: 561___ 562$code.=<<___ if ($win64); 563 movaps -0x28(%r9),%xmm6 564 movaps -0x18(%r9),%xmm7 565___ 566$code.=<<___; 567 lea (%r9),%rsp 568.cfi_def_cfa_register rsp 569.Lssse3_epilogue: 570 ret 571.cfi_endproc 572.size ChaCha20_ssse3,.-ChaCha20_ssse3 573___ 574} 575 576######################################################################## 577# SSSE3 code path that handles longer messages. 
578{ 579# assign variables to favor Atom front-end 580my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3, 581 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15)); 582my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 583 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); 584 585sub SSSE3_lane_ROUND { 586my ($a0,$b0,$c0,$d0)=@_; 587my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 588my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 589my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 590my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); 591my @x=map("\"$_\"",@xx); 592 593 # Consider order in which variables are addressed by their 594 # index: 595 # 596 # a b c d 597 # 598 # 0 4 8 12 < even round 599 # 1 5 9 13 600 # 2 6 10 14 601 # 3 7 11 15 602 # 0 5 10 15 < odd round 603 # 1 6 11 12 604 # 2 7 8 13 605 # 3 4 9 14 606 # 607 # 'a', 'b' and 'd's are permanently allocated in registers, 608 # @x[0..7,12..15], while 'c's are maintained in memory. If 609 # you observe 'c' column, you'll notice that pair of 'c's is 610 # invariant between rounds. This means that we have to reload 611 # them once per round, in the middle. This is why you'll see 612 # bunch of 'c' stores and loads in the middle, but none in 613 # the beginning or end. 614 615 ( 616 "&paddd (@x[$a0],@x[$b0])", # Q1 617 "&paddd (@x[$a1],@x[$b1])", # Q2 618 "&pxor (@x[$d0],@x[$a0])", 619 "&pxor (@x[$d1],@x[$a1])", 620 "&pshufb (@x[$d0],$t1)", 621 "&pshufb (@x[$d1],$t1)", 622 623 "&paddd ($xc,@x[$d0])", 624 "&paddd ($xc_,@x[$d1])", 625 "&pxor (@x[$b0],$xc)", 626 "&pxor (@x[$b1],$xc_)", 627 "&movdqa ($t0,@x[$b0])", 628 "&pslld (@x[$b0],12)", 629 "&psrld ($t0,20)", 630 "&movdqa ($t1,@x[$b1])", 631 "&pslld (@x[$b1],12)", 632 "&por (@x[$b0],$t0)", 633 "&psrld ($t1,20)", 634 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) 635 "&por (@x[$b1],$t1)", 636 637 "&paddd (@x[$a0],@x[$b0])", 638 "&paddd (@x[$a1],@x[$b1])", 639 "&pxor (@x[$d0],@x[$a0])", 640 "&pxor (@x[$d1],@x[$a1])", 641 "&pshufb (@x[$d0],$t0)", 642 "&pshufb (@x[$d1],$t0)", 643 644 "&paddd ($xc,@x[$d0])", 645 "&paddd ($xc_,@x[$d1])", 646 "&pxor (@x[$b0],$xc)", 647 "&pxor (@x[$b1],$xc_)", 648 "&movdqa ($t1,@x[$b0])", 649 "&pslld (@x[$b0],7)", 650 "&psrld ($t1,25)", 651 "&movdqa ($t0,@x[$b1])", 652 "&pslld (@x[$b1],7)", 653 "&por (@x[$b0],$t1)", 654 "&psrld ($t0,25)", 655 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) 656 "&por (@x[$b1],$t0)", 657 658 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's 659 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)", 660 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")", 661 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")", 662 663 "&paddd (@x[$a2],@x[$b2])", # Q3 664 "&paddd (@x[$a3],@x[$b3])", # Q4 665 "&pxor (@x[$d2],@x[$a2])", 666 "&pxor (@x[$d3],@x[$a3])", 667 "&pshufb (@x[$d2],$t1)", 668 "&pshufb (@x[$d3],$t1)", 669 670 "&paddd ($xc,@x[$d2])", 671 "&paddd ($xc_,@x[$d3])", 672 "&pxor (@x[$b2],$xc)", 673 "&pxor (@x[$b3],$xc_)", 674 "&movdqa ($t0,@x[$b2])", 675 "&pslld (@x[$b2],12)", 676 "&psrld ($t0,20)", 677 "&movdqa ($t1,@x[$b3])", 678 "&pslld (@x[$b3],12)", 679 "&por (@x[$b2],$t0)", 680 "&psrld ($t1,20)", 681 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) 682 "&por (@x[$b3],$t1)", 683 684 "&paddd (@x[$a2],@x[$b2])", 685 "&paddd (@x[$a3],@x[$b3])", 686 "&pxor (@x[$d2],@x[$a2])", 687 "&pxor (@x[$d3],@x[$a3])", 688 "&pshufb (@x[$d2],$t0)", 689 "&pshufb (@x[$d3],$t0)", 690 691 "&paddd ($xc,@x[$d2])", 692 "&paddd ($xc_,@x[$d3])", 693 "&pxor (@x[$b2],$xc)", 694 "&pxor (@x[$b3],$xc_)", 695 "&movdqa ($t1,@x[$b2])", 696 "&pslld 
(@x[$b2],7)", 697 "&psrld ($t1,25)", 698 "&movdqa ($t0,@x[$b3])", 699 "&pslld (@x[$b3],7)", 700 "&por (@x[$b2],$t1)", 701 "&psrld ($t0,25)", 702 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) 703 "&por (@x[$b3],$t0)" 704 ); 705} 706 707my $xframe = $win64 ? 0xa8 : 8; 708 709$code.=<<___; 710.type ChaCha20_4x,\@function,5 711.align 32 712ChaCha20_4x: 713.LChaCha20_4x: 714.cfi_startproc 715 mov %rsp,%r9 # frame pointer 716.cfi_def_cfa_register r9 717 mov %r10,%r11 718___ 719$code.=<<___ if ($avx>1); 720 shr \$32,%r10 # OPENSSL_ia32cap_P+8 721 test \$`1<<5`,%r10 # test AVX2 722 jnz .LChaCha20_8x 723___ 724$code.=<<___; 725 cmp \$192,$len 726 ja .Lproceed4x 727 728 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE 729 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE 730 je .Ldo_sse3_after_all # to detect Atom 731 732.Lproceed4x: 733 sub \$0x140+$xframe,%rsp 734___ 735 ################ stack layout 736 # +0x00 SIMD equivalent of @x[8-12] 737 # ... 738 # +0x40 constant copy of key[0-2] smashed by lanes 739 # ... 740 # +0x100 SIMD counters (with nonce smashed by lanes) 741 # ... 742 # +0x140 743$code.=<<___ if ($win64); 744 movaps %xmm6,-0xa8(%r9) 745 movaps %xmm7,-0x98(%r9) 746 movaps %xmm8,-0x88(%r9) 747 movaps %xmm9,-0x78(%r9) 748 movaps %xmm10,-0x68(%r9) 749 movaps %xmm11,-0x58(%r9) 750 movaps %xmm12,-0x48(%r9) 751 movaps %xmm13,-0x38(%r9) 752 movaps %xmm14,-0x28(%r9) 753 movaps %xmm15,-0x18(%r9) 754.L4x_body: 755___ 756$code.=<<___; 757 movdqa .Lsigma(%rip),$xa3 # key[0] 758 movdqu ($key),$xb3 # key[1] 759 movdqu 16($key),$xt3 # key[2] 760 movdqu ($counter),$xd3 # key[3] 761 lea 0x100(%rsp),%rcx # size optimization 762 lea .Lrot16(%rip),%r10 763 lea .Lrot24(%rip),%r11 764 765 pshufd \$0x00,$xa3,$xa0 # smash key by lanes... 766 pshufd \$0x55,$xa3,$xa1 767 movdqa $xa0,0x40(%rsp) # ... 
and offload 768 pshufd \$0xaa,$xa3,$xa2 769 movdqa $xa1,0x50(%rsp) 770 pshufd \$0xff,$xa3,$xa3 771 movdqa $xa2,0x60(%rsp) 772 movdqa $xa3,0x70(%rsp) 773 774 pshufd \$0x00,$xb3,$xb0 775 pshufd \$0x55,$xb3,$xb1 776 movdqa $xb0,0x80-0x100(%rcx) 777 pshufd \$0xaa,$xb3,$xb2 778 movdqa $xb1,0x90-0x100(%rcx) 779 pshufd \$0xff,$xb3,$xb3 780 movdqa $xb2,0xa0-0x100(%rcx) 781 movdqa $xb3,0xb0-0x100(%rcx) 782 783 pshufd \$0x00,$xt3,$xt0 # "$xc0" 784 pshufd \$0x55,$xt3,$xt1 # "$xc1" 785 movdqa $xt0,0xc0-0x100(%rcx) 786 pshufd \$0xaa,$xt3,$xt2 # "$xc2" 787 movdqa $xt1,0xd0-0x100(%rcx) 788 pshufd \$0xff,$xt3,$xt3 # "$xc3" 789 movdqa $xt2,0xe0-0x100(%rcx) 790 movdqa $xt3,0xf0-0x100(%rcx) 791 792 pshufd \$0x00,$xd3,$xd0 793 pshufd \$0x55,$xd3,$xd1 794 paddd .Linc(%rip),$xd0 # don't save counters yet 795 pshufd \$0xaa,$xd3,$xd2 796 movdqa $xd1,0x110-0x100(%rcx) 797 pshufd \$0xff,$xd3,$xd3 798 movdqa $xd2,0x120-0x100(%rcx) 799 movdqa $xd3,0x130-0x100(%rcx) 800 801 jmp .Loop_enter4x 802 803.align 32 804.Loop_outer4x: 805 movdqa 0x40(%rsp),$xa0 # re-load smashed key 806 movdqa 0x50(%rsp),$xa1 807 movdqa 0x60(%rsp),$xa2 808 movdqa 0x70(%rsp),$xa3 809 movdqa 0x80-0x100(%rcx),$xb0 810 movdqa 0x90-0x100(%rcx),$xb1 811 movdqa 0xa0-0x100(%rcx),$xb2 812 movdqa 0xb0-0x100(%rcx),$xb3 813 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" 814 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" 815 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" 816 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" 817 movdqa 0x100-0x100(%rcx),$xd0 818 movdqa 0x110-0x100(%rcx),$xd1 819 movdqa 0x120-0x100(%rcx),$xd2 820 movdqa 0x130-0x100(%rcx),$xd3 821 paddd .Lfour(%rip),$xd0 # next SIMD counters 822 823.Loop_enter4x: 824 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]" 825 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]" 826 movdqa (%r10),$xt3 # .Lrot16(%rip) 827 mov \$10,%eax 828 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters 829 jmp .Loop4x 830 831.align 32 832.Loop4x: 833___ 834 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } 835 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } 836$code.=<<___; 837 dec %eax 838 jnz .Loop4x 839 840 paddd 0x40(%rsp),$xa0 # accumulate key material 841 paddd 0x50(%rsp),$xa1 842 paddd 0x60(%rsp),$xa2 843 paddd 0x70(%rsp),$xa3 844 845 movdqa $xa0,$xt2 # "de-interlace" data 846 punpckldq $xa1,$xa0 847 movdqa $xa2,$xt3 848 punpckldq $xa3,$xa2 849 punpckhdq $xa1,$xt2 850 punpckhdq $xa3,$xt3 851 movdqa $xa0,$xa1 852 punpcklqdq $xa2,$xa0 # "a0" 853 movdqa $xt2,$xa3 854 punpcklqdq $xt3,$xt2 # "a2" 855 punpckhqdq $xa2,$xa1 # "a1" 856 punpckhqdq $xt3,$xa3 # "a3" 857___ 858 ($xa2,$xt2)=($xt2,$xa2); 859$code.=<<___; 860 paddd 0x80-0x100(%rcx),$xb0 861 paddd 0x90-0x100(%rcx),$xb1 862 paddd 0xa0-0x100(%rcx),$xb2 863 paddd 0xb0-0x100(%rcx),$xb3 864 865 movdqa $xa0,0x00(%rsp) # offload $xaN 866 movdqa $xa1,0x10(%rsp) 867 movdqa 0x20(%rsp),$xa0 # "xc2" 868 movdqa 0x30(%rsp),$xa1 # "xc3" 869 870 movdqa $xb0,$xt2 871 punpckldq $xb1,$xb0 872 movdqa $xb2,$xt3 873 punpckldq $xb3,$xb2 874 punpckhdq $xb1,$xt2 875 punpckhdq $xb3,$xt3 876 movdqa $xb0,$xb1 877 punpcklqdq $xb2,$xb0 # "b0" 878 movdqa $xt2,$xb3 879 punpcklqdq $xt3,$xt2 # "b2" 880 punpckhqdq $xb2,$xb1 # "b1" 881 punpckhqdq $xt3,$xb3 # "b3" 882___ 883 ($xb2,$xt2)=($xt2,$xb2); 884 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); 885$code.=<<___; 886 paddd 0xc0-0x100(%rcx),$xc0 887 paddd 0xd0-0x100(%rcx),$xc1 888 paddd 0xe0-0x100(%rcx),$xc2 889 paddd 0xf0-0x100(%rcx),$xc3 890 891 movdqa $xa2,0x20(%rsp) # keep offloading $xaN 892 movdqa $xa3,0x30(%rsp) 893 894 movdqa $xc0,$xt2 895 punpckldq $xc1,$xc0 896 
movdqa $xc2,$xt3 897 punpckldq $xc3,$xc2 898 punpckhdq $xc1,$xt2 899 punpckhdq $xc3,$xt3 900 movdqa $xc0,$xc1 901 punpcklqdq $xc2,$xc0 # "c0" 902 movdqa $xt2,$xc3 903 punpcklqdq $xt3,$xt2 # "c2" 904 punpckhqdq $xc2,$xc1 # "c1" 905 punpckhqdq $xt3,$xc3 # "c3" 906___ 907 ($xc2,$xt2)=($xt2,$xc2); 908 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary 909$code.=<<___; 910 paddd 0x100-0x100(%rcx),$xd0 911 paddd 0x110-0x100(%rcx),$xd1 912 paddd 0x120-0x100(%rcx),$xd2 913 paddd 0x130-0x100(%rcx),$xd3 914 915 movdqa $xd0,$xt2 916 punpckldq $xd1,$xd0 917 movdqa $xd2,$xt3 918 punpckldq $xd3,$xd2 919 punpckhdq $xd1,$xt2 920 punpckhdq $xd3,$xt3 921 movdqa $xd0,$xd1 922 punpcklqdq $xd2,$xd0 # "d0" 923 movdqa $xt2,$xd3 924 punpcklqdq $xt3,$xt2 # "d2" 925 punpckhqdq $xd2,$xd1 # "d1" 926 punpckhqdq $xt3,$xd3 # "d3" 927___ 928 ($xd2,$xt2)=($xt2,$xd2); 929$code.=<<___; 930 cmp \$64*4,$len 931 jb .Ltail4x 932 933 movdqu 0x00($inp),$xt0 # xor with input 934 movdqu 0x10($inp),$xt1 935 movdqu 0x20($inp),$xt2 936 movdqu 0x30($inp),$xt3 937 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 938 pxor $xb0,$xt1 939 pxor $xc0,$xt2 940 pxor $xd0,$xt3 941 942 movdqu $xt0,0x00($out) 943 movdqu 0x40($inp),$xt0 944 movdqu $xt1,0x10($out) 945 movdqu 0x50($inp),$xt1 946 movdqu $xt2,0x20($out) 947 movdqu 0x60($inp),$xt2 948 movdqu $xt3,0x30($out) 949 movdqu 0x70($inp),$xt3 950 lea 0x80($inp),$inp # size optimization 951 pxor 0x10(%rsp),$xt0 952 pxor $xb1,$xt1 953 pxor $xc1,$xt2 954 pxor $xd1,$xt3 955 956 movdqu $xt0,0x40($out) 957 movdqu 0x00($inp),$xt0 958 movdqu $xt1,0x50($out) 959 movdqu 0x10($inp),$xt1 960 movdqu $xt2,0x60($out) 961 movdqu 0x20($inp),$xt2 962 movdqu $xt3,0x70($out) 963 lea 0x80($out),$out # size optimization 964 movdqu 0x30($inp),$xt3 965 pxor 0x20(%rsp),$xt0 966 pxor $xb2,$xt1 967 pxor $xc2,$xt2 968 pxor $xd2,$xt3 969 970 movdqu $xt0,0x00($out) 971 movdqu 0x40($inp),$xt0 972 movdqu $xt1,0x10($out) 973 movdqu 0x50($inp),$xt1 974 movdqu $xt2,0x20($out) 975 movdqu 0x60($inp),$xt2 976 movdqu $xt3,0x30($out) 977 movdqu 0x70($inp),$xt3 978 lea 0x80($inp),$inp # inp+=64*4 979 pxor 0x30(%rsp),$xt0 980 pxor $xb3,$xt1 981 pxor $xc3,$xt2 982 pxor $xd3,$xt3 983 movdqu $xt0,0x40($out) 984 movdqu $xt1,0x50($out) 985 movdqu $xt2,0x60($out) 986 movdqu $xt3,0x70($out) 987 lea 0x80($out),$out # out+=64*4 988 989 sub \$64*4,$len 990 jnz .Loop_outer4x 991 992 jmp .Ldone4x 993 994.Ltail4x: 995 cmp \$192,$len 996 jae .L192_or_more4x 997 cmp \$128,$len 998 jae .L128_or_more4x 999 cmp \$64,$len 1000 jae .L64_or_more4x 1001 1002 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 1003 xor %r10,%r10 1004 #movdqa $xt0,0x00(%rsp) 1005 movdqa $xb0,0x10(%rsp) 1006 movdqa $xc0,0x20(%rsp) 1007 movdqa $xd0,0x30(%rsp) 1008 jmp .Loop_tail4x 1009 1010.align 32 1011.L64_or_more4x: 1012 movdqu 0x00($inp),$xt0 # xor with input 1013 movdqu 0x10($inp),$xt1 1014 movdqu 0x20($inp),$xt2 1015 movdqu 0x30($inp),$xt3 1016 pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember? 1017 pxor $xb0,$xt1 1018 pxor $xc0,$xt2 1019 pxor $xd0,$xt3 1020 movdqu $xt0,0x00($out) 1021 movdqu $xt1,0x10($out) 1022 movdqu $xt2,0x20($out) 1023 movdqu $xt3,0x30($out) 1024 je .Ldone4x 1025 1026 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember? 
1027 lea 0x40($inp),$inp # inp+=64*1 1028 xor %r10,%r10 1029 movdqa $xt0,0x00(%rsp) 1030 movdqa $xb1,0x10(%rsp) 1031 lea 0x40($out),$out # out+=64*1 1032 movdqa $xc1,0x20(%rsp) 1033 sub \$64,$len # len-=64*1 1034 movdqa $xd1,0x30(%rsp) 1035 jmp .Loop_tail4x 1036 1037.align 32 1038.L128_or_more4x: 1039 movdqu 0x00($inp),$xt0 # xor with input 1040 movdqu 0x10($inp),$xt1 1041 movdqu 0x20($inp),$xt2 1042 movdqu 0x30($inp),$xt3 1043 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 1044 pxor $xb0,$xt1 1045 pxor $xc0,$xt2 1046 pxor $xd0,$xt3 1047 1048 movdqu $xt0,0x00($out) 1049 movdqu 0x40($inp),$xt0 1050 movdqu $xt1,0x10($out) 1051 movdqu 0x50($inp),$xt1 1052 movdqu $xt2,0x20($out) 1053 movdqu 0x60($inp),$xt2 1054 movdqu $xt3,0x30($out) 1055 movdqu 0x70($inp),$xt3 1056 pxor 0x10(%rsp),$xt0 1057 pxor $xb1,$xt1 1058 pxor $xc1,$xt2 1059 pxor $xd1,$xt3 1060 movdqu $xt0,0x40($out) 1061 movdqu $xt1,0x50($out) 1062 movdqu $xt2,0x60($out) 1063 movdqu $xt3,0x70($out) 1064 je .Ldone4x 1065 1066 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember? 1067 lea 0x80($inp),$inp # inp+=64*2 1068 xor %r10,%r10 1069 movdqa $xt0,0x00(%rsp) 1070 movdqa $xb2,0x10(%rsp) 1071 lea 0x80($out),$out # out+=64*2 1072 movdqa $xc2,0x20(%rsp) 1073 sub \$128,$len # len-=64*2 1074 movdqa $xd2,0x30(%rsp) 1075 jmp .Loop_tail4x 1076 1077.align 32 1078.L192_or_more4x: 1079 movdqu 0x00($inp),$xt0 # xor with input 1080 movdqu 0x10($inp),$xt1 1081 movdqu 0x20($inp),$xt2 1082 movdqu 0x30($inp),$xt3 1083 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 1084 pxor $xb0,$xt1 1085 pxor $xc0,$xt2 1086 pxor $xd0,$xt3 1087 1088 movdqu $xt0,0x00($out) 1089 movdqu 0x40($inp),$xt0 1090 movdqu $xt1,0x10($out) 1091 movdqu 0x50($inp),$xt1 1092 movdqu $xt2,0x20($out) 1093 movdqu 0x60($inp),$xt2 1094 movdqu $xt3,0x30($out) 1095 movdqu 0x70($inp),$xt3 1096 lea 0x80($inp),$inp # size optimization 1097 pxor 0x10(%rsp),$xt0 1098 pxor $xb1,$xt1 1099 pxor $xc1,$xt2 1100 pxor $xd1,$xt3 1101 1102 movdqu $xt0,0x40($out) 1103 movdqu 0x00($inp),$xt0 1104 movdqu $xt1,0x50($out) 1105 movdqu 0x10($inp),$xt1 1106 movdqu $xt2,0x60($out) 1107 movdqu 0x20($inp),$xt2 1108 movdqu $xt3,0x70($out) 1109 lea 0x80($out),$out # size optimization 1110 movdqu 0x30($inp),$xt3 1111 pxor 0x20(%rsp),$xt0 1112 pxor $xb2,$xt1 1113 pxor $xc2,$xt2 1114 pxor $xd2,$xt3 1115 movdqu $xt0,0x00($out) 1116 movdqu $xt1,0x10($out) 1117 movdqu $xt2,0x20($out) 1118 movdqu $xt3,0x30($out) 1119 je .Ldone4x 1120 1121 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember? 
1122 lea 0x40($inp),$inp # inp+=64*3 1123 xor %r10,%r10 1124 movdqa $xt0,0x00(%rsp) 1125 movdqa $xb3,0x10(%rsp) 1126 lea 0x40($out),$out # out+=64*3 1127 movdqa $xc3,0x20(%rsp) 1128 sub \$192,$len # len-=64*3 1129 movdqa $xd3,0x30(%rsp) 1130 1131.Loop_tail4x: 1132 movzb ($inp,%r10),%eax 1133 movzb (%rsp,%r10),%ecx 1134 lea 1(%r10),%r10 1135 xor %ecx,%eax 1136 mov %al,-1($out,%r10) 1137 dec $len 1138 jnz .Loop_tail4x 1139 1140.Ldone4x: 1141___ 1142$code.=<<___ if ($win64); 1143 movaps -0xa8(%r9),%xmm6 1144 movaps -0x98(%r9),%xmm7 1145 movaps -0x88(%r9),%xmm8 1146 movaps -0x78(%r9),%xmm9 1147 movaps -0x68(%r9),%xmm10 1148 movaps -0x58(%r9),%xmm11 1149 movaps -0x48(%r9),%xmm12 1150 movaps -0x38(%r9),%xmm13 1151 movaps -0x28(%r9),%xmm14 1152 movaps -0x18(%r9),%xmm15 1153___ 1154$code.=<<___; 1155 lea (%r9),%rsp 1156.cfi_def_cfa_register rsp 1157.L4x_epilogue: 1158 ret 1159.cfi_endproc 1160.size ChaCha20_4x,.-ChaCha20_4x 1161___ 1162} 1163 1164######################################################################## 1165# AVX2 code path 1166if ($avx>1) { 1167my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, 1168 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); 1169my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 1170 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); 1171 1172sub AVX2_lane_ROUND { 1173my ($a0,$b0,$c0,$d0)=@_; 1174my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 1175my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 1176my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 1177my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); 1178my @x=map("\"$_\"",@xx); 1179 1180 # Consider order in which variables are addressed by their 1181 # index: 1182 # 1183 # a b c d 1184 # 1185 # 0 4 8 12 < even round 1186 # 1 5 9 13 1187 # 2 6 10 14 1188 # 3 7 11 15 1189 # 0 5 10 15 < odd round 1190 # 1 6 11 12 1191 # 2 7 8 13 1192 # 3 4 9 14 1193 # 1194 # 'a', 'b' and 'd's are permanently allocated in registers, 1195 # @x[0..7,12..15], while 'c's are maintained in memory. If 1196 # you observe 'c' column, you'll notice that pair of 'c's is 1197 # invariant between rounds. This means that we have to reload 1198 # them once per round, in the middle. This is why you'll see 1199 # bunch of 'c' stores and loads in the middle, but none in 1200 # the beginning or end. 
1201 1202 ( 1203 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 1204 "&vpxor (@x[$d0],@x[$a0],@x[$d0])", 1205 "&vpshufb (@x[$d0],@x[$d0],$t1)", 1206 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 1207 "&vpxor (@x[$d1],@x[$a1],@x[$d1])", 1208 "&vpshufb (@x[$d1],@x[$d1],$t1)", 1209 1210 "&vpaddd ($xc,$xc,@x[$d0])", 1211 "&vpxor (@x[$b0],$xc,@x[$b0])", 1212 "&vpslld ($t0,@x[$b0],12)", 1213 "&vpsrld (@x[$b0],@x[$b0],20)", 1214 "&vpor (@x[$b0],$t0,@x[$b0])", 1215 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) 1216 "&vpaddd ($xc_,$xc_,@x[$d1])", 1217 "&vpxor (@x[$b1],$xc_,@x[$b1])", 1218 "&vpslld ($t1,@x[$b1],12)", 1219 "&vpsrld (@x[$b1],@x[$b1],20)", 1220 "&vpor (@x[$b1],$t1,@x[$b1])", 1221 1222 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", 1223 "&vpxor (@x[$d0],@x[$a0],@x[$d0])", 1224 "&vpshufb (@x[$d0],@x[$d0],$t0)", 1225 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", 1226 "&vpxor (@x[$d1],@x[$a1],@x[$d1])", 1227 "&vpshufb (@x[$d1],@x[$d1],$t0)", 1228 1229 "&vpaddd ($xc,$xc,@x[$d0])", 1230 "&vpxor (@x[$b0],$xc,@x[$b0])", 1231 "&vpslld ($t1,@x[$b0],7)", 1232 "&vpsrld (@x[$b0],@x[$b0],25)", 1233 "&vpor (@x[$b0],$t1,@x[$b0])", 1234 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) 1235 "&vpaddd ($xc_,$xc_,@x[$d1])", 1236 "&vpxor (@x[$b1],$xc_,@x[$b1])", 1237 "&vpslld ($t0,@x[$b1],7)", 1238 "&vpsrld (@x[$b1],@x[$b1],25)", 1239 "&vpor (@x[$b1],$t0,@x[$b1])", 1240 1241 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's 1242 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", 1243 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", 1244 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")", 1245 1246 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 1247 "&vpxor (@x[$d2],@x[$a2],@x[$d2])", 1248 "&vpshufb (@x[$d2],@x[$d2],$t1)", 1249 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 1250 "&vpxor (@x[$d3],@x[$a3],@x[$d3])", 1251 "&vpshufb (@x[$d3],@x[$d3],$t1)", 1252 1253 "&vpaddd ($xc,$xc,@x[$d2])", 1254 "&vpxor (@x[$b2],$xc,@x[$b2])", 1255 "&vpslld ($t0,@x[$b2],12)", 1256 "&vpsrld (@x[$b2],@x[$b2],20)", 1257 "&vpor (@x[$b2],$t0,@x[$b2])", 1258 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) 1259 "&vpaddd ($xc_,$xc_,@x[$d3])", 1260 "&vpxor (@x[$b3],$xc_,@x[$b3])", 1261 "&vpslld ($t1,@x[$b3],12)", 1262 "&vpsrld (@x[$b3],@x[$b3],20)", 1263 "&vpor (@x[$b3],$t1,@x[$b3])", 1264 1265 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", 1266 "&vpxor (@x[$d2],@x[$a2],@x[$d2])", 1267 "&vpshufb (@x[$d2],@x[$d2],$t0)", 1268 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", 1269 "&vpxor (@x[$d3],@x[$a3],@x[$d3])", 1270 "&vpshufb (@x[$d3],@x[$d3],$t0)", 1271 1272 "&vpaddd ($xc,$xc,@x[$d2])", 1273 "&vpxor (@x[$b2],$xc,@x[$b2])", 1274 "&vpslld ($t1,@x[$b2],7)", 1275 "&vpsrld (@x[$b2],@x[$b2],25)", 1276 "&vpor (@x[$b2],$t1,@x[$b2])", 1277 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) 1278 "&vpaddd ($xc_,$xc_,@x[$d3])", 1279 "&vpxor (@x[$b3],$xc_,@x[$b3])", 1280 "&vpslld ($t0,@x[$b3],7)", 1281 "&vpsrld (@x[$b3],@x[$b3],25)", 1282 "&vpor (@x[$b3],$t0,@x[$b3])" 1283 ); 1284} 1285 1286my $xframe = $win64 ? 
0xa8 : 8; 1287 1288$code.=<<___; 1289.type ChaCha20_8x,\@function,5 1290.align 32 1291ChaCha20_8x: 1292.LChaCha20_8x: 1293.cfi_startproc 1294 mov %rsp,%r9 # frame register 1295.cfi_def_cfa_register r9 1296 sub \$0x280+$xframe,%rsp 1297 and \$-32,%rsp 1298___ 1299$code.=<<___ if ($win64); 1300 movaps %xmm6,-0xa8(%r9) 1301 movaps %xmm7,-0x98(%r9) 1302 movaps %xmm8,-0x88(%r9) 1303 movaps %xmm9,-0x78(%r9) 1304 movaps %xmm10,-0x68(%r9) 1305 movaps %xmm11,-0x58(%r9) 1306 movaps %xmm12,-0x48(%r9) 1307 movaps %xmm13,-0x38(%r9) 1308 movaps %xmm14,-0x28(%r9) 1309 movaps %xmm15,-0x18(%r9) 1310.L8x_body: 1311___ 1312$code.=<<___; 1313 vzeroupper 1314 1315 ################ stack layout 1316 # +0x00 SIMD equivalent of @x[8-12] 1317 # ... 1318 # +0x80 constant copy of key[0-2] smashed by lanes 1319 # ... 1320 # +0x200 SIMD counters (with nonce smashed by lanes) 1321 # ... 1322 # +0x280 1323 1324 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] 1325 vbroadcasti128 ($key),$xb3 # key[1] 1326 vbroadcasti128 16($key),$xt3 # key[2] 1327 vbroadcasti128 ($counter),$xd3 # key[3] 1328 lea 0x100(%rsp),%rcx # size optimization 1329 lea 0x200(%rsp),%rax # size optimization 1330 lea .Lrot16(%rip),%r10 1331 lea .Lrot24(%rip),%r11 1332 1333 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 1334 vpshufd \$0x55,$xa3,$xa1 1335 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload 1336 vpshufd \$0xaa,$xa3,$xa2 1337 vmovdqa $xa1,0xa0-0x100(%rcx) 1338 vpshufd \$0xff,$xa3,$xa3 1339 vmovdqa $xa2,0xc0-0x100(%rcx) 1340 vmovdqa $xa3,0xe0-0x100(%rcx) 1341 1342 vpshufd \$0x00,$xb3,$xb0 1343 vpshufd \$0x55,$xb3,$xb1 1344 vmovdqa $xb0,0x100-0x100(%rcx) 1345 vpshufd \$0xaa,$xb3,$xb2 1346 vmovdqa $xb1,0x120-0x100(%rcx) 1347 vpshufd \$0xff,$xb3,$xb3 1348 vmovdqa $xb2,0x140-0x100(%rcx) 1349 vmovdqa $xb3,0x160-0x100(%rcx) 1350 1351 vpshufd \$0x00,$xt3,$xt0 # "xc0" 1352 vpshufd \$0x55,$xt3,$xt1 # "xc1" 1353 vmovdqa $xt0,0x180-0x200(%rax) 1354 vpshufd \$0xaa,$xt3,$xt2 # "xc2" 1355 vmovdqa $xt1,0x1a0-0x200(%rax) 1356 vpshufd \$0xff,$xt3,$xt3 # "xc3" 1357 vmovdqa $xt2,0x1c0-0x200(%rax) 1358 vmovdqa $xt3,0x1e0-0x200(%rax) 1359 1360 vpshufd \$0x00,$xd3,$xd0 1361 vpshufd \$0x55,$xd3,$xd1 1362 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet 1363 vpshufd \$0xaa,$xd3,$xd2 1364 vmovdqa $xd1,0x220-0x200(%rax) 1365 vpshufd \$0xff,$xd3,$xd3 1366 vmovdqa $xd2,0x240-0x200(%rax) 1367 vmovdqa $xd3,0x260-0x200(%rax) 1368 1369 jmp .Loop_enter8x 1370 1371.align 32 1372.Loop_outer8x: 1373 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key 1374 vmovdqa 0xa0-0x100(%rcx),$xa1 1375 vmovdqa 0xc0-0x100(%rcx),$xa2 1376 vmovdqa 0xe0-0x100(%rcx),$xa3 1377 vmovdqa 0x100-0x100(%rcx),$xb0 1378 vmovdqa 0x120-0x100(%rcx),$xb1 1379 vmovdqa 0x140-0x100(%rcx),$xb2 1380 vmovdqa 0x160-0x100(%rcx),$xb3 1381 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" 1382 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" 1383 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" 1384 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" 1385 vmovdqa 0x200-0x200(%rax),$xd0 1386 vmovdqa 0x220-0x200(%rax),$xd1 1387 vmovdqa 0x240-0x200(%rax),$xd2 1388 vmovdqa 0x260-0x200(%rax),$xd3 1389 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters 1390 1391.Loop_enter8x: 1392 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" 1393 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" 1394 vbroadcasti128 (%r10),$xt3 1395 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters 1396 mov \$10,%eax 1397 jmp .Loop8x 1398 1399.align 32 1400.Loop8x: 1401___ 1402 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } 1403 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } 
1404$code.=<<___; 1405 dec %eax 1406 jnz .Loop8x 1407 1408 lea 0x200(%rsp),%rax # size optimization 1409 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key 1410 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 1411 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 1412 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 1413 1414 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 1415 vpunpckldq $xa3,$xa2,$xt3 1416 vpunpckhdq $xa1,$xa0,$xa0 1417 vpunpckhdq $xa3,$xa2,$xa2 1418 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 1419 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 1420 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 1421 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 1422___ 1423 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 1424$code.=<<___; 1425 vpaddd 0x100-0x100(%rcx),$xb0,$xb0 1426 vpaddd 0x120-0x100(%rcx),$xb1,$xb1 1427 vpaddd 0x140-0x100(%rcx),$xb2,$xb2 1428 vpaddd 0x160-0x100(%rcx),$xb3,$xb3 1429 1430 vpunpckldq $xb1,$xb0,$xt2 1431 vpunpckldq $xb3,$xb2,$xt3 1432 vpunpckhdq $xb1,$xb0,$xb0 1433 vpunpckhdq $xb3,$xb2,$xb2 1434 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 1435 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 1436 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 1437 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 1438___ 1439 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 1440$code.=<<___; 1441 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further 1442 vperm2i128 \$0x31,$xb0,$xa0,$xb0 1443 vperm2i128 \$0x20,$xb1,$xa1,$xa0 1444 vperm2i128 \$0x31,$xb1,$xa1,$xb1 1445 vperm2i128 \$0x20,$xb2,$xa2,$xa1 1446 vperm2i128 \$0x31,$xb2,$xa2,$xb2 1447 vperm2i128 \$0x20,$xb3,$xa3,$xa2 1448 vperm2i128 \$0x31,$xb3,$xa3,$xb3 1449___ 1450 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 1451 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); 1452$code.=<<___; 1453 vmovdqa $xa0,0x00(%rsp) # offload $xaN 1454 vmovdqa $xa1,0x20(%rsp) 1455 vmovdqa 0x40(%rsp),$xc2 # $xa0 1456 vmovdqa 0x60(%rsp),$xc3 # $xa1 1457 1458 vpaddd 0x180-0x200(%rax),$xc0,$xc0 1459 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 1460 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 1461 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 1462 1463 vpunpckldq $xc1,$xc0,$xt2 1464 vpunpckldq $xc3,$xc2,$xt3 1465 vpunpckhdq $xc1,$xc0,$xc0 1466 vpunpckhdq $xc3,$xc2,$xc2 1467 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 1468 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 1469 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 1470 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 1471___ 1472 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 1473$code.=<<___; 1474 vpaddd 0x200-0x200(%rax),$xd0,$xd0 1475 vpaddd 0x220-0x200(%rax),$xd1,$xd1 1476 vpaddd 0x240-0x200(%rax),$xd2,$xd2 1477 vpaddd 0x260-0x200(%rax),$xd3,$xd3 1478 1479 vpunpckldq $xd1,$xd0,$xt2 1480 vpunpckldq $xd3,$xd2,$xt3 1481 vpunpckhdq $xd1,$xd0,$xd0 1482 vpunpckhdq $xd3,$xd2,$xd2 1483 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 1484 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 1485 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 1486 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 1487___ 1488 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 1489$code.=<<___; 1490 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further 1491 vperm2i128 \$0x31,$xd0,$xc0,$xd0 1492 vperm2i128 \$0x20,$xd1,$xc1,$xc0 1493 vperm2i128 \$0x31,$xd1,$xc1,$xd1 1494 vperm2i128 \$0x20,$xd2,$xc2,$xc1 1495 vperm2i128 \$0x31,$xd2,$xc2,$xd2 1496 vperm2i128 \$0x20,$xd3,$xc3,$xc2 1497 vperm2i128 \$0x31,$xd3,$xc3,$xd3 1498___ 1499 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 1500 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= 1501 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); 1502 ($xa0,$xa1)=($xt2,$xt3); 1503$code.=<<___; 1504 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? 
1505 vmovdqa 0x20(%rsp),$xa1 1506 1507 cmp \$64*8,$len 1508 jb .Ltail8x 1509 1510 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1511 vpxor 0x20($inp),$xb0,$xb0 1512 vpxor 0x40($inp),$xc0,$xc0 1513 vpxor 0x60($inp),$xd0,$xd0 1514 lea 0x80($inp),$inp # size optimization 1515 vmovdqu $xa0,0x00($out) 1516 vmovdqu $xb0,0x20($out) 1517 vmovdqu $xc0,0x40($out) 1518 vmovdqu $xd0,0x60($out) 1519 lea 0x80($out),$out # size optimization 1520 1521 vpxor 0x00($inp),$xa1,$xa1 1522 vpxor 0x20($inp),$xb1,$xb1 1523 vpxor 0x40($inp),$xc1,$xc1 1524 vpxor 0x60($inp),$xd1,$xd1 1525 lea 0x80($inp),$inp # size optimization 1526 vmovdqu $xa1,0x00($out) 1527 vmovdqu $xb1,0x20($out) 1528 vmovdqu $xc1,0x40($out) 1529 vmovdqu $xd1,0x60($out) 1530 lea 0x80($out),$out # size optimization 1531 1532 vpxor 0x00($inp),$xa2,$xa2 1533 vpxor 0x20($inp),$xb2,$xb2 1534 vpxor 0x40($inp),$xc2,$xc2 1535 vpxor 0x60($inp),$xd2,$xd2 1536 lea 0x80($inp),$inp # size optimization 1537 vmovdqu $xa2,0x00($out) 1538 vmovdqu $xb2,0x20($out) 1539 vmovdqu $xc2,0x40($out) 1540 vmovdqu $xd2,0x60($out) 1541 lea 0x80($out),$out # size optimization 1542 1543 vpxor 0x00($inp),$xa3,$xa3 1544 vpxor 0x20($inp),$xb3,$xb3 1545 vpxor 0x40($inp),$xc3,$xc3 1546 vpxor 0x60($inp),$xd3,$xd3 1547 lea 0x80($inp),$inp # size optimization 1548 vmovdqu $xa3,0x00($out) 1549 vmovdqu $xb3,0x20($out) 1550 vmovdqu $xc3,0x40($out) 1551 vmovdqu $xd3,0x60($out) 1552 lea 0x80($out),$out # size optimization 1553 1554 sub \$64*8,$len 1555 jnz .Loop_outer8x 1556 1557 jmp .Ldone8x 1558 1559.Ltail8x: 1560 cmp \$448,$len 1561 jae .L448_or_more8x 1562 cmp \$384,$len 1563 jae .L384_or_more8x 1564 cmp \$320,$len 1565 jae .L320_or_more8x 1566 cmp \$256,$len 1567 jae .L256_or_more8x 1568 cmp \$192,$len 1569 jae .L192_or_more8x 1570 cmp \$128,$len 1571 jae .L128_or_more8x 1572 cmp \$64,$len 1573 jae .L64_or_more8x 1574 1575 xor %r10,%r10 1576 vmovdqa $xa0,0x00(%rsp) 1577 vmovdqa $xb0,0x20(%rsp) 1578 jmp .Loop_tail8x 1579 1580.align 32 1581.L64_or_more8x: 1582 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1583 vpxor 0x20($inp),$xb0,$xb0 1584 vmovdqu $xa0,0x00($out) 1585 vmovdqu $xb0,0x20($out) 1586 je .Ldone8x 1587 1588 lea 0x40($inp),$inp # inp+=64*1 1589 xor %r10,%r10 1590 vmovdqa $xc0,0x00(%rsp) 1591 lea 0x40($out),$out # out+=64*1 1592 sub \$64,$len # len-=64*1 1593 vmovdqa $xd0,0x20(%rsp) 1594 jmp .Loop_tail8x 1595 1596.align 32 1597.L128_or_more8x: 1598 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1599 vpxor 0x20($inp),$xb0,$xb0 1600 vpxor 0x40($inp),$xc0,$xc0 1601 vpxor 0x60($inp),$xd0,$xd0 1602 vmovdqu $xa0,0x00($out) 1603 vmovdqu $xb0,0x20($out) 1604 vmovdqu $xc0,0x40($out) 1605 vmovdqu $xd0,0x60($out) 1606 je .Ldone8x 1607 1608 lea 0x80($inp),$inp # inp+=64*2 1609 xor %r10,%r10 1610 vmovdqa $xa1,0x00(%rsp) 1611 lea 0x80($out),$out # out+=64*2 1612 sub \$128,$len # len-=64*2 1613 vmovdqa $xb1,0x20(%rsp) 1614 jmp .Loop_tail8x 1615 1616.align 32 1617.L192_or_more8x: 1618 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1619 vpxor 0x20($inp),$xb0,$xb0 1620 vpxor 0x40($inp),$xc0,$xc0 1621 vpxor 0x60($inp),$xd0,$xd0 1622 vpxor 0x80($inp),$xa1,$xa1 1623 vpxor 0xa0($inp),$xb1,$xb1 1624 vmovdqu $xa0,0x00($out) 1625 vmovdqu $xb0,0x20($out) 1626 vmovdqu $xc0,0x40($out) 1627 vmovdqu $xd0,0x60($out) 1628 vmovdqu $xa1,0x80($out) 1629 vmovdqu $xb1,0xa0($out) 1630 je .Ldone8x 1631 1632 lea 0xc0($inp),$inp # inp+=64*3 1633 xor %r10,%r10 1634 vmovdqa $xc1,0x00(%rsp) 1635 lea 0xc0($out),$out # out+=64*3 1636 sub \$192,$len # len-=64*3 1637 vmovdqa $xd1,0x20(%rsp) 1638 jmp .Loop_tail8x 1639 1640.align 32 
1641.L256_or_more8x: 1642 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1643 vpxor 0x20($inp),$xb0,$xb0 1644 vpxor 0x40($inp),$xc0,$xc0 1645 vpxor 0x60($inp),$xd0,$xd0 1646 vpxor 0x80($inp),$xa1,$xa1 1647 vpxor 0xa0($inp),$xb1,$xb1 1648 vpxor 0xc0($inp),$xc1,$xc1 1649 vpxor 0xe0($inp),$xd1,$xd1 1650 vmovdqu $xa0,0x00($out) 1651 vmovdqu $xb0,0x20($out) 1652 vmovdqu $xc0,0x40($out) 1653 vmovdqu $xd0,0x60($out) 1654 vmovdqu $xa1,0x80($out) 1655 vmovdqu $xb1,0xa0($out) 1656 vmovdqu $xc1,0xc0($out) 1657 vmovdqu $xd1,0xe0($out) 1658 je .Ldone8x 1659 1660 lea 0x100($inp),$inp # inp+=64*4 1661 xor %r10,%r10 1662 vmovdqa $xa2,0x00(%rsp) 1663 lea 0x100($out),$out # out+=64*4 1664 sub \$256,$len # len-=64*4 1665 vmovdqa $xb2,0x20(%rsp) 1666 jmp .Loop_tail8x 1667 1668.align 32 1669.L320_or_more8x: 1670 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1671 vpxor 0x20($inp),$xb0,$xb0 1672 vpxor 0x40($inp),$xc0,$xc0 1673 vpxor 0x60($inp),$xd0,$xd0 1674 vpxor 0x80($inp),$xa1,$xa1 1675 vpxor 0xa0($inp),$xb1,$xb1 1676 vpxor 0xc0($inp),$xc1,$xc1 1677 vpxor 0xe0($inp),$xd1,$xd1 1678 vpxor 0x100($inp),$xa2,$xa2 1679 vpxor 0x120($inp),$xb2,$xb2 1680 vmovdqu $xa0,0x00($out) 1681 vmovdqu $xb0,0x20($out) 1682 vmovdqu $xc0,0x40($out) 1683 vmovdqu $xd0,0x60($out) 1684 vmovdqu $xa1,0x80($out) 1685 vmovdqu $xb1,0xa0($out) 1686 vmovdqu $xc1,0xc0($out) 1687 vmovdqu $xd1,0xe0($out) 1688 vmovdqu $xa2,0x100($out) 1689 vmovdqu $xb2,0x120($out) 1690 je .Ldone8x 1691 1692 lea 0x140($inp),$inp # inp+=64*5 1693 xor %r10,%r10 1694 vmovdqa $xc2,0x00(%rsp) 1695 lea 0x140($out),$out # out+=64*5 1696 sub \$320,$len # len-=64*5 1697 vmovdqa $xd2,0x20(%rsp) 1698 jmp .Loop_tail8x 1699 1700.align 32 1701.L384_or_more8x: 1702 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1703 vpxor 0x20($inp),$xb0,$xb0 1704 vpxor 0x40($inp),$xc0,$xc0 1705 vpxor 0x60($inp),$xd0,$xd0 1706 vpxor 0x80($inp),$xa1,$xa1 1707 vpxor 0xa0($inp),$xb1,$xb1 1708 vpxor 0xc0($inp),$xc1,$xc1 1709 vpxor 0xe0($inp),$xd1,$xd1 1710 vpxor 0x100($inp),$xa2,$xa2 1711 vpxor 0x120($inp),$xb2,$xb2 1712 vpxor 0x140($inp),$xc2,$xc2 1713 vpxor 0x160($inp),$xd2,$xd2 1714 vmovdqu $xa0,0x00($out) 1715 vmovdqu $xb0,0x20($out) 1716 vmovdqu $xc0,0x40($out) 1717 vmovdqu $xd0,0x60($out) 1718 vmovdqu $xa1,0x80($out) 1719 vmovdqu $xb1,0xa0($out) 1720 vmovdqu $xc1,0xc0($out) 1721 vmovdqu $xd1,0xe0($out) 1722 vmovdqu $xa2,0x100($out) 1723 vmovdqu $xb2,0x120($out) 1724 vmovdqu $xc2,0x140($out) 1725 vmovdqu $xd2,0x160($out) 1726 je .Ldone8x 1727 1728 lea 0x180($inp),$inp # inp+=64*6 1729 xor %r10,%r10 1730 vmovdqa $xa3,0x00(%rsp) 1731 lea 0x180($out),$out # out+=64*6 1732 sub \$384,$len # len-=64*6 1733 vmovdqa $xb3,0x20(%rsp) 1734 jmp .Loop_tail8x 1735 1736.align 32 1737.L448_or_more8x: 1738 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1739 vpxor 0x20($inp),$xb0,$xb0 1740 vpxor 0x40($inp),$xc0,$xc0 1741 vpxor 0x60($inp),$xd0,$xd0 1742 vpxor 0x80($inp),$xa1,$xa1 1743 vpxor 0xa0($inp),$xb1,$xb1 1744 vpxor 0xc0($inp),$xc1,$xc1 1745 vpxor 0xe0($inp),$xd1,$xd1 1746 vpxor 0x100($inp),$xa2,$xa2 1747 vpxor 0x120($inp),$xb2,$xb2 1748 vpxor 0x140($inp),$xc2,$xc2 1749 vpxor 0x160($inp),$xd2,$xd2 1750 vpxor 0x180($inp),$xa3,$xa3 1751 vpxor 0x1a0($inp),$xb3,$xb3 1752 vmovdqu $xa0,0x00($out) 1753 vmovdqu $xb0,0x20($out) 1754 vmovdqu $xc0,0x40($out) 1755 vmovdqu $xd0,0x60($out) 1756 vmovdqu $xa1,0x80($out) 1757 vmovdqu $xb1,0xa0($out) 1758 vmovdqu $xc1,0xc0($out) 1759 vmovdqu $xd1,0xe0($out) 1760 vmovdqu $xa2,0x100($out) 1761 vmovdqu $xb2,0x120($out) 1762 vmovdqu $xc2,0x140($out) 1763 vmovdqu $xd2,0x160($out) 1764 
vmovdqu $xa3,0x180($out) 1765 vmovdqu $xb3,0x1a0($out) 1766 je .Ldone8x 1767 1768 lea 0x1c0($inp),$inp # inp+=64*7 1769 xor %r10,%r10 1770 vmovdqa $xc3,0x00(%rsp) 1771 lea 0x1c0($out),$out # out+=64*7 1772 sub \$448,$len # len-=64*7 1773 vmovdqa $xd3,0x20(%rsp) 1774 1775.Loop_tail8x: 1776 movzb ($inp,%r10),%eax 1777 movzb (%rsp,%r10),%ecx 1778 lea 1(%r10),%r10 1779 xor %ecx,%eax 1780 mov %al,-1($out,%r10) 1781 dec $len 1782 jnz .Loop_tail8x 1783 1784.Ldone8x: 1785 vzeroall 1786___ 1787$code.=<<___ if ($win64); 1788 movaps -0xa8(%r9),%xmm6 1789 movaps -0x98(%r9),%xmm7 1790 movaps -0x88(%r9),%xmm8 1791 movaps -0x78(%r9),%xmm9 1792 movaps -0x68(%r9),%xmm10 1793 movaps -0x58(%r9),%xmm11 1794 movaps -0x48(%r9),%xmm12 1795 movaps -0x38(%r9),%xmm13 1796 movaps -0x28(%r9),%xmm14 1797 movaps -0x18(%r9),%xmm15 1798___ 1799$code.=<<___; 1800 lea (%r9),%rsp 1801.cfi_def_cfa_register rsp 1802.L8x_epilogue: 1803 ret 1804.cfi_endproc 1805.size ChaCha20_8x,.-ChaCha20_8x 1806___ 1807} 1808 1809######################################################################## 1810# AVX512 code paths 1811if ($avx>2) { 1812# This one handles shorter inputs... 1813 1814my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); 1815my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 1816 1817sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round 1818 &vpaddd ($a,$a,$b); 1819 &vpxord ($d,$d,$a); 1820 &vprold ($d,$d,16); 1821 1822 &vpaddd ($c,$c,$d); 1823 &vpxord ($b,$b,$c); 1824 &vprold ($b,$b,12); 1825 1826 &vpaddd ($a,$a,$b); 1827 &vpxord ($d,$d,$a); 1828 &vprold ($d,$d,8); 1829 1830 &vpaddd ($c,$c,$d); 1831 &vpxord ($b,$b,$c); 1832 &vprold ($b,$b,7); 1833} 1834 1835my $xframe = $win64 ? 32+8 : 8; 1836 1837$code.=<<___; 1838.type ChaCha20_avx512,\@function,5 1839.align 32 1840ChaCha20_avx512: 1841.LChaCha20_avx512: 1842.cfi_startproc 1843 mov %rsp,%r9 # frame pointer 1844.cfi_def_cfa_register r9 1845 cmp \$512,$len 1846 ja .LChaCha20_16x 1847 1848 sub \$64+$xframe,%rsp 1849___ 1850$code.=<<___ if ($win64); 1851 movaps %xmm6,-0x28(%r9) 1852 movaps %xmm7,-0x18(%r9) 1853.Lavx512_body: 1854___ 1855$code.=<<___; 1856 vbroadcasti32x4 .Lsigma(%rip),$a 1857 vbroadcasti32x4 ($key),$b 1858 vbroadcasti32x4 16($key),$c 1859 vbroadcasti32x4 ($counter),$d 1860 1861 vmovdqa32 $a,$a_ 1862 vmovdqa32 $b,$b_ 1863 vmovdqa32 $c,$c_ 1864 vpaddd .Lzeroz(%rip),$d,$d 1865 vmovdqa32 .Lfourz(%rip),$fourz 1866 mov \$10,$counter # reuse $counter 1867 vmovdqa32 $d,$d_ 1868 jmp .Loop_avx512 1869 1870.align 16 1871.Loop_outer_avx512: 1872 vmovdqa32 $a_,$a 1873 vmovdqa32 $b_,$b 1874 vmovdqa32 $c_,$c 1875 vpaddd $fourz,$d_,$d 1876 mov \$10,$counter 1877 vmovdqa32 $d,$d_ 1878 jmp .Loop_avx512 1879 1880.align 32 1881.Loop_avx512: 1882___ 1883 &AVX512ROUND(); 1884 &vpshufd ($c,$c,0b01001110); 1885 &vpshufd ($b,$b,0b00111001); 1886 &vpshufd ($d,$d,0b10010011); 1887 1888 &AVX512ROUND(); 1889 &vpshufd ($c,$c,0b01001110); 1890 &vpshufd ($b,$b,0b10010011); 1891 &vpshufd ($d,$d,0b00111001); 1892 1893 &dec ($counter); 1894 &jnz (".Loop_avx512"); 1895 1896$code.=<<___; 1897 vpaddd $a_,$a,$a 1898 vpaddd $b_,$b,$b 1899 vpaddd $c_,$c,$c 1900 vpaddd $d_,$d,$d 1901 1902 sub \$64,$len 1903 jb .Ltail64_avx512 1904 1905 vpxor 0x00($inp),%x#$a,$t0 # xor with input 1906 vpxor 0x10($inp),%x#$b,$t1 1907 vpxor 0x20($inp),%x#$c,$t2 1908 vpxor 0x30($inp),%x#$d,$t3 1909 lea 0x40($inp),$inp # inp+=64 1910 1911 vmovdqu $t0,0x00($out) # write output 1912 vmovdqu $t1,0x10($out) 1913 vmovdqu $t2,0x20($out) 1914 vmovdqu $t3,0x30($out) 1915 lea 0x40($out),$out # out+=64 
1916 1917 jz .Ldone_avx512 1918 1919 vextracti32x4 \$1,$a,$t0 1920 vextracti32x4 \$1,$b,$t1 1921 vextracti32x4 \$1,$c,$t2 1922 vextracti32x4 \$1,$d,$t3 1923 1924 sub \$64,$len 1925 jb .Ltail_avx512 1926 1927 vpxor 0x00($inp),$t0,$t0 # xor with input 1928 vpxor 0x10($inp),$t1,$t1 1929 vpxor 0x20($inp),$t2,$t2 1930 vpxor 0x30($inp),$t3,$t3 1931 lea 0x40($inp),$inp # inp+=64 1932 1933 vmovdqu $t0,0x00($out) # write output 1934 vmovdqu $t1,0x10($out) 1935 vmovdqu $t2,0x20($out) 1936 vmovdqu $t3,0x30($out) 1937 lea 0x40($out),$out # out+=64 1938 1939 jz .Ldone_avx512 1940 1941 vextracti32x4 \$2,$a,$t0 1942 vextracti32x4 \$2,$b,$t1 1943 vextracti32x4 \$2,$c,$t2 1944 vextracti32x4 \$2,$d,$t3 1945 1946 sub \$64,$len 1947 jb .Ltail_avx512 1948 1949 vpxor 0x00($inp),$t0,$t0 # xor with input 1950 vpxor 0x10($inp),$t1,$t1 1951 vpxor 0x20($inp),$t2,$t2 1952 vpxor 0x30($inp),$t3,$t3 1953 lea 0x40($inp),$inp # inp+=64 1954 1955 vmovdqu $t0,0x00($out) # write output 1956 vmovdqu $t1,0x10($out) 1957 vmovdqu $t2,0x20($out) 1958 vmovdqu $t3,0x30($out) 1959 lea 0x40($out),$out # out+=64 1960 1961 jz .Ldone_avx512 1962 1963 vextracti32x4 \$3,$a,$t0 1964 vextracti32x4 \$3,$b,$t1 1965 vextracti32x4 \$3,$c,$t2 1966 vextracti32x4 \$3,$d,$t3 1967 1968 sub \$64,$len 1969 jb .Ltail_avx512 1970 1971 vpxor 0x00($inp),$t0,$t0 # xor with input 1972 vpxor 0x10($inp),$t1,$t1 1973 vpxor 0x20($inp),$t2,$t2 1974 vpxor 0x30($inp),$t3,$t3 1975 lea 0x40($inp),$inp # inp+=64 1976 1977 vmovdqu $t0,0x00($out) # write output 1978 vmovdqu $t1,0x10($out) 1979 vmovdqu $t2,0x20($out) 1980 vmovdqu $t3,0x30($out) 1981 lea 0x40($out),$out # out+=64 1982 1983 jnz .Loop_outer_avx512 1984 1985 jmp .Ldone_avx512 1986 1987.align 16 1988.Ltail64_avx512: 1989 vmovdqa %x#$a,0x00(%rsp) 1990 vmovdqa %x#$b,0x10(%rsp) 1991 vmovdqa %x#$c,0x20(%rsp) 1992 vmovdqa %x#$d,0x30(%rsp) 1993 add \$64,$len 1994 jmp .Loop_tail_avx512 1995 1996.align 16 1997.Ltail_avx512: 1998 vmovdqa $t0,0x00(%rsp) 1999 vmovdqa $t1,0x10(%rsp) 2000 vmovdqa $t2,0x20(%rsp) 2001 vmovdqa $t3,0x30(%rsp) 2002 add \$64,$len 2003 2004.Loop_tail_avx512: 2005 movzb ($inp,$counter),%eax 2006 movzb (%rsp,$counter),%ecx 2007 lea 1($counter),$counter 2008 xor %ecx,%eax 2009 mov %al,-1($out,$counter) 2010 dec $len 2011 jnz .Loop_tail_avx512 2012 2013 vmovdqa32 $a_,0x00(%rsp) 2014 2015.Ldone_avx512: 2016 vzeroall 2017___ 2018$code.=<<___ if ($win64); 2019 movaps -0x28(%r9),%xmm6 2020 movaps -0x18(%r9),%xmm7 2021___ 2022$code.=<<___; 2023 lea (%r9),%rsp 2024.cfi_def_cfa_register rsp 2025.Lavx512_epilogue: 2026 ret 2027.cfi_endproc 2028.size ChaCha20_avx512,.-ChaCha20_avx512 2029___ 2030} 2031if ($avx>2) { 2032# This one handles longer inputs... 
2033 2034my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 2035 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); 2036my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 2037 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); 2038my @key=map("%zmm$_",(16..31)); 2039my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; 2040 2041sub AVX512_lane_ROUND { 2042my ($a0,$b0,$c0,$d0)=@_; 2043my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 2044my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 2045my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 2046my @x=map("\"$_\"",@xx); 2047 2048 ( 2049 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 2050 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 2051 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 2052 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 2053 "&vpxord (@x[$d0],@x[$d0],@x[$a0])", 2054 "&vpxord (@x[$d1],@x[$d1],@x[$a1])", 2055 "&vpxord (@x[$d2],@x[$d2],@x[$a2])", 2056 "&vpxord (@x[$d3],@x[$d3],@x[$a3])", 2057 "&vprold (@x[$d0],@x[$d0],16)", 2058 "&vprold (@x[$d1],@x[$d1],16)", 2059 "&vprold (@x[$d2],@x[$d2],16)", 2060 "&vprold (@x[$d3],@x[$d3],16)", 2061 2062 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", 2063 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", 2064 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", 2065 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", 2066 "&vpxord (@x[$b0],@x[$b0],@x[$c0])", 2067 "&vpxord (@x[$b1],@x[$b1],@x[$c1])", 2068 "&vpxord (@x[$b2],@x[$b2],@x[$c2])", 2069 "&vpxord (@x[$b3],@x[$b3],@x[$c3])", 2070 "&vprold (@x[$b0],@x[$b0],12)", 2071 "&vprold (@x[$b1],@x[$b1],12)", 2072 "&vprold (@x[$b2],@x[$b2],12)", 2073 "&vprold (@x[$b3],@x[$b3],12)", 2074 2075 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", 2076 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", 2077 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", 2078 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", 2079 "&vpxord (@x[$d0],@x[$d0],@x[$a0])", 2080 "&vpxord (@x[$d1],@x[$d1],@x[$a1])", 2081 "&vpxord (@x[$d2],@x[$d2],@x[$a2])", 2082 "&vpxord (@x[$d3],@x[$d3],@x[$a3])", 2083 "&vprold (@x[$d0],@x[$d0],8)", 2084 "&vprold (@x[$d1],@x[$d1],8)", 2085 "&vprold (@x[$d2],@x[$d2],8)", 2086 "&vprold (@x[$d3],@x[$d3],8)", 2087 2088 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", 2089 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", 2090 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", 2091 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", 2092 "&vpxord (@x[$b0],@x[$b0],@x[$c0])", 2093 "&vpxord (@x[$b1],@x[$b1],@x[$c1])", 2094 "&vpxord (@x[$b2],@x[$b2],@x[$c2])", 2095 "&vpxord (@x[$b3],@x[$b3],@x[$c3])", 2096 "&vprold (@x[$b0],@x[$b0],7)", 2097 "&vprold (@x[$b1],@x[$b1],7)", 2098 "&vprold (@x[$b2],@x[$b2],7)", 2099 "&vprold (@x[$b3],@x[$b3],7)" 2100 ); 2101} 2102 2103my $xframe = $win64 ? 0xa8 : 8; 2104 2105$code.=<<___; 2106.type ChaCha20_16x,\@function,5 2107.align 32 2108ChaCha20_16x: 2109.LChaCha20_16x: 2110.cfi_startproc 2111 mov %rsp,%r9 # frame register 2112.cfi_def_cfa_register r9 2113 sub \$64+$xframe,%rsp 2114 and \$-64,%rsp 2115___ 2116$code.=<<___ if ($win64); 2117 movaps %xmm6,-0xa8(%r9) 2118 movaps %xmm7,-0x98(%r9) 2119 movaps %xmm8,-0x88(%r9) 2120 movaps %xmm9,-0x78(%r9) 2121 movaps %xmm10,-0x68(%r9) 2122 movaps %xmm11,-0x58(%r9) 2123 movaps %xmm12,-0x48(%r9) 2124 movaps %xmm13,-0x38(%r9) 2125 movaps %xmm14,-0x28(%r9) 2126 movaps %xmm15,-0x18(%r9) 2127.L16x_body: 2128___ 2129$code.=<<___; 2130 vzeroupper 2131 2132 lea .Lsigma(%rip),%r10 2133 vbroadcasti32x4 (%r10),$xa3 # key[0] 2134 vbroadcasti32x4 ($key),$xb3 # key[1] 2135 vbroadcasti32x4 16($key),$xc3 # key[2] 2136 vbroadcasti32x4 ($counter),$xd3 # key[3] 2137 2138 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 
	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd	\$0x55,$xa3,$xa1
	vpshufd	\$0xaa,$xa3,$xa2
	vpshufd	\$0xff,$xa3,$xa3
	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	vpshufd	\$0x00,$xb3,$xb0
	vpshufd	\$0x55,$xb3,$xb1
	vpshufd	\$0xaa,$xb3,$xb2
	vpshufd	\$0xff,$xb3,$xb3
	vmovdqa64	$xb0,@key[4]
	vmovdqa64	$xb1,@key[5]
	vmovdqa64	$xb2,@key[6]
	vmovdqa64	$xb3,@key[7]

	vpshufd	\$0x00,$xc3,$xc0
	vpshufd	\$0x55,$xc3,$xc1
	vpshufd	\$0xaa,$xc3,$xc2
	vpshufd	\$0xff,$xc3,$xc3
	vmovdqa64	$xc0,@key[8]
	vmovdqa64	$xc1,@key[9]
	vmovdqa64	$xc2,@key[10]
	vmovdqa64	$xc3,@key[11]

	vpshufd	\$0x00,$xd3,$xd0
	vpshufd	\$0x55,$xd3,$xd1
	vpshufd	\$0xaa,$xd3,$xd2
	vpshufd	\$0xff,$xd3,$xd3
	vpaddd	.Lincz(%rip),$xd0,$xd0	# don't save counters yet
	vmovdqa64	$xd0,@key[12]
	vmovdqa64	$xd1,@key[13]
	vmovdqa64	$xd2,@key[14]
	vmovdqa64	$xd3,@key[15]

	mov	\$10,%eax
	jmp	.Loop16x

.align	32
.Loop_outer16x:
	vpbroadcastd	0(%r10),$xa0		# reload key
	vpbroadcastd	4(%r10),$xa1
	vpbroadcastd	8(%r10),$xa2
	vpbroadcastd	12(%r10),$xa3
	vpaddd	.Lsixteen(%rip),@key[12],@key[12]	# next SIMD counters
	vmovdqa64	@key[4],$xb0
	vmovdqa64	@key[5],$xb1
	vmovdqa64	@key[6],$xb2
	vmovdqa64	@key[7],$xb3
	vmovdqa64	@key[8],$xc0
	vmovdqa64	@key[9],$xc1
	vmovdqa64	@key[10],$xc2
	vmovdqa64	@key[11],$xc3
	vmovdqa64	@key[12],$xd0
	vmovdqa64	@key[13],$xd1
	vmovdqa64	@key[14],$xd2
	vmovdqa64	@key[15],$xd3

	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	mov	\$10,%eax
	jmp	.Loop16x

.align	32
.Loop16x:
___
	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec	%eax
	jnz	.Loop16x

	vpaddd	@key[0],$xa0,$xa0	# accumulate key
	vpaddd	@key[1],$xa1,$xa1
	vpaddd	@key[2],$xa2,$xa2
	vpaddd	@key[3],$xa3,$xa3

	vpunpckldq	$xa1,$xa0,$xt2	# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
___
	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
$code.=<<___;
	vpaddd	@key[4],$xb0,$xb0
	vpaddd	@key[5],$xb1,$xb1
	vpaddd	@key[6],$xb2,$xb2
	vpaddd	@key[7],$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
___
	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
$code.=<<___;
	vshufi32x4	\$0x44,$xb0,$xa0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xb0,$xa0,$xb0
	vshufi32x4	\$0x44,$xb1,$xa1,$xa0
	vshufi32x4	\$0xee,$xb1,$xa1,$xb1
	vshufi32x4	\$0x44,$xb2,$xa2,$xa1
	vshufi32x4	\$0xee,$xb2,$xa2,$xb2
	vshufi32x4	\$0x44,$xb3,$xa3,$xa2
	vshufi32x4	\$0xee,$xb3,$xa3,$xb3
___
	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
$code.=<<___;
	vpaddd	@key[8],$xc0,$xc0
	vpaddd	@key[9],$xc1,$xc1
	vpaddd	@key[10],$xc2,$xc2
	vpaddd	@key[11],$xc3,$xc3

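	# same 4x4 dword transpose for the "c" group as for "a" and "b" above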
	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
___
	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
$code.=<<___;
	vpaddd	@key[12],$xd0,$xd0
	vpaddd	@key[13],$xd1,$xd1
	vpaddd	@key[14],$xd2,$xd2
	vpaddd	@key[15],$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
___
	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
$code.=<<___;
	vshufi32x4	\$0x44,$xd0,$xc0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xd0,$xc0,$xd0
	vshufi32x4	\$0x44,$xd1,$xc1,$xc0
	vshufi32x4	\$0xee,$xd1,$xc1,$xd1
	vshufi32x4	\$0x44,$xd2,$xc2,$xc1
	vshufi32x4	\$0xee,$xd2,$xc2,$xd2
	vshufi32x4	\$0x44,$xd3,$xc3,$xc2
	vshufi32x4	\$0xee,$xd3,$xc3,$xd3
___
	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
$code.=<<___;
	vshufi32x4	\$0x88,$xc0,$xa0,$xt0	# "de-interlace" further
	vshufi32x4	\$0xdd,$xc0,$xa0,$xa0
	vshufi32x4	\$0x88,$xd0,$xb0,$xc0
	vshufi32x4	\$0xdd,$xd0,$xb0,$xd0
	vshufi32x4	\$0x88,$xc1,$xa1,$xt1
	vshufi32x4	\$0xdd,$xc1,$xa1,$xa1
	vshufi32x4	\$0x88,$xd1,$xb1,$xc1
	vshufi32x4	\$0xdd,$xd1,$xb1,$xd1
	vshufi32x4	\$0x88,$xc2,$xa2,$xt2
	vshufi32x4	\$0xdd,$xc2,$xa2,$xa2
	vshufi32x4	\$0x88,$xd2,$xb2,$xc2
	vshufi32x4	\$0xdd,$xd2,$xb2,$xd2
	vshufi32x4	\$0x88,$xc3,$xa3,$xt3
	vshufi32x4	\$0xdd,$xc3,$xa3,$xa3
	vshufi32x4	\$0x88,$xd3,$xb3,$xc3
	vshufi32x4	\$0xdd,$xd3,$xb3,$xd3
___
	($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
	($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);

	($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
	 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
	($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
$code.=<<___;
	cmp	\$64*16,$len
	jb	.Ltail16x

	vpxord	0x00($inp),$xa0,$xa0	# xor with input
	vpxord	0x40($inp),$xb0,$xb0
	vpxord	0x80($inp),$xc0,$xc0
	vpxord	0xc0($inp),$xd0,$xd0
	vmovdqu32	$xa0,0x00($out)
	vmovdqu32	$xb0,0x40($out)
	vmovdqu32	$xc0,0x80($out)
	vmovdqu32	$xd0,0xc0($out)

	vpxord	0x100($inp),$xa1,$xa1
	vpxord	0x140($inp),$xb1,$xb1
	vpxord	0x180($inp),$xc1,$xc1
	vpxord	0x1c0($inp),$xd1,$xd1
	vmovdqu32	$xa1,0x100($out)
	vmovdqu32	$xb1,0x140($out)
	vmovdqu32	$xc1,0x180($out)
	vmovdqu32	$xd1,0x1c0($out)

	vpxord	0x200($inp),$xa2,$xa2
	vpxord	0x240($inp),$xb2,$xb2
	vpxord	0x280($inp),$xc2,$xc2
	vpxord	0x2c0($inp),$xd2,$xd2
	vmovdqu32	$xa2,0x200($out)
	vmovdqu32	$xb2,0x240($out)
	vmovdqu32	$xc2,0x280($out)
	vmovdqu32	$xd2,0x2c0($out)

	vpxord	0x300($inp),$xa3,$xa3
	vpxord	0x340($inp),$xb3,$xb3
	vpxord	0x380($inp),$xc3,$xc3
	vpxord	0x3c0($inp),$xd3,$xd3
	lea	0x400($inp),$inp
	vmovdqu32	$xa3,0x300($out)
	vmovdqu32	$xb3,0x340($out)
	vmovdqu32	$xc3,0x380($out)
	vmovdqu32	$xd3,0x3c0($out)
	lea	0x400($out),$out

	sub	\$64*16,$len
	jnz	.Loop_outer16x

	jmp	.Ldone16x

.align	32
.Ltail16x:
	xor	%r10,%r10
	sub	$inp,$out
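	# $out is biased by the current $inp so that ($out,$inp) keeps
	# addressing the right output byte while only $inp advances; whole
	# 64-byte blocks are flushed below, and the keystream for the final
	# partial block is kept in $xa0 for .Less_than_64_16x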
	cmp	\$64*1,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xa0,$xa0	# xor with input
	vmovdqu32	$xa0,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xb0,$xa0
	lea	64($inp),$inp

	cmp	\$64*2,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xb0,$xb0
	vmovdqu32	$xb0,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xc0,$xa0
	lea	64($inp),$inp

	cmp	\$64*3,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xc0,$xc0
	vmovdqu32	$xc0,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xd0,$xa0
	lea	64($inp),$inp

	cmp	\$64*4,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xd0,$xd0
	vmovdqu32	$xd0,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xa1,$xa0
	lea	64($inp),$inp

	cmp	\$64*5,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xa1,$xa1
	vmovdqu32	$xa1,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xb1,$xa0
	lea	64($inp),$inp

	cmp	\$64*6,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xb1,$xb1
	vmovdqu32	$xb1,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xc1,$xa0
	lea	64($inp),$inp

	cmp	\$64*7,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xc1,$xc1
	vmovdqu32	$xc1,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xd1,$xa0
	lea	64($inp),$inp

	cmp	\$64*8,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xd1,$xd1
	vmovdqu32	$xd1,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xa2,$xa0
	lea	64($inp),$inp

	cmp	\$64*9,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xa2,$xa2
	vmovdqu32	$xa2,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xb2,$xa0
	lea	64($inp),$inp

	cmp	\$64*10,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xb2,$xb2
	vmovdqu32	$xb2,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xc2,$xa0
	lea	64($inp),$inp

	cmp	\$64*11,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xc2,$xc2
	vmovdqu32	$xc2,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xd2,$xa0
	lea	64($inp),$inp

	cmp	\$64*12,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xd2,$xd2
	vmovdqu32	$xd2,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xa3,$xa0
	lea	64($inp),$inp

	cmp	\$64*13,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xa3,$xa3
	vmovdqu32	$xa3,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xb3,$xa0
	lea	64($inp),$inp

	cmp	\$64*14,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xb3,$xb3
	vmovdqu32	$xb3,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xc3,$xa0
	lea	64($inp),$inp

	cmp	\$64*15,$len
	jb	.Less_than_64_16x
	vpxord	($inp),$xc3,$xc3
	vmovdqu32	$xc3,($out,$inp)
	je	.Ldone16x
	vmovdqa32	$xd3,$xa0
	lea	64($inp),$inp

.Less_than_64_16x:
	vmovdqa32	$xa0,0x00(%rsp)
	lea	($out,$inp),$out
	and	\$63,$len

.Loop_tail16x:
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
	dec	$len
	jnz	.Loop_tail16x

	vpxord	$xa0,$xa0,$xa0
	vmovdqa32	$xa0,0(%rsp)

.Ldone16x:
	vzeroall
___
$code.=<<___	if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	rsp
.L16x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_16x,.-ChaCha20_16x
___
}
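
# Win64 SEH overview: the .pdata/.xdata tables emitted further down pair
# every procedure in this module with one of the handlers below, so that
# RtlVirtualUnwind can restore the non-volatile registers (and, for the
# SIMD paths, the XMM spill area) if an exception is raised inside the
# function body:
#
#	ChaCha20_ctr32	-> se_handler    (fixed .Lctr32_body/.Lno_data window)
#	ChaCha20_ssse3	-> ssse3_handler (.Lssse3_body  .. .Lssse3_epilogue)
#	ChaCha20_4x	-> full_handler  (.L4x_body     .. .L4x_epilogue)
#	ChaCha20_8x	-> full_handler  (.L8x_body     .. .L8x_epilogue)
#	ChaCha20_avx512	-> ssse3_handler (.Lavx512_body .. .Lavx512_epilogue)
#	ChaCha20_16x	-> full_handler  (.L16x_body    .. .L16x_epilogue)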

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	lea	.Lctr32_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lctr32_body
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lno_data(%rip),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lno_data
	jae	.Lcommon_seh_tail

	lea	64+24+48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	192($context),%rax	# pull context->R9

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-0x28(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$4,%ecx
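	# copy 4 quadwords, i.e. the %xmm6-%xmm7 spill area saved by the
	# SSSE3/AVX512 prologues, into context->Xmm6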
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	ssse3_handler,.-ssse3_handler

.type	full_handler,\@abi-omnipotent
.align	16
full_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	192($context),%rax	# pull context->R9

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-0xa8(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	full_handler,.-full_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_ChaCha20_ctr32
	.rva	.LSEH_end_ChaCha20_ctr32
	.rva	.LSEH_info_ChaCha20_ctr32

	.rva	.LSEH_begin_ChaCha20_ssse3
	.rva	.LSEH_end_ChaCha20_ssse3
	.rva	.LSEH_info_ChaCha20_ssse3

	.rva	.LSEH_begin_ChaCha20_4x
	.rva	.LSEH_end_ChaCha20_4x
	.rva	.LSEH_info_ChaCha20_4x
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_ChaCha20_8x
	.rva	.LSEH_end_ChaCha20_8x
	.rva	.LSEH_info_ChaCha20_8x
___
$code.=<<___ if ($avx>2);
	.rva	.LSEH_begin_ChaCha20_avx512
	.rva	.LSEH_end_ChaCha20_avx512
	.rva	.LSEH_info_ChaCha20_avx512

	.rva	.LSEH_begin_ChaCha20_16x
	.rva	.LSEH_end_ChaCha20_16x
	.rva	.LSEH_info_ChaCha20_16x
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_ChaCha20_ctr32:
	.byte	9,0,0,0
	.rva	se_handler

.LSEH_info_ChaCha20_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lssse3_body,.Lssse3_epilogue

.LSEH_info_ChaCha20_4x:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.L4x_body,.L4x_epilogue
___
$code.=<<___ if ($avx>1);
.LSEH_info_ChaCha20_8x:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.L8x_body,.L8x_epilogue			# HandlerData[]
___
$code.=<<___ if ($avx>2);
.LSEH_info_ChaCha20_avx512:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lavx512_body,.Lavx512_epilogue		# HandlerData[]

.LSEH_info_ChaCha20_16x:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.L16x_body,.L16x_epilogue		# HandlerData[]
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/%x#%[yz]/%x/g;	# "down-shift"

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT";