1#!/usr/bin/env perl 2 3# Copyright (c) 2015, CloudFlare Ltd. 4# 5# Permission to use, copy, modify, and/or distribute this software for any 6# purpose with or without fee is hereby granted, provided that the above 7# copyright notice and this permission notice appear in all copies. 8# 9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 12# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 14# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 15# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ 16 17############################################################################## 18# # 19# Author: Vlad Krasnov # 20# # 21############################################################################## 22 23$flavour = shift; 24$output = shift; 25if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 26 27$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 28 29$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 30( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 31( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 32die "can't locate x86_64-xlate.pl"; 33 34open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 35*STDOUT=*OUT; 36 37$avx = 2; 38 39$code.=<<___; 40.text 41.extern OPENSSL_ia32cap_P 42 43chacha20_poly1305_constants: 44 45.align 64 46.chacha20_consts: 47.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' 48.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' 49.rol8: 50.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 51.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 52.rol16: 53.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 54.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 55.avx2_init: 56.long 0,0,0,0 57.sse_inc: 58.long 1,0,0,0 59.avx2_inc: 60.long 2,0,0,0,2,0,0,0 61.clamp: 62.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC 63.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF 64.align 16 65.and_masks: 66.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 67.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 68.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 69.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 70.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 71.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 72.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 73.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 74.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 75.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 76.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 77.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 78.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 79.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 80.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 81.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff 82___ 83 84my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8"); 85my ($acc0,$acc1,$acc2)=map("%r$_",(10..12)); 86my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9"); 87my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15)); 88my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); 89my $r_store="0*16(%rbp)"; 90my $s_store="1*16(%rbp)"; 91my $len_store="2*16(%rbp)"; 92my $state1_store="3*16(%rbp)"; 93my $state2_store="4*16(%rbp)"; 94my $tmp_store="5*16(%rbp)"; 95my $ctr0_store="6*16(%rbp)"; 96my $ctr1_store="7*16(%rbp)"; 97my $ctr2_store="8*16(%rbp)"; 98my $ctr3_store="9*16(%rbp)"; 99 100sub chacha_qr { 101my ($a,$b,$c,$d,$t,$dir)=@_; 102$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/); 103$code.="paddd $b, $a 104 pxor $a, $d 105 pshufb .rol16(%rip), $d 106 paddd $d, $c 107 pxor $c, $b 108 movdqa $b, $t 109 pslld \$12, $t 110 psrld \$20, $b 111 pxor $t, $b 112 paddd $b, $a 113 pxor $a, $d 114 pshufb .rol8(%rip), $d 115 paddd $d, $c 116 pxor $c, $b 117 movdqa $b, $t 118 pslld \$7, $t 119 psrld \$25, $b 120 pxor $t, $b\n"; 121$code.="palignr \$4, $b, $b 122 palignr \$8, $c, $c 123 palignr \$12, $d, $d\n" if ($dir =~ /left/); 124$code.="palignr \$12, $b, $b 125 palignr \$8, $c, $c 126 palignr \$4, $d, $d\n" if ($dir =~ /right/); 127$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/); 128} 129 130sub poly_add { 131my ($src)=@_; 132$code.="add $src, $acc0 133 adc 8+$src, $acc1 134 adc \$1, $acc2\n"; 135} 136 137sub poly_stage1 { 138$code.="mov 0+$r_store, %rax 139 mov %rax, $t2 140 mul $acc0 141 mov %rax, $t0 142 mov %rdx, $t1 143 mov 0+$r_store, %rax 144 mul $acc1 145 imulq $acc2, $t2 146 add %rax, $t1 147 adc %rdx, $t2\n"; 148} 149 150sub poly_stage2 { 151$code.="mov 8+$r_store, %rax 152 mov %rax, $t3 153 mul $acc0 154 add %rax, $t1 155 adc \$0, %rdx 156 mov %rdx, $acc0 157 mov 8+$r_store, %rax 158 mul $acc1 159 add %rax, $t2 160 adc \$0, %rdx\n"; 161} 162 163sub poly_stage3 { 164$code.="imulq $acc2, $t3 165 add $acc0, $t2 166 adc %rdx, $t3\n"; 167} 168 169sub poly_reduce_stage { 170$code.="mov $t0, $acc0 171 mov $t1, $acc1 172 mov $t2, $acc2 173 and \$3, $acc2 174 mov $t2, $t0 175 and \$-4, $t0 176 mov $t3, $t1 177 shrd \$2, $t3, $t2 178 shr \$2, $t3 179 add $t0, $acc0 180 adc $t1, $acc1 181 adc \$0, $acc2 182 add $t2, $acc0 183 adc $t3, $acc1 184 adc \$0, $acc2\n"; 185} 186 187sub poly_mul { 188 &poly_stage1(); 189 &poly_stage2(); 190 &poly_stage3(); 191 &poly_reduce_stage(); 192} 193 194sub prep_state { 195my ($n)=@_; 196$code.="movdqa .chacha20_consts(%rip), $A0 197 movdqa $state1_store, $B0 198 movdqa $state2_store, $C0\n"; 199$code.="movdqa $A0, $A1 200 movdqa $B0, $B1 201 movdqa $C0, $C1\n" if ($n ge 2); 202$code.="movdqa $A0, $A2 203 movdqa $B0, $B2 204 movdqa $C0, $C2\n" if ($n ge 3); 205$code.="movdqa $A0, $A3 206 movdqa $B0, $B3 207 movdqa $C0, $C3\n" if ($n ge 4); 208$code.="movdqa $ctr0_store, $D0 209 paddd .sse_inc(%rip), $D0 210 movdqa $D0, $ctr0_store\n" if ($n eq 1); 211$code.="movdqa $ctr0_store, $D1 212 paddd .sse_inc(%rip), $D1 213 movdqa $D1, $D0 214 paddd .sse_inc(%rip), $D0 215 movdqa $D0, $ctr0_store 216 movdqa $D1, $ctr1_store\n" if ($n eq 2); 217$code.="movdqa $ctr0_store, $D2 218 paddd .sse_inc(%rip), $D2 219 movdqa $D2, $D1 220 paddd .sse_inc(%rip), $D1 221 movdqa $D1, $D0 222 paddd .sse_inc(%rip), $D0 223 movdqa $D0, $ctr0_store 224 movdqa $D1, $ctr1_store 225 movdqa $D2, $ctr2_store\n" if ($n eq 3); 226$code.="movdqa $ctr0_store, $D3 227 paddd .sse_inc(%rip), $D3 228 movdqa $D3, $D2 229 paddd .sse_inc(%rip), $D2 230 movdqa $D2, $D1 231 paddd .sse_inc(%rip), $D1 232 movdqa $D1, $D0 233 paddd .sse_inc(%rip), $D0 234 movdqa $D0, $ctr0_store 235 movdqa $D1, $ctr1_store 236 movdqa $D2, $ctr2_store 237 movdqa $D3, $ctr3_store\n" if ($n eq 4); 238} 239 240sub finalize_state { 241my ($n)=@_; 242$code.="paddd .chacha20_consts(%rip), $A3 243 paddd $state1_store, $B3 244 paddd $state2_store, $C3 245 paddd $ctr3_store, $D3\n" if ($n eq 4); 246$code.="paddd .chacha20_consts(%rip), $A2 247 paddd $state1_store, $B2 248 paddd $state2_store, $C2 249 paddd $ctr2_store, $D2\n" if ($n ge 3); 250$code.="paddd .chacha20_consts(%rip), $A1 251 paddd $state1_store, $B1 252 paddd $state2_store, $C1 253 paddd $ctr1_store, $D1\n" if ($n ge 2); 254$code.="paddd .chacha20_consts(%rip), $A0 255 paddd $state1_store, $B0 256 paddd $state2_store, $C0 257 paddd $ctr0_store, $D0\n"; 258} 259 260sub xor_stream { 261my ($A, $B, $C, $D, $offset)=@_; 262$code.="movdqu 0*16 + $offset($inp), $A3 263 movdqu 1*16 + $offset($inp), $B3 264 movdqu 2*16 + $offset($inp), $C3 265 movdqu 3*16 + $offset($inp), $D3 266 pxor $A3, $A 267 pxor $B3, $B 268 pxor $C3, $C 269 pxor $D, $D3 270 movdqu $A, 0*16 + $offset($oup) 271 movdqu $B, 1*16 + $offset($oup) 272 movdqu $C, 2*16 + $offset($oup) 273 movdqu $D3, 3*16 + $offset($oup)\n"; 274} 275 276sub xor_stream_using_temp { 277my ($A, $B, $C, $D, $offset, $temp)=@_; 278$code.="movdqa $temp, $tmp_store 279 movdqu 0*16 + $offset($inp), $temp 280 pxor $A, $temp 281 movdqu $temp, 0*16 + $offset($oup) 282 movdqu 1*16 + $offset($inp), $temp 283 pxor $B, $temp 284 movdqu $temp, 1*16 + $offset($oup) 285 movdqu 2*16 + $offset($inp), $temp 286 pxor $C, $temp 287 movdqu $temp, 2*16 + $offset($oup) 288 movdqu 3*16 + $offset($inp), $temp 289 pxor $D, $temp 290 movdqu $temp, 3*16 + $offset($oup)\n"; 291} 292 293sub gen_chacha_round { 294my ($rot1, $rot2, $shift)=@_; 295my $round=""; 296$round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20); 297$round.="movdqa $rot2, $C0 298 paddd $B3, $A3 299 paddd $B2, $A2 300 paddd $B1, $A1 301 paddd $B0, $A0 302 pxor $A3, $D3 303 pxor $A2, $D2 304 pxor $A1, $D1 305 pxor $A0, $D0 306 pshufb $C0, $D3 307 pshufb $C0, $D2 308 pshufb $C0, $D1 309 pshufb $C0, $D0 310 movdqa $tmp_store, $C0 311 paddd $D3, $C3 312 paddd $D2, $C2 313 paddd $D1, $C1 314 paddd $D0, $C0 315 pxor $C3, $B3 316 pxor $C2, $B2 317 pxor $C1, $B1 318 pxor $C0, $B0 319 movdqa $C0, $tmp_store 320 movdqa $B3, $C0 321 psrld \$$rot1, $C0 322 pslld \$32-$rot1, $B3 323 pxor $C0, $B3 324 movdqa $B2, $C0 325 psrld \$$rot1, $C0 326 pslld \$32-$rot1, $B2 327 pxor $C0, $B2 328 movdqa $B1, $C0 329 psrld \$$rot1, $C0 330 pslld \$32-$rot1, $B1 331 pxor $C0, $B1 332 movdqa $B0, $C0 333 psrld \$$rot1, $C0 334 pslld \$32-$rot1, $B0 335 pxor $C0, $B0\n"; 336($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); 337($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); 338$round.="movdqa $tmp_store, $C0 339 palignr \$$s1, $B3, $B3 340 palignr \$$s2, $C3, $C3 341 palignr \$$s3, $D3, $D3 342 palignr \$$s1, $B2, $B2 343 palignr \$$s2, $C2, $C2 344 palignr \$$s3, $D2, $D2 345 palignr \$$s1, $B1, $B1 346 palignr \$$s2, $C1, $C1 347 palignr \$$s3, $D1, $D1 348 palignr \$$s1, $B0, $B0 349 palignr \$$s2, $C0, $C0 350 palignr \$$s3, $D0, $D0\n" 351if (($shift =~ /left/) || ($shift =~ /right/)); 352return $round; 353}; 354 355$chacha_body = &gen_chacha_round(20, ".rol16(%rip)") . 356 &gen_chacha_round(25, ".rol8(%rip)", "left") . 357 &gen_chacha_round(20, ".rol16(%rip)") . 358 &gen_chacha_round(25, ".rol8(%rip)", "right"); 359 360my @loop_body = split /\n/, $chacha_body; 361 362sub emit_body { 363my ($n)=@_; 364 for (my $i=0; $i < $n; $i++) { 365 $code=$code.shift(@loop_body)."\n"; 366 }; 367} 368 369{ 370################################################################################ 371# void poly_hash_ad_internal(); 372$code.=" 373.type poly_hash_ad_internal,\@function,2 374.align 64 375poly_hash_ad_internal: 376.cfi_startproc 377 xor $acc0, $acc0 378 xor $acc1, $acc1 379 xor $acc2, $acc2 380 cmp \$13, $itr2 381 jne hash_ad_loop 382poly_fast_tls_ad: 383 # Special treatment for the TLS case of 13 bytes 384 mov ($adp), $acc0 385 mov 5($adp), $acc1 386 shr \$24, $acc1 387 mov \$1, $acc2\n"; 388 &poly_mul(); $code.=" 389 ret 390hash_ad_loop: 391 # Hash in 16 byte chunk 392 cmp \$16, $itr2 393 jb hash_ad_tail\n"; 394 &poly_add("0($adp)"); 395 &poly_mul(); $code.=" 396 lea 1*16($adp), $adp 397 sub \$16, $itr2 398 jmp hash_ad_loop 399hash_ad_tail: 400 cmp \$0, $itr2 401 je 1f 402 # Hash last < 16 byte tail 403 xor $t0, $t0 404 xor $t1, $t1 405 xor $t2, $t2 406 add $itr2, $adp 407hash_ad_tail_loop: 408 shld \$8, $t0, $t1 409 shl \$8, $t0 410 movzxb -1($adp), $t2 411 xor $t2, $t0 412 dec $adp 413 dec $itr2 414 jne hash_ad_tail_loop 415 416 add $t0, $acc0 417 adc $t1, $acc1 418 adc \$1, $acc2\n"; 419 &poly_mul(); $code.=" 420 # Finished AD 4211: 422 ret 423.cfi_endproc 424.size poly_hash_ad_internal, .-poly_hash_ad_internal\n"; 425} 426 427{ 428################################################################################ 429# void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp); 430$code.=" 431.globl chacha20_poly1305_open 432.type chacha20_poly1305_open,\@function,2 433.align 64 434chacha20_poly1305_open: 435.cfi_startproc 436 push %rbp 437.cfi_adjust_cfa_offset 8 438 push %rbx 439.cfi_adjust_cfa_offset 8 440 push %r12 441.cfi_adjust_cfa_offset 8 442 push %r13 443.cfi_adjust_cfa_offset 8 444 push %r14 445.cfi_adjust_cfa_offset 8 446 push %r15 447.cfi_adjust_cfa_offset 8 448 # We write the calculated authenticator back to keyp at the end, so save 449 # the pointer on the stack too. 450 push $keyp 451.cfi_adjust_cfa_offset 8 452 sub \$288 + 32, %rsp 453.cfi_adjust_cfa_offset 288 + 32 454.cfi_offset rbp, -16 455.cfi_offset rbx, -24 456.cfi_offset r12, -32 457.cfi_offset r13, -40 458.cfi_offset r14, -48 459.cfi_offset r15, -56 460 lea 32(%rsp), %rbp 461 and \$-32, %rbp 462 mov %rdx, 8+$len_store 463 mov %r8, 0+$len_store 464 mov %rdx, $inl\n"; $code.=" 465 mov OPENSSL_ia32cap_P+8(%rip), %eax 466 and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present 467 xor \$`(1<<5) + (1<<8)`, %eax 468 jz chacha20_poly1305_open_avx2\n" if ($avx>1); 469$code.=" 4701: 471 cmp \$128, $inl 472 jbe open_sse_128 473 # For long buffers, prepare the poly key first 474 movdqa .chacha20_consts(%rip), $A0 475 movdqu 0*16($keyp), $B0 476 movdqu 1*16($keyp), $C0 477 movdqu 2*16($keyp), $D0 478 movdqa $D0, $T1 479 # Store on stack, to free keyp 480 movdqa $B0, $state1_store 481 movdqa $C0, $state2_store 482 movdqa $D0, $ctr0_store 483 mov \$10, $acc0 4841: \n"; 485 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 486 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 487 dec $acc0 488 jne 1b 489 # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded 490 paddd .chacha20_consts(%rip), $A0 491 paddd $state1_store, $B0 492 # Clamp and store the key 493 pand .clamp(%rip), $A0 494 movdqa $A0, $r_store 495 movdqa $B0, $s_store 496 # Hash 497 mov %r8, $itr2 498 call poly_hash_ad_internal 499open_sse_main_loop: 500 cmp \$16*16, $inl 501 jb 2f 502 # Load state, increment counter blocks\n"; 503 &prep_state(4); $code.=" 504 # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we 505 # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 506 mov \$4, $itr1 507 mov $inp, $itr2 5081: \n"; 509 &emit_body(20); 510 &poly_add("0($itr2)"); $code.=" 511 lea 2*8($itr2), $itr2\n"; 512 &emit_body(20); 513 &poly_stage1(); 514 &emit_body(20); 515 &poly_stage2(); 516 &emit_body(20); 517 &poly_stage3(); 518 &emit_body(20); 519 &poly_reduce_stage(); 520 foreach $l (@loop_body) {$code.=$l."\n";} 521 @loop_body = split /\n/, $chacha_body; $code.=" 522 dec $itr1 523 jge 1b\n"; 524 &poly_add("0($itr2)"); 525 &poly_mul(); $code.=" 526 lea 2*8($itr2), $itr2 527 cmp \$-6, $itr1 528 jg 1b\n"; 529 &finalize_state(4); 530 &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); 531 &xor_stream($A2, $B2, $C2, $D2, "4*16"); 532 &xor_stream($A1, $B1, $C1, $D1, "8*16"); 533 &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.=" 534 lea 16*16($inp), $inp 535 lea 16*16($oup), $oup 536 sub \$16*16, $inl 537 jmp open_sse_main_loop 5382: 539 # Handle the various tail sizes efficiently 540 test $inl, $inl 541 jz open_sse_finalize 542 cmp \$4*16, $inl 543 ja 3f\n"; 544############################################################################### 545 # At most 64 bytes are left 546 &prep_state(1); $code.=" 547 xor $itr2, $itr2 548 mov $inl, $itr1 549 cmp \$16, $itr1 550 jb 2f 5511: \n"; 552 &poly_add("0($inp, $itr2)"); 553 &poly_mul(); $code.=" 554 sub \$16, $itr1 5552: 556 add \$16, $itr2\n"; 557 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 558 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 559 cmp \$16, $itr1 560 jae 1b 561 cmp \$10*16, $itr2 562 jne 2b\n"; 563 &finalize_state(1); $code.=" 564 jmp open_sse_tail_64_dec_loop 5653: 566 cmp \$8*16, $inl 567 ja 3f\n"; 568############################################################################### 569 # 65 - 128 bytes are left 570 &prep_state(2); $code.=" 571 mov $inl, $itr1 572 and \$-16, $itr1 573 xor $itr2, $itr2 5741: \n"; 575 &poly_add("0($inp, $itr2)"); 576 &poly_mul(); $code.=" 5772: 578 add \$16, $itr2\n"; 579 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 580 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 581 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 582 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.=" 583 cmp $itr1, $itr2 584 jb 1b 585 cmp \$10*16, $itr2 586 jne 2b\n"; 587 &finalize_state(2); 588 &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.=" 589 sub \$4*16, $inl 590 lea 4*16($inp), $inp 591 lea 4*16($oup), $oup 592 jmp open_sse_tail_64_dec_loop 5933: 594 cmp \$12*16, $inl 595 ja 3f\n"; 596############################################################################### 597 # 129 - 192 bytes are left 598 &prep_state(3); $code.=" 599 mov $inl, $itr1 600 mov \$10*16, $itr2 601 cmp \$10*16, $itr1 602 cmovg $itr2, $itr1 603 and \$-16, $itr1 604 xor $itr2, $itr2 6051: \n"; 606 &poly_add("0($inp, $itr2)"); 607 &poly_mul(); $code.=" 6082: 609 add \$16, $itr2\n"; 610 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 611 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 612 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 613 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 614 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 615 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 616 cmp $itr1, $itr2 617 jb 1b 618 cmp \$10*16, $itr2 619 jne 2b 620 cmp \$11*16, $inl 621 jb 1f\n"; 622 &poly_add("10*16($inp)"); 623 &poly_mul(); $code.=" 624 cmp \$12*16, $inl 625 jb 1f\n"; 626 &poly_add("11*16($inp)"); 627 &poly_mul(); $code.=" 6281: \n"; 629 &finalize_state(3); 630 &xor_stream($A2, $B2, $C2, $D2, "0*16"); 631 &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.=" 632 sub \$8*16, $inl 633 lea 8*16($inp), $inp 634 lea 8*16($oup), $oup 635 jmp open_sse_tail_64_dec_loop 6363: 637###############################################################################\n"; 638 # 193 - 255 bytes are left 639 &prep_state(4); $code.=" 640 xor $itr2, $itr2 6411: \n"; 642 &poly_add("0($inp, $itr2)"); 643 &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left"); 644 &chacha_qr($A1,$B1,$C1,$D1,$C3,"left"); 645 &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load"); 646 &poly_stage1(); 647 &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load"); 648 &poly_stage2(); 649 &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right"); 650 &chacha_qr($A1,$B1,$C1,$D1,$C3,"right"); 651 &poly_stage3(); 652 &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load"); 653 &poly_reduce_stage(); 654 &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.=" 655 add \$16, $itr2 656 cmp \$10*16, $itr2 657 jb 1b 658 mov $inl, $itr1 659 and \$-16, $itr1 6601: \n"; 661 &poly_add("0($inp, $itr2)"); 662 &poly_mul(); $code.=" 663 add \$16, $itr2 664 cmp $itr1, $itr2 665 jb 1b\n"; 666 &finalize_state(4); 667 &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0); 668 &xor_stream($A2, $B2, $C2, $D2, "4*16"); 669 &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.=" 670 movdqa $tmp_store, $D0 671 sub \$12*16, $inl 672 lea 12*16($inp), $inp 673 lea 12*16($oup), $oup 674############################################################################### 675 # Decrypt the remaining data, 16B at a time, using existing stream 676open_sse_tail_64_dec_loop: 677 cmp \$16, $inl 678 jb 1f 679 sub \$16, $inl 680 movdqu ($inp), $T0 681 pxor $T0, $A0 682 movdqu $A0, ($oup) 683 lea 16($inp), $inp 684 lea 16($oup), $oup 685 movdqa $B0, $A0 686 movdqa $C0, $B0 687 movdqa $D0, $C0 688 jmp open_sse_tail_64_dec_loop 6891: 690 movdqa $A0, $A1 691 692 # Decrypt up to 16 bytes at the end. 693open_sse_tail_16: 694 test $inl, $inl 695 jz open_sse_finalize 696 697 # Read the final bytes into $T0. They need to be read in reverse order so 698 # that they end up in the correct order in $T0. 699 pxor $T0, $T0 700 lea -1($inp, $inl), $inp 701 movq $inl, $itr2 7022: 703 pslldq \$1, $T0 704 pinsrb \$0, ($inp), $T0 705 sub \$1, $inp 706 sub \$1, $itr2 707 jnz 2b 708 7093: 710 movq $T0, $t0 711 pextrq \$1, $T0, $t1 712 # The final bytes of keystream are in $A1. 713 pxor $A1, $T0 714 715 # Copy the plaintext bytes out. 7162: 717 pextrb \$0, $T0, ($oup) 718 psrldq \$1, $T0 719 add \$1, $oup 720 sub \$1, $inl 721 jne 2b 722 723 add $t0, $acc0 724 adc $t1, $acc1 725 adc \$1, $acc2\n"; 726 &poly_mul(); $code.=" 727 728open_sse_finalize:\n"; 729 &poly_add($len_store); 730 &poly_mul(); $code.=" 731 # Final reduce 732 mov $acc0, $t0 733 mov $acc1, $t1 734 mov $acc2, $t2 735 sub \$-5, $acc0 736 sbb \$-1, $acc1 737 sbb \$3, $acc2 738 cmovc $t0, $acc0 739 cmovc $t1, $acc1 740 cmovc $t2, $acc2 741 # Add in s part of the key 742 add 0+$s_store, $acc0 743 adc 8+$s_store, $acc1 744 745 add \$288 + 32, %rsp 746.cfi_adjust_cfa_offset -(288 + 32) 747 pop $keyp 748.cfi_adjust_cfa_offset -8 749 movq $acc0, ($keyp) 750 movq $acc1, 8($keyp) 751 752 pop %r15 753.cfi_adjust_cfa_offset -8 754 pop %r14 755.cfi_adjust_cfa_offset -8 756 pop %r13 757.cfi_adjust_cfa_offset -8 758 pop %r12 759.cfi_adjust_cfa_offset -8 760 pop %rbx 761.cfi_adjust_cfa_offset -8 762 pop %rbp 763.cfi_adjust_cfa_offset -8 764 ret 765.cfi_adjust_cfa_offset (8 * 6) + 288 + 32 766############################################################################### 767open_sse_128: 768 movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 769 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 770 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 771 movdqu 2*16($keyp), $D0 772 movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1 773 movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2 774 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3 775 mov \$10, $acc0 7761: \n"; 777 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 778 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 779 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 780 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 781 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 782 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 783 dec $acc0 784 jnz 1b 785 paddd .chacha20_consts(%rip), $A0 786 paddd .chacha20_consts(%rip), $A1 787 paddd .chacha20_consts(%rip), $A2 788 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 789 paddd $T2, $C1\npaddd $T2, $C2 790 paddd $T3, $D1 791 paddd .sse_inc(%rip), $T3 792 paddd $T3, $D2 793 # Clamp and store the key 794 pand .clamp(%rip), $A0 795 movdqa $A0, $r_store 796 movdqa $B0, $s_store 797 # Hash 798 mov %r8, $itr2 799 call poly_hash_ad_internal 8001: 801 cmp \$16, $inl 802 jb open_sse_tail_16 803 sub \$16, $inl\n"; 804 # Load for hashing 805 &poly_add("0*8($inp)"); $code.=" 806 # Load for decryption 807 movdqu 0*16($inp), $T0 808 pxor $T0, $A1 809 movdqu $A1, 0*16($oup) 810 lea 1*16($inp), $inp 811 lea 1*16($oup), $oup\n"; 812 &poly_mul(); $code.=" 813 # Shift the stream left 814 movdqa $B1, $A1 815 movdqa $C1, $B1 816 movdqa $D1, $C1 817 movdqa $A2, $D1 818 movdqa $B2, $A2 819 movdqa $C2, $B2 820 movdqa $D2, $C2 821 jmp 1b 822 jmp open_sse_tail_16 823.size chacha20_poly1305_open, .-chacha20_poly1305_open 824.cfi_endproc 825 826################################################################################ 827################################################################################ 828# void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp); 829.globl chacha20_poly1305_seal 830.type chacha20_poly1305_seal,\@function,2 831.align 64 832chacha20_poly1305_seal: 833.cfi_startproc 834 push %rbp 835.cfi_adjust_cfa_offset 8 836 push %rbx 837.cfi_adjust_cfa_offset 8 838 push %r12 839.cfi_adjust_cfa_offset 8 840 push %r13 841.cfi_adjust_cfa_offset 8 842 push %r14 843.cfi_adjust_cfa_offset 8 844 push %r15 845.cfi_adjust_cfa_offset 8 846 # We write the calculated authenticator back to keyp at the end, so save 847 # the pointer on the stack too. 848 push $keyp 849.cfi_adjust_cfa_offset 8 850 sub \$288 + 32, %rsp 851.cfi_adjust_cfa_offset 288 + 32 852.cfi_offset rbp, -16 853.cfi_offset rbx, -24 854.cfi_offset r12, -32 855.cfi_offset r13, -40 856.cfi_offset r14, -48 857.cfi_offset r15, -56 858 lea 32(%rsp), %rbp 859 and \$-32, %rbp 860 mov 56($keyp), $inl # extra_in_len 861 addq %rdx, $inl 862 mov $inl, 8+$len_store 863 mov %r8, 0+$len_store 864 mov %rdx, $inl\n"; $code.=" 865 mov OPENSSL_ia32cap_P+8(%rip), %eax 866 and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present 867 xor \$`(1<<5) + (1<<8)`, %eax 868 jz chacha20_poly1305_seal_avx2\n" if ($avx>1); 869$code.=" 870 cmp \$128, $inl 871 jbe seal_sse_128 872 # For longer buffers, prepare the poly key + some stream 873 movdqa .chacha20_consts(%rip), $A0 874 movdqu 0*16($keyp), $B0 875 movdqu 1*16($keyp), $C0 876 movdqu 2*16($keyp), $D0 877 movdqa $A0, $A1 878 movdqa $A0, $A2 879 movdqa $A0, $A3 880 movdqa $B0, $B1 881 movdqa $B0, $B2 882 movdqa $B0, $B3 883 movdqa $C0, $C1 884 movdqa $C0, $C2 885 movdqa $C0, $C3 886 movdqa $D0, $D3 887 paddd .sse_inc(%rip), $D0 888 movdqa $D0, $D2 889 paddd .sse_inc(%rip), $D0 890 movdqa $D0, $D1 891 paddd .sse_inc(%rip), $D0 892 # Store on stack 893 movdqa $B0, $state1_store 894 movdqa $C0, $state2_store 895 movdqa $D0, $ctr0_store 896 movdqa $D1, $ctr1_store 897 movdqa $D2, $ctr2_store 898 movdqa $D3, $ctr3_store 899 mov \$10, $acc0 9001: \n"; 901 foreach $l (@loop_body) {$code.=$l."\n";} 902 @loop_body = split /\n/, $chacha_body; $code.=" 903 dec $acc0 904 jnz 1b\n"; 905 &finalize_state(4); $code.=" 906 # Clamp and store the key 907 pand .clamp(%rip), $A3 908 movdqa $A3, $r_store 909 movdqa $B3, $s_store 910 # Hash 911 mov %r8, $itr2 912 call poly_hash_ad_internal\n"; 913 &xor_stream($A2,$B2,$C2,$D2,"0*16"); 914 &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.=" 915 cmp \$12*16, $inl 916 ja 1f 917 mov \$8*16, $itr1 918 sub \$8*16, $inl 919 lea 8*16($inp), $inp 920 jmp seal_sse_128_seal_hash 9211: \n"; 922 &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.=" 923 mov \$12*16, $itr1 924 sub \$12*16, $inl 925 lea 12*16($inp), $inp 926 mov \$2, $itr1 927 mov \$8, $itr2 928 cmp \$4*16, $inl 929 jbe seal_sse_tail_64 930 cmp \$8*16, $inl 931 jbe seal_sse_tail_128 932 cmp \$12*16, $inl 933 jbe seal_sse_tail_192 934 9351: \n"; 936 # The main loop 937 &prep_state(4); $code.=" 9382: \n"; 939 &emit_body(20); 940 &poly_add("0($oup)"); 941 &emit_body(20); 942 &poly_stage1(); 943 &emit_body(20); 944 &poly_stage2(); 945 &emit_body(20); 946 &poly_stage3(); 947 &emit_body(20); 948 &poly_reduce_stage(); 949 foreach $l (@loop_body) {$code.=$l."\n";} 950 @loop_body = split /\n/, $chacha_body; $code.=" 951 lea 16($oup), $oup 952 dec $itr2 953 jge 2b\n"; 954 &poly_add("0*8($oup)"); 955 &poly_mul(); $code.=" 956 lea 16($oup), $oup 957 dec $itr1 958 jg 2b\n"; 959 960 &finalize_state(4);$code.=" 961 movdqa $D2, $tmp_store\n"; 962 &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.=" 963 movdqa $tmp_store, $D2\n"; 964 &xor_stream($A2,$B2,$C2,$D2, 4*16); 965 &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.=" 966 cmp \$16*16, $inl 967 ja 3f 968 969 mov \$12*16, $itr1 970 sub \$12*16, $inl 971 lea 12*16($inp), $inp 972 jmp seal_sse_128_seal_hash 9733: \n"; 974 &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.=" 975 lea 16*16($inp), $inp 976 sub \$16*16, $inl 977 mov \$6, $itr1 978 mov \$4, $itr2 979 cmp \$12*16, $inl 980 jg 1b 981 mov $inl, $itr1 982 test $inl, $inl 983 je seal_sse_128_seal_hash 984 mov \$6, $itr1 985 cmp \$4*16, $inl 986 jg 3f 987############################################################################### 988seal_sse_tail_64:\n"; 989 &prep_state(1); $code.=" 9901: \n"; 991 &poly_add("0($oup)"); 992 &poly_mul(); $code.=" 993 lea 16($oup), $oup 9942: \n"; 995 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 996 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 997 &poly_add("0($oup)"); 998 &poly_mul(); $code.=" 999 lea 16($oup), $oup 1000 dec $itr1 1001 jg 1b 1002 dec $itr2 1003 jge 2b\n"; 1004 &finalize_state(1); $code.=" 1005 jmp seal_sse_128_seal 10063: 1007 cmp \$8*16, $inl 1008 jg 3f 1009############################################################################### 1010seal_sse_tail_128:\n"; 1011 &prep_state(2); $code.=" 10121: \n"; 1013 &poly_add("0($oup)"); 1014 &poly_mul(); $code.=" 1015 lea 16($oup), $oup 10162: \n"; 1017 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 1018 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 1019 &poly_add("0($oup)"); 1020 &poly_mul(); 1021 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 1022 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.=" 1023 lea 16($oup), $oup 1024 dec $itr1 1025 jg 1b 1026 dec $itr2 1027 jge 2b\n"; 1028 &finalize_state(2); 1029 &xor_stream($A1,$B1,$C1,$D1,0*16); $code.=" 1030 mov \$4*16, $itr1 1031 sub \$4*16, $inl 1032 lea 4*16($inp), $inp 1033 jmp seal_sse_128_seal_hash 10343: 1035############################################################################### 1036seal_sse_tail_192:\n"; 1037 &prep_state(3); $code.=" 10381: \n"; 1039 &poly_add("0($oup)"); 1040 &poly_mul(); $code.=" 1041 lea 16($oup), $oup 10422: \n"; 1043 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 1044 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 1045 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 1046 &poly_add("0($oup)"); 1047 &poly_mul(); 1048 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 1049 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 1050 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 1051 lea 16($oup), $oup 1052 dec $itr1 1053 jg 1b 1054 dec $itr2 1055 jge 2b\n"; 1056 &finalize_state(3); 1057 &xor_stream($A2,$B2,$C2,$D2,0*16); 1058 &xor_stream($A1,$B1,$C1,$D1,4*16); $code.=" 1059 mov \$8*16, $itr1 1060 sub \$8*16, $inl 1061 lea 8*16($inp), $inp 1062############################################################################### 1063seal_sse_128_seal_hash: 1064 cmp \$16, $itr1 1065 jb seal_sse_128_seal\n"; 1066 &poly_add("0($oup)"); 1067 &poly_mul(); $code.=" 1068 sub \$16, $itr1 1069 lea 16($oup), $oup 1070 jmp seal_sse_128_seal_hash 1071 1072seal_sse_128_seal: 1073 cmp \$16, $inl 1074 jb seal_sse_tail_16 1075 sub \$16, $inl 1076 # Load for decryption 1077 movdqu 0*16($inp), $T0 1078 pxor $T0, $A0 1079 movdqu $A0, 0*16($oup) 1080 # Then hash 1081 add 0*8($oup), $acc0 1082 adc 1*8($oup), $acc1 1083 adc \$1, $acc2 1084 lea 1*16($inp), $inp 1085 lea 1*16($oup), $oup\n"; 1086 &poly_mul(); $code.=" 1087 # Shift the stream left 1088 movdqa $B0, $A0 1089 movdqa $C0, $B0 1090 movdqa $D0, $C0 1091 movdqa $A1, $D0 1092 movdqa $B1, $A1 1093 movdqa $C1, $B1 1094 movdqa $D1, $C1 1095 jmp seal_sse_128_seal 1096 1097seal_sse_tail_16: 1098 test $inl, $inl 1099 jz process_blocks_of_extra_in 1100 # We can only load the PT one byte at a time to avoid buffer overread 1101 mov $inl, $itr2 1102 mov $inl, $itr1 1103 lea -1($inp, $inl), $inp 1104 pxor $T3, $T3 11051: 1106 pslldq \$1, $T3 1107 pinsrb \$0, ($inp), $T3 1108 lea -1($inp), $inp 1109 dec $itr1 1110 jne 1b 1111 1112 # XOR the keystream with the plaintext. 1113 pxor $A0, $T3 1114 1115 # Write ciphertext out, byte-by-byte. 1116 movq $inl, $itr1 1117 movdqu $T3, $A0 11182: 1119 pextrb \$0, $A0, ($oup) 1120 psrldq \$1, $A0 1121 add \$1, $oup 1122 sub \$1, $itr1 1123 jnz 2b 1124 1125 # $T3 contains the final (partial, non-empty) block of ciphertext which 1126 # needs to be fed into the Poly1305 state. The right-most $inl bytes of it 1127 # are valid. We need to fill it with extra_in bytes until full, or until we 1128 # run out of bytes. 1129 # 1130 # $keyp points to the tag output, which is actually a struct with the 1131 # extra_in pointer and length at offset 48. 1132 movq 288+32(%rsp), $keyp 1133 movq 56($keyp), $t1 # extra_in_len 1134 movq 48($keyp), $t0 # extra_in 1135 test $t1, $t1 1136 jz process_partial_block # Common case: no bytes of extra_in 1137 1138 movq \$16, $t2 1139 subq $inl, $t2 # 16-$inl is the number of bytes that fit into $T3. 1140 cmpq $t2, $t1 # if extra_in_len < 16-$inl, only copy extra_in_len 1141 # (note that AT&T syntax reverses the arguments) 1142 jge load_extra_in 1143 movq $t1, $t2 1144 1145load_extra_in: 1146 # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load 1147 # into $T3. They are loaded in reverse order. 1148 leaq -1($t0, $t2), $inp 1149 # Update extra_in and extra_in_len to reflect the bytes that are about to 1150 # be read. 1151 addq $t2, $t0 1152 subq $t2, $t1 1153 movq $t0, 48($keyp) 1154 movq $t1, 56($keyp) 1155 1156 # Update $itr2, which is used to select the mask later on, to reflect the 1157 # extra bytes about to be added. 1158 addq $t2, $itr2 1159 1160 # Load $t2 bytes of extra_in into $T2. 1161 pxor $T2, $T2 11623: 1163 pslldq \$1, $T2 1164 pinsrb \$0, ($inp), $T2 1165 lea -1($inp), $inp 1166 sub \$1, $t2 1167 jnz 3b 1168 1169 # Shift $T2 up the length of the remainder from the main encryption. Sadly, 1170 # the shift for an XMM register has to be a constant, thus we loop to do 1171 # this. 1172 movq $inl, $t2 1173 11744: 1175 pslldq \$1, $T2 1176 sub \$1, $t2 1177 jnz 4b 1178 1179 # Mask $T3 (the remainder from the main encryption) so that superfluous 1180 # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are 1181 # disjoint and so we can merge them with an OR. 1182 lea .and_masks(%rip), $t2 1183 shl \$4, $inl 1184 pand -16($t2, $inl), $T3 1185 1186 # Merge $T2 into $T3, forming the remainder block. 1187 por $T2, $T3 1188 1189 # The block of ciphertext + extra_in is ready to be included in the 1190 # Poly1305 state. 1191 movq $T3, $t0 1192 pextrq \$1, $T3, $t1 1193 add $t0, $acc0 1194 adc $t1, $acc1 1195 adc \$1, $acc2\n"; 1196 &poly_mul(); $code.=" 1197 1198process_blocks_of_extra_in: 1199 # There may be additional bytes of extra_in to process. 1200 movq 288+32(%rsp), $keyp 1201 movq 48($keyp), $inp # extra_in 1202 movq 56($keyp), $itr2 # extra_in_len 1203 movq $itr2, $itr1 1204 shr \$4, $itr2 # number of blocks 1205 12065: 1207 jz process_extra_in_trailer\n"; 1208 &poly_add("0($inp)"); 1209 &poly_mul(); $code.=" 1210 leaq 16($inp), $inp 1211 subq \$1, $itr2 1212 jmp 5b 1213 1214process_extra_in_trailer: 1215 andq \$15, $itr1 # remaining num bytes (<16) of extra_in 1216 movq $itr1, $inl 1217 jz do_length_block 1218 leaq -1($inp, $itr1), $inp 1219 12206: 1221 pslldq \$1, $T3 1222 pinsrb \$0, ($inp), $T3 1223 lea -1($inp), $inp 1224 sub \$1, $itr1 1225 jnz 6b 1226 1227process_partial_block: 1228 # $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0 1229 lea .and_masks(%rip), $t2 1230 shl \$4, $inl 1231 pand -16($t2, $inl), $T3 1232 movq $T3, $t0 1233 pextrq \$1, $T3, $t1 1234 add $t0, $acc0 1235 adc $t1, $acc1 1236 adc \$1, $acc2\n"; 1237 &poly_mul(); $code.=" 1238 1239do_length_block:\n"; 1240 &poly_add($len_store); 1241 &poly_mul(); $code.=" 1242 # Final reduce 1243 mov $acc0, $t0 1244 mov $acc1, $t1 1245 mov $acc2, $t2 1246 sub \$-5, $acc0 1247 sbb \$-1, $acc1 1248 sbb \$3, $acc2 1249 cmovc $t0, $acc0 1250 cmovc $t1, $acc1 1251 cmovc $t2, $acc2 1252 # Add in s part of the key 1253 add 0+$s_store, $acc0 1254 adc 8+$s_store, $acc1 1255 1256 add \$288 + 32, %rsp 1257.cfi_adjust_cfa_offset -(288 + 32) 1258 pop $keyp 1259.cfi_adjust_cfa_offset -8 1260 mov $acc0, 0*8($keyp) 1261 mov $acc1, 1*8($keyp) 1262 1263 pop %r15 1264.cfi_adjust_cfa_offset -8 1265 pop %r14 1266.cfi_adjust_cfa_offset -8 1267 pop %r13 1268.cfi_adjust_cfa_offset -8 1269 pop %r12 1270.cfi_adjust_cfa_offset -8 1271 pop %rbx 1272.cfi_adjust_cfa_offset -8 1273 pop %rbp 1274.cfi_adjust_cfa_offset -8 1275 ret 1276.cfi_adjust_cfa_offset (8 * 6) + 288 + 32 1277################################################################################ 1278seal_sse_128: 1279 movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2 1280 movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2 1281 movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2 1282 movdqu 2*16($keyp), $D2 1283 movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0 1284 movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1 1285 movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3 1286 mov \$10, $acc0 12871:\n"; 1288 &chacha_qr($A0,$B0,$C0,$D0,$T0,"left"); 1289 &chacha_qr($A1,$B1,$C1,$D1,$T0,"left"); 1290 &chacha_qr($A2,$B2,$C2,$D2,$T0,"left"); 1291 &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); 1292 &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); 1293 &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 1294 dec $acc0 1295 jnz 1b 1296 paddd .chacha20_consts(%rip), $A0 1297 paddd .chacha20_consts(%rip), $A1 1298 paddd .chacha20_consts(%rip), $A2 1299 paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2 1300 paddd $T2, $C0\npaddd $T2, $C1 1301 paddd $T3, $D0 1302 paddd .sse_inc(%rip), $T3 1303 paddd $T3, $D1 1304 # Clamp and store the key 1305 pand .clamp(%rip), $A2 1306 movdqa $A2, $r_store 1307 movdqa $B2, $s_store 1308 # Hash 1309 mov %r8, $itr2 1310 call poly_hash_ad_internal 1311 jmp seal_sse_128_seal 1312.size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n"; 1313} 1314 1315# There should have been a cfi_endproc at the end of that function, but the two 1316# following blocks of code are jumped to without a stack frame and the CFI 1317# context which they are used in happens to match the CFI context at the end of 1318# the previous function. So the CFI table is just extended to the end of them. 1319 1320if ($avx>1) { 1321 1322($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15)); 1323my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15)); 1324($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3); 1325$state1_store="2*32(%rbp)"; 1326$state2_store="3*32(%rbp)"; 1327$tmp_store="4*32(%rbp)"; 1328$ctr0_store="5*32(%rbp)"; 1329$ctr1_store="6*32(%rbp)"; 1330$ctr2_store="7*32(%rbp)"; 1331$ctr3_store="8*32(%rbp)"; 1332 1333sub chacha_qr_avx2 { 1334my ($a,$b,$c,$d,$t,$dir)=@_; 1335$code.=<<___ if ($dir =~ /store/); 1336 vmovdqa $t, $tmp_store 1337___ 1338$code.=<<___; 1339 vpaddd $b, $a, $a 1340 vpxor $a, $d, $d 1341 vpshufb .rol16(%rip), $d, $d 1342 vpaddd $d, $c, $c 1343 vpxor $c, $b, $b 1344 vpsrld \$20, $b, $t 1345 vpslld \$12, $b, $b 1346 vpxor $t, $b, $b 1347 vpaddd $b, $a, $a 1348 vpxor $a, $d, $d 1349 vpshufb .rol8(%rip), $d, $d 1350 vpaddd $d, $c, $c 1351 vpxor $c, $b, $b 1352 vpslld \$7, $b, $t 1353 vpsrld \$25, $b, $b 1354 vpxor $t, $b, $b 1355___ 1356$code.=<<___ if ($dir =~ /left/); 1357 vpalignr \$12, $d, $d, $d 1358 vpalignr \$8, $c, $c, $c 1359 vpalignr \$4, $b, $b, $b 1360___ 1361$code.=<<___ if ($dir =~ /right/); 1362 vpalignr \$4, $d, $d, $d 1363 vpalignr \$8, $c, $c, $c 1364 vpalignr \$12, $b, $b, $b 1365___ 1366$code.=<<___ if ($dir =~ /load/); 1367 vmovdqa $tmp_store, $t 1368___ 1369} 1370 1371sub prep_state_avx2 { 1372my ($n)=@_; 1373$code.=<<___; 1374 vmovdqa .chacha20_consts(%rip), $A0 1375 vmovdqa $state1_store, $B0 1376 vmovdqa $state2_store, $C0 1377___ 1378$code.=<<___ if ($n ge 2); 1379 vmovdqa $A0, $A1 1380 vmovdqa $B0, $B1 1381 vmovdqa $C0, $C1 1382___ 1383$code.=<<___ if ($n ge 3); 1384 vmovdqa $A0, $A2 1385 vmovdqa $B0, $B2 1386 vmovdqa $C0, $C2 1387___ 1388$code.=<<___ if ($n ge 4); 1389 vmovdqa $A0, $A3 1390 vmovdqa $B0, $B3 1391 vmovdqa $C0, $C3 1392___ 1393$code.=<<___ if ($n eq 1); 1394 vmovdqa .avx2_inc(%rip), $D0 1395 vpaddd $ctr0_store, $D0, $D0 1396 vmovdqa $D0, $ctr0_store 1397___ 1398$code.=<<___ if ($n eq 2); 1399 vmovdqa .avx2_inc(%rip), $D0 1400 vpaddd $ctr0_store, $D0, $D1 1401 vpaddd $D1, $D0, $D0 1402 vmovdqa $D0, $ctr0_store 1403 vmovdqa $D1, $ctr1_store 1404___ 1405$code.=<<___ if ($n eq 3); 1406 vmovdqa .avx2_inc(%rip), $D0 1407 vpaddd $ctr0_store, $D0, $D2 1408 vpaddd $D2, $D0, $D1 1409 vpaddd $D1, $D0, $D0 1410 vmovdqa $D0, $ctr0_store 1411 vmovdqa $D1, $ctr1_store 1412 vmovdqa $D2, $ctr2_store 1413___ 1414$code.=<<___ if ($n eq 4); 1415 vmovdqa .avx2_inc(%rip), $D0 1416 vpaddd $ctr0_store, $D0, $D3 1417 vpaddd $D3, $D0, $D2 1418 vpaddd $D2, $D0, $D1 1419 vpaddd $D1, $D0, $D0 1420 vmovdqa $D3, $ctr3_store 1421 vmovdqa $D2, $ctr2_store 1422 vmovdqa $D1, $ctr1_store 1423 vmovdqa $D0, $ctr0_store 1424___ 1425} 1426 1427sub finalize_state_avx2 { 1428my ($n)=@_; 1429$code.=<<___ if ($n eq 4); 1430 vpaddd .chacha20_consts(%rip), $A3, $A3 1431 vpaddd $state1_store, $B3, $B3 1432 vpaddd $state2_store, $C3, $C3 1433 vpaddd $ctr3_store, $D3, $D3 1434___ 1435$code.=<<___ if ($n ge 3); 1436 vpaddd .chacha20_consts(%rip), $A2, $A2 1437 vpaddd $state1_store, $B2, $B2 1438 vpaddd $state2_store, $C2, $C2 1439 vpaddd $ctr2_store, $D2, $D2 1440___ 1441$code.=<<___ if ($n ge 2); 1442 vpaddd .chacha20_consts(%rip), $A1, $A1 1443 vpaddd $state1_store, $B1, $B1 1444 vpaddd $state2_store, $C1, $C1 1445 vpaddd $ctr1_store, $D1, $D1 1446___ 1447$code.=<<___; 1448 vpaddd .chacha20_consts(%rip), $A0, $A0 1449 vpaddd $state1_store, $B0, $B0 1450 vpaddd $state2_store, $C0, $C0 1451 vpaddd $ctr0_store, $D0, $D0 1452___ 1453} 1454 1455sub xor_stream_avx2 { 1456my ($A, $B, $C, $D, $offset, $hlp)=@_; 1457$code.=<<___; 1458 vperm2i128 \$0x02, $A, $B, $hlp 1459 vperm2i128 \$0x13, $A, $B, $B 1460 vperm2i128 \$0x02, $C, $D, $A 1461 vperm2i128 \$0x13, $C, $D, $C 1462 vpxor 0*32+$offset($inp), $hlp, $hlp 1463 vpxor 1*32+$offset($inp), $A, $A 1464 vpxor 2*32+$offset($inp), $B, $B 1465 vpxor 3*32+$offset($inp), $C, $C 1466 vmovdqu $hlp, 0*32+$offset($oup) 1467 vmovdqu $A, 1*32+$offset($oup) 1468 vmovdqu $B, 2*32+$offset($oup) 1469 vmovdqu $C, 3*32+$offset($oup) 1470___ 1471} 1472 1473sub finish_stream_avx2 { 1474my ($A, $B, $C, $D, $hlp)=@_; 1475$code.=<<___; 1476 vperm2i128 \$0x13, $A, $B, $hlp 1477 vperm2i128 \$0x02, $A, $B, $A 1478 vperm2i128 \$0x02, $C, $D, $B 1479 vperm2i128 \$0x13, $C, $D, $D 1480 vmovdqa $hlp, $C 1481___ 1482} 1483 1484sub poly_stage1_mulx { 1485$code.=<<___; 1486 mov 0+$r_store, %rdx 1487 mov %rdx, $t2 1488 mulx $acc0, $t0, $t1 1489 mulx $acc1, %rax, %rdx 1490 imulq $acc2, $t2 1491 add %rax, $t1 1492 adc %rdx, $t2 1493___ 1494} 1495 1496sub poly_stage2_mulx { 1497$code.=<<___; 1498 mov 8+$r_store, %rdx 1499 mulx $acc0, $acc0, %rax 1500 add $acc0, $t1 1501 mulx $acc1, $acc1, $t3 1502 adc $acc1, $t2 1503 adc \$0, $t3 1504 imulq $acc2, %rdx 1505___ 1506} 1507 1508sub poly_stage3_mulx { 1509$code.=<<___; 1510 add %rax, $t2 1511 adc %rdx, $t3 1512___ 1513} 1514 1515sub poly_mul_mulx { 1516 &poly_stage1_mulx(); 1517 &poly_stage2_mulx(); 1518 &poly_stage3_mulx(); 1519 &poly_reduce_stage(); 1520} 1521 1522sub gen_chacha_round_avx2 { 1523my ($rot1, $rot2, $shift)=@_; 1524my $round=""; 1525$round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20); 1526$round=$round ."vmovdqa $rot2, $C0 1527 vpaddd $B3, $A3, $A3 1528 vpaddd $B2, $A2, $A2 1529 vpaddd $B1, $A1, $A1 1530 vpaddd $B0, $A0, $A0 1531 vpxor $A3, $D3, $D3 1532 vpxor $A2, $D2, $D2 1533 vpxor $A1, $D1, $D1 1534 vpxor $A0, $D0, $D0 1535 vpshufb $C0, $D3, $D3 1536 vpshufb $C0, $D2, $D2 1537 vpshufb $C0, $D1, $D1 1538 vpshufb $C0, $D0, $D0 1539 vmovdqa $tmp_store, $C0 1540 vpaddd $D3, $C3, $C3 1541 vpaddd $D2, $C2, $C2 1542 vpaddd $D1, $C1, $C1 1543 vpaddd $D0, $C0, $C0 1544 vpxor $C3, $B3, $B3 1545 vpxor $C2, $B2, $B2 1546 vpxor $C1, $B1, $B1 1547 vpxor $C0, $B0, $B0 1548 vmovdqa $C0, $tmp_store 1549 vpsrld \$$rot1, $B3, $C0 1550 vpslld \$32-$rot1, $B3, $B3 1551 vpxor $C0, $B3, $B3 1552 vpsrld \$$rot1, $B2, $C0 1553 vpslld \$32-$rot1, $B2, $B2 1554 vpxor $C0, $B2, $B2 1555 vpsrld \$$rot1, $B1, $C0 1556 vpslld \$32-$rot1, $B1, $B1 1557 vpxor $C0, $B1, $B1 1558 vpsrld \$$rot1, $B0, $C0 1559 vpslld \$32-$rot1, $B0, $B0 1560 vpxor $C0, $B0, $B0\n"; 1561($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/); 1562($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/); 1563$round=$round ."vmovdqa $tmp_store, $C0 1564 vpalignr \$$s1, $B3, $B3, $B3 1565 vpalignr \$$s2, $C3, $C3, $C3 1566 vpalignr \$$s3, $D3, $D3, $D3 1567 vpalignr \$$s1, $B2, $B2, $B2 1568 vpalignr \$$s2, $C2, $C2, $C2 1569 vpalignr \$$s3, $D2, $D2, $D2 1570 vpalignr \$$s1, $B1, $B1, $B1 1571 vpalignr \$$s2, $C1, $C1, $C1 1572 vpalignr \$$s3, $D1, $D1, $D1 1573 vpalignr \$$s1, $B0, $B0, $B0 1574 vpalignr \$$s2, $C0, $C0, $C0 1575 vpalignr \$$s3, $D0, $D0, $D0\n" 1576if (($shift =~ /left/) || ($shift =~ /right/)); 1577return $round; 1578}; 1579 1580$chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") . 1581 &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") . 1582 &gen_chacha_round_avx2(20, ".rol16(%rip)") . 1583 &gen_chacha_round_avx2(25, ".rol8(%rip)", "right"); 1584 1585@loop_body = split /\n/, $chacha_body; 1586 1587$code.=" 1588############################################################################### 1589.type chacha20_poly1305_open_avx2,\@function,2 1590.align 64 1591chacha20_poly1305_open_avx2: 1592 vzeroupper 1593 vmovdqa .chacha20_consts(%rip), $A0 1594 vbroadcasti128 0*16($keyp), $B0 1595 vbroadcasti128 1*16($keyp), $C0 1596 vbroadcasti128 2*16($keyp), $D0 1597 vpaddd .avx2_init(%rip), $D0, $D0 1598 cmp \$6*32, $inl 1599 jbe open_avx2_192 1600 cmp \$10*32, $inl 1601 jbe open_avx2_320 1602 1603 vmovdqa $B0, $state1_store 1604 vmovdqa $C0, $state2_store 1605 vmovdqa $D0, $ctr0_store 1606 mov \$10, $acc0 16071: \n"; 1608 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1609 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 1610 dec $acc0 1611 jne 1b 1612 vpaddd .chacha20_consts(%rip), $A0, $A0 1613 vpaddd $state1_store, $B0, $B0 1614 vpaddd $state2_store, $C0, $C0 1615 vpaddd $ctr0_store, $D0, $D0 1616 1617 vperm2i128 \$0x02, $A0, $B0, $T0 1618 # Clamp and store key 1619 vpand .clamp(%rip), $T0, $T0 1620 vmovdqa $T0, $r_store 1621 # Stream for the first 64 bytes 1622 vperm2i128 \$0x13, $A0, $B0, $A0 1623 vperm2i128 \$0x13, $C0, $D0, $B0 1624 # Hash AD + first 64 bytes 1625 mov %r8, $itr2 1626 call poly_hash_ad_internal 1627 xor $itr1, $itr1 1628 # Hash first 64 bytes 16291: \n"; 1630 &poly_add("0($inp, $itr1)"); 1631 &poly_mul(); $code.=" 1632 add \$16, $itr1 1633 cmp \$2*32, $itr1 1634 jne 1b 1635 # Decrypt first 64 bytes 1636 vpxor 0*32($inp), $A0, $A0 1637 vpxor 1*32($inp), $B0, $B0 1638 vmovdqu $A0, 0*32($oup) 1639 vmovdqu $B0, 1*32($oup) 1640 lea 2*32($inp), $inp 1641 lea 2*32($oup), $oup 1642 sub \$2*32, $inl 16431: 1644 # Hash and decrypt 512 bytes each iteration 1645 cmp \$16*32, $inl 1646 jb 3f\n"; 1647 &prep_state_avx2(4); $code.=" 1648 xor $itr1, $itr1 16492: \n"; 1650 &poly_add("0*8($inp, $itr1)"); 1651 &emit_body(10); 1652 &poly_stage1_mulx(); 1653 &emit_body(9); 1654 &poly_stage2_mulx(); 1655 &emit_body(12); 1656 &poly_stage3_mulx(); 1657 &emit_body(10); 1658 &poly_reduce_stage(); 1659 &emit_body(9); 1660 &poly_add("2*8($inp, $itr1)"); 1661 &emit_body(8); 1662 &poly_stage1_mulx(); 1663 &emit_body(18); 1664 &poly_stage2_mulx(); 1665 &emit_body(18); 1666 &poly_stage3_mulx(); 1667 &emit_body(9); 1668 &poly_reduce_stage(); 1669 &emit_body(8); 1670 &poly_add("4*8($inp, $itr1)"); $code.=" 1671 lea 6*8($itr1), $itr1\n"; 1672 &emit_body(18); 1673 &poly_stage1_mulx(); 1674 &emit_body(8); 1675 &poly_stage2_mulx(); 1676 &emit_body(8); 1677 &poly_stage3_mulx(); 1678 &emit_body(18); 1679 &poly_reduce_stage(); 1680 foreach $l (@loop_body) {$code.=$l."\n";} 1681 @loop_body = split /\n/, $chacha_body; $code.=" 1682 cmp \$10*6*8, $itr1 1683 jne 2b\n"; 1684 &finalize_state_avx2(4); $code.=" 1685 vmovdqa $A0, $tmp_store\n"; 1686 &poly_add("10*6*8($inp)"); 1687 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" 1688 vmovdqa $tmp_store, $A0\n"; 1689 &poly_mul(); 1690 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); 1691 &poly_add("10*6*8+2*8($inp)"); 1692 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); 1693 &poly_mul(); 1694 &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" 1695 lea 16*32($inp), $inp 1696 lea 16*32($oup), $oup 1697 sub \$16*32, $inl 1698 jmp 1b 16993: 1700 test $inl, $inl 1701 vzeroupper 1702 je open_sse_finalize 17033: 1704 cmp \$4*32, $inl 1705 ja 3f\n"; 1706############################################################################### 1707 # 1-128 bytes left 1708 &prep_state_avx2(1); $code.=" 1709 xor $itr2, $itr2 1710 mov $inl, $itr1 1711 and \$-16, $itr1 1712 test $itr1, $itr1 1713 je 2f 17141: \n"; 1715 &poly_add("0*8($inp, $itr2)"); 1716 &poly_mul(); $code.=" 17172: 1718 add \$16, $itr2\n"; 1719 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1720 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 1721 cmp $itr1, $itr2 1722 jb 1b 1723 cmp \$160, $itr2 1724 jne 2b\n"; 1725 &finalize_state_avx2(1); 1726 &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" 1727 jmp open_avx2_tail_loop 17283: 1729 cmp \$8*32, $inl 1730 ja 3f\n"; 1731############################################################################### 1732 # 129-256 bytes left 1733 &prep_state_avx2(2); $code.=" 1734 mov $inl, $tmp_store 1735 mov $inl, $itr1 1736 sub \$4*32, $itr1 1737 shr \$4, $itr1 1738 mov \$10, $itr2 1739 cmp \$10, $itr1 1740 cmovg $itr2, $itr1 1741 mov $inp, $inl 1742 xor $itr2, $itr2 17431: \n"; 1744 &poly_add("0*8($inl)"); 1745 &poly_mul_mulx(); $code.=" 1746 lea 16($inl), $inl 17472: \n"; 1748 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1749 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.=" 1750 inc $itr2\n"; 1751 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 1752 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 1753 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 1754 cmp $itr1, $itr2 1755 jb 1b 1756 cmp \$10, $itr2 1757 jne 2b 1758 mov $inl, $itr2 1759 sub $inp, $inl 1760 mov $inl, $itr1 1761 mov $tmp_store, $inl 17621: 1763 add \$16, $itr1 1764 cmp $inl, $itr1 1765 jg 1f\n"; 1766 &poly_add("0*8($itr2)"); 1767 &poly_mul_mulx(); $code.=" 1768 lea 16($itr2), $itr2 1769 jmp 1b 17701: \n"; 1771 &finalize_state_avx2(2); 1772 &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0); 1773 &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" 1774 lea 4*32($inp), $inp 1775 lea 4*32($oup), $oup 1776 sub \$4*32, $inl 1777 jmp open_avx2_tail_loop 17783: 1779 cmp \$12*32, $inl 1780 ja 3f\n"; 1781############################################################################### 1782 # 257-383 bytes left 1783 &prep_state_avx2(3); $code.=" 1784 mov $inl, $tmp_store 1785 mov $inl, $itr1 1786 sub \$8*32, $itr1 1787 shr \$4, $itr1 1788 add \$6, $itr1 1789 mov \$10, $itr2 1790 cmp \$10, $itr1 1791 cmovg $itr2, $itr1 1792 mov $inp, $inl 1793 xor $itr2, $itr2 17941: \n"; 1795 &poly_add("0*8($inl)"); 1796 &poly_mul_mulx(); $code.=" 1797 lea 16($inl), $inl 17982: \n"; 1799 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); 1800 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 1801 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1802 &poly_add("0*8($inl)"); 1803 &poly_mul(); $code.=" 1804 lea 16($inl), $inl 1805 inc $itr2\n"; 1806 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); 1807 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 1808 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.=" 1809 cmp $itr1, $itr2 1810 jb 1b 1811 cmp \$10, $itr2 1812 jne 2b 1813 mov $inl, $itr2 1814 sub $inp, $inl 1815 mov $inl, $itr1 1816 mov $tmp_store, $inl 18171: 1818 add \$16, $itr1 1819 cmp $inl, $itr1 1820 jg 1f\n"; 1821 &poly_add("0*8($itr2)"); 1822 &poly_mul_mulx(); $code.=" 1823 lea 16($itr2), $itr2 1824 jmp 1b 18251: \n"; 1826 &finalize_state_avx2(3); 1827 &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0); 1828 &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0); 1829 &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.=" 1830 lea 8*32($inp), $inp 1831 lea 8*32($oup), $oup 1832 sub \$8*32, $inl 1833 jmp open_avx2_tail_loop 18343: \n"; 1835############################################################################### 1836 # 384-512 bytes left 1837 &prep_state_avx2(4); $code.=" 1838 xor $itr1, $itr1 1839 mov $inp, $itr2 18401: \n"; 1841 &poly_add("0*8($itr2)"); 1842 &poly_mul(); $code.=" 1843 lea 2*8($itr2), $itr2 18442: \n"; 1845 &emit_body(37); 1846 &poly_add("0*8($itr2)"); 1847 &poly_mul_mulx(); 1848 &emit_body(48); 1849 &poly_add("2*8($itr2)"); 1850 &poly_mul_mulx(); $code.=" 1851 lea 4*8($itr2), $itr2\n"; 1852 foreach $l (@loop_body) {$code.=$l."\n";} 1853 @loop_body = split /\n/, $chacha_body; $code.=" 1854 inc $itr1 1855 cmp \$4, $itr1 1856 jl 1b 1857 cmp \$10, $itr1 1858 jne 2b 1859 mov $inl, $itr1 1860 sub \$12*32, $itr1 1861 and \$-16, $itr1 18621: 1863 test $itr1, $itr1 1864 je 1f\n"; 1865 &poly_add("0*8($itr2)"); 1866 &poly_mul_mulx(); $code.=" 1867 lea 2*8($itr2), $itr2 1868 sub \$2*8, $itr1 1869 jmp 1b 18701: \n"; 1871 &finalize_state_avx2(4); $code.=" 1872 vmovdqa $A0, $tmp_store\n"; 1873 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" 1874 vmovdqa $tmp_store, $A0\n"; 1875 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); 1876 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); 1877 &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.=" 1878 lea 12*32($inp), $inp 1879 lea 12*32($oup), $oup 1880 sub \$12*32, $inl 1881open_avx2_tail_loop: 1882 cmp \$32, $inl 1883 jb open_avx2_tail 1884 sub \$32, $inl 1885 vpxor ($inp), $A0, $A0 1886 vmovdqu $A0, ($oup) 1887 lea 1*32($inp), $inp 1888 lea 1*32($oup), $oup 1889 vmovdqa $B0, $A0 1890 vmovdqa $C0, $B0 1891 vmovdqa $D0, $C0 1892 jmp open_avx2_tail_loop 1893open_avx2_tail: 1894 cmp \$16, $inl 1895 vmovdqa $A0x, $A1x 1896 jb 1f 1897 sub \$16, $inl 1898 #load for decryption 1899 vpxor ($inp), $A0x, $A1x 1900 vmovdqu $A1x, ($oup) 1901 lea 1*16($inp), $inp 1902 lea 1*16($oup), $oup 1903 vperm2i128 \$0x11, $A0, $A0, $A0 1904 vmovdqa $A0x, $A1x 19051: 1906 vzeroupper 1907 jmp open_sse_tail_16 1908############################################################################### 1909open_avx2_192: 1910 vmovdqa $A0, $A1 1911 vmovdqa $A0, $A2 1912 vmovdqa $B0, $B1 1913 vmovdqa $B0, $B2 1914 vmovdqa $C0, $C1 1915 vmovdqa $C0, $C2 1916 vpaddd .avx2_inc(%rip), $D0, $D1 1917 vmovdqa $D0, $T2 1918 vmovdqa $D1, $T3 1919 mov \$10, $acc0 19201: \n"; 1921 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 1922 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 1923 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 1924 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" 1925 dec $acc0 1926 jne 1b 1927 vpaddd $A2, $A0, $A0 1928 vpaddd $A2, $A1, $A1 1929 vpaddd $B2, $B0, $B0 1930 vpaddd $B2, $B1, $B1 1931 vpaddd $C2, $C0, $C0 1932 vpaddd $C2, $C1, $C1 1933 vpaddd $T2, $D0, $D0 1934 vpaddd $T3, $D1, $D1 1935 vperm2i128 \$0x02, $A0, $B0, $T0 1936 # Clamp and store the key 1937 vpand .clamp(%rip), $T0, $T0 1938 vmovdqa $T0, $r_store 1939 # Stream for up to 192 bytes 1940 vperm2i128 \$0x13, $A0, $B0, $A0 1941 vperm2i128 \$0x13, $C0, $D0, $B0 1942 vperm2i128 \$0x02, $A1, $B1, $C0 1943 vperm2i128 \$0x02, $C1, $D1, $D0 1944 vperm2i128 \$0x13, $A1, $B1, $A1 1945 vperm2i128 \$0x13, $C1, $D1, $B1 1946open_avx2_short: 1947 mov %r8, $itr2 1948 call poly_hash_ad_internal 1949open_avx2_hash_and_xor_loop: 1950 cmp \$32, $inl 1951 jb open_avx2_short_tail_32 1952 sub \$32, $inl\n"; 1953 # Load + hash 1954 &poly_add("0*8($inp)"); 1955 &poly_mul(); 1956 &poly_add("2*8($inp)"); 1957 &poly_mul(); $code.=" 1958 # Load + decrypt 1959 vpxor ($inp), $A0, $A0 1960 vmovdqu $A0, ($oup) 1961 lea 1*32($inp), $inp 1962 lea 1*32($oup), $oup 1963 # Shift stream 1964 vmovdqa $B0, $A0 1965 vmovdqa $C0, $B0 1966 vmovdqa $D0, $C0 1967 vmovdqa $A1, $D0 1968 vmovdqa $B1, $A1 1969 vmovdqa $C1, $B1 1970 vmovdqa $D1, $C1 1971 vmovdqa $A2, $D1 1972 vmovdqa $B2, $A2 1973 jmp open_avx2_hash_and_xor_loop 1974open_avx2_short_tail_32: 1975 cmp \$16, $inl 1976 vmovdqa $A0x, $A1x 1977 jb 1f 1978 sub \$16, $inl\n"; 1979 &poly_add("0*8($inp)"); 1980 &poly_mul(); $code.=" 1981 vpxor ($inp), $A0x, $A3x 1982 vmovdqu $A3x, ($oup) 1983 lea 1*16($inp), $inp 1984 lea 1*16($oup), $oup 1985 vextracti128 \$1, $A0, $A1x 19861: 1987 vzeroupper 1988 jmp open_sse_tail_16 1989############################################################################### 1990open_avx2_320: 1991 vmovdqa $A0, $A1 1992 vmovdqa $A0, $A2 1993 vmovdqa $B0, $B1 1994 vmovdqa $B0, $B2 1995 vmovdqa $C0, $C1 1996 vmovdqa $C0, $C2 1997 vpaddd .avx2_inc(%rip), $D0, $D1 1998 vpaddd .avx2_inc(%rip), $D1, $D2 1999 vmovdqa $B0, $T1 2000 vmovdqa $C0, $T2 2001 vmovdqa $D0, $ctr0_store 2002 vmovdqa $D1, $ctr1_store 2003 vmovdqa $D2, $ctr2_store 2004 mov \$10, $acc0 20051: \n"; 2006 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 2007 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 2008 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); 2009 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 2010 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 2011 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 2012 dec $acc0 2013 jne 1b 2014 vpaddd .chacha20_consts(%rip), $A0, $A0 2015 vpaddd .chacha20_consts(%rip), $A1, $A1 2016 vpaddd .chacha20_consts(%rip), $A2, $A2 2017 vpaddd $T1, $B0, $B0 2018 vpaddd $T1, $B1, $B1 2019 vpaddd $T1, $B2, $B2 2020 vpaddd $T2, $C0, $C0 2021 vpaddd $T2, $C1, $C1 2022 vpaddd $T2, $C2, $C2 2023 vpaddd $ctr0_store, $D0, $D0 2024 vpaddd $ctr1_store, $D1, $D1 2025 vpaddd $ctr2_store, $D2, $D2 2026 vperm2i128 \$0x02, $A0, $B0, $T0 2027 # Clamp and store the key 2028 vpand .clamp(%rip), $T0, $T0 2029 vmovdqa $T0, $r_store 2030 # Stream for up to 320 bytes 2031 vperm2i128 \$0x13, $A0, $B0, $A0 2032 vperm2i128 \$0x13, $C0, $D0, $B0 2033 vperm2i128 \$0x02, $A1, $B1, $C0 2034 vperm2i128 \$0x02, $C1, $D1, $D0 2035 vperm2i128 \$0x13, $A1, $B1, $A1 2036 vperm2i128 \$0x13, $C1, $D1, $B1 2037 vperm2i128 \$0x02, $A2, $B2, $C1 2038 vperm2i128 \$0x02, $C2, $D2, $D1 2039 vperm2i128 \$0x13, $A2, $B2, $A2 2040 vperm2i128 \$0x13, $C2, $D2, $B2 2041 jmp open_avx2_short 2042.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 2043############################################################################### 2044############################################################################### 2045.type chacha20_poly1305_seal_avx2,\@function,2 2046.align 64 2047chacha20_poly1305_seal_avx2: 2048 vzeroupper 2049 vmovdqa .chacha20_consts(%rip), $A0 2050 vbroadcasti128 0*16($keyp), $B0 2051 vbroadcasti128 1*16($keyp), $C0 2052 vbroadcasti128 2*16($keyp), $D0 2053 vpaddd .avx2_init(%rip), $D0, $D0 2054 cmp \$6*32, $inl 2055 jbe seal_avx2_192 2056 cmp \$10*32, $inl 2057 jbe seal_avx2_320 2058 vmovdqa $A0, $A1 2059 vmovdqa $A0, $A2 2060 vmovdqa $A0, $A3 2061 vmovdqa $B0, $B1 2062 vmovdqa $B0, $B2 2063 vmovdqa $B0, $B3 2064 vmovdqa $B0, $state1_store 2065 vmovdqa $C0, $C1 2066 vmovdqa $C0, $C2 2067 vmovdqa $C0, $C3 2068 vmovdqa $C0, $state2_store 2069 vmovdqa $D0, $D3 2070 vpaddd .avx2_inc(%rip), $D3, $D2 2071 vpaddd .avx2_inc(%rip), $D2, $D1 2072 vpaddd .avx2_inc(%rip), $D1, $D0 2073 vmovdqa $D0, $ctr0_store 2074 vmovdqa $D1, $ctr1_store 2075 vmovdqa $D2, $ctr2_store 2076 vmovdqa $D3, $ctr3_store 2077 mov \$10, $acc0 20781: \n"; 2079 foreach $l (@loop_body) {$code.=$l."\n";} 2080 @loop_body = split /\n/, $chacha_body; $code.=" 2081 dec $acc0 2082 jnz 1b\n"; 2083 &finalize_state_avx2(4); $code.=" 2084 vperm2i128 \$0x13, $C3, $D3, $C3 2085 vperm2i128 \$0x02, $A3, $B3, $D3 2086 vperm2i128 \$0x13, $A3, $B3, $A3 2087 vpand .clamp(%rip), $D3, $D3 2088 vmovdqa $D3, $r_store 2089 mov %r8, $itr2 2090 call poly_hash_ad_internal 2091 # Safely store 320 bytes (otherwise would handle with optimized call) 2092 vpxor 0*32($inp), $A3, $A3 2093 vpxor 1*32($inp), $C3, $C3 2094 vmovdqu $A3, 0*32($oup) 2095 vmovdqu $C3, 1*32($oup)\n"; 2096 &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3); 2097 &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3); 2098 &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.=" 2099 lea 10*32($inp), $inp 2100 sub \$10*32, $inl 2101 mov \$10*32, $itr1 2102 cmp \$4*32, $inl 2103 jbe seal_avx2_hash 2104 vpxor 0*32($inp), $A0, $A0 2105 vpxor 1*32($inp), $B0, $B0 2106 vpxor 2*32($inp), $C0, $C0 2107 vpxor 3*32($inp), $D0, $D0 2108 vmovdqu $A0, 10*32($oup) 2109 vmovdqu $B0, 11*32($oup) 2110 vmovdqu $C0, 12*32($oup) 2111 vmovdqu $D0, 13*32($oup) 2112 lea 4*32($inp), $inp 2113 sub \$4*32, $inl 2114 mov \$8, $itr1 2115 mov \$2, $itr2 2116 cmp \$4*32, $inl 2117 jbe seal_avx2_tail_128 2118 cmp \$8*32, $inl 2119 jbe seal_avx2_tail_256 2120 cmp \$12*32, $inl 2121 jbe seal_avx2_tail_384 2122 cmp \$16*32, $inl 2123 jbe seal_avx2_tail_512\n"; 2124 # We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop 2125 &prep_state_avx2(4); 2126 foreach $l (@loop_body) {$code.=$l."\n";} 2127 @loop_body = split /\n/, $chacha_body; 2128 &emit_body(41); 2129 @loop_body = split /\n/, $chacha_body; $code.=" 2130 sub \$16, $oup 2131 mov \$9, $itr1 2132 jmp 4f 21331: \n"; 2134 &prep_state_avx2(4); $code.=" 2135 mov \$10, $itr1 21362: \n"; 2137 &poly_add("0*8($oup)"); 2138 &emit_body(10); 2139 &poly_stage1_mulx(); 2140 &emit_body(9); 2141 &poly_stage2_mulx(); 2142 &emit_body(12); 2143 &poly_stage3_mulx(); 2144 &emit_body(10); 2145 &poly_reduce_stage(); $code.=" 21464: \n"; 2147 &emit_body(9); 2148 &poly_add("2*8($oup)"); 2149 &emit_body(8); 2150 &poly_stage1_mulx(); 2151 &emit_body(18); 2152 &poly_stage2_mulx(); 2153 &emit_body(18); 2154 &poly_stage3_mulx(); 2155 &emit_body(9); 2156 &poly_reduce_stage(); 2157 &emit_body(8); 2158 &poly_add("4*8($oup)"); $code.=" 2159 lea 6*8($oup), $oup\n"; 2160 &emit_body(18); 2161 &poly_stage1_mulx(); 2162 &emit_body(8); 2163 &poly_stage2_mulx(); 2164 &emit_body(8); 2165 &poly_stage3_mulx(); 2166 &emit_body(18); 2167 &poly_reduce_stage(); 2168 foreach $l (@loop_body) {$code.=$l."\n";} 2169 @loop_body = split /\n/, $chacha_body; $code.=" 2170 dec $itr1 2171 jne 2b\n"; 2172 &finalize_state_avx2(4); $code.=" 2173 lea 4*8($oup), $oup 2174 vmovdqa $A0, $tmp_store\n"; 2175 &poly_add("-4*8($oup)"); 2176 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" 2177 vmovdqa $tmp_store, $A0\n"; 2178 &poly_mul(); 2179 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); 2180 &poly_add("-2*8($oup)"); 2181 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); 2182 &poly_mul(); 2183 &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.=" 2184 lea 16*32($inp), $inp 2185 sub \$16*32, $inl 2186 cmp \$16*32, $inl 2187 jg 1b\n"; 2188 &poly_add("0*8($oup)"); 2189 &poly_mul(); 2190 &poly_add("2*8($oup)"); 2191 &poly_mul(); $code.=" 2192 lea 4*8($oup), $oup 2193 mov \$10, $itr1 2194 xor $itr2, $itr2 2195 cmp \$4*32, $inl 2196 ja 3f 2197############################################################################### 2198seal_avx2_tail_128:\n"; 2199 &prep_state_avx2(1); $code.=" 22001: \n"; 2201 &poly_add("0($oup)"); 2202 &poly_mul(); $code.=" 2203 lea 2*8($oup), $oup 22042: \n"; 2205 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 2206 &poly_add("0*8($oup)"); 2207 &poly_mul(); 2208 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 2209 &poly_add("2*8($oup)"); 2210 &poly_mul(); $code.=" 2211 lea 4*8($oup), $oup 2212 dec $itr1 2213 jg 1b 2214 dec $itr2 2215 jge 2b\n"; 2216 &finalize_state_avx2(1); 2217 &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" 2218 jmp seal_avx2_short_loop 22193: 2220 cmp \$8*32, $inl 2221 ja 3f 2222############################################################################### 2223seal_avx2_tail_256:\n"; 2224 &prep_state_avx2(2); $code.=" 22251: \n"; 2226 &poly_add("0($oup)"); 2227 &poly_mul(); $code.=" 2228 lea 2*8($oup), $oup 22292: \n"; 2230 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 2231 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 2232 &poly_add("0*8($oup)"); 2233 &poly_mul(); 2234 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 2235 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 2236 &poly_add("2*8($oup)"); 2237 &poly_mul(); $code.=" 2238 lea 4*8($oup), $oup 2239 dec $itr1 2240 jg 1b 2241 dec $itr2 2242 jge 2b\n"; 2243 &finalize_state_avx2(2); 2244 &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0); 2245 &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" 2246 mov \$4*32, $itr1 2247 lea 4*32($inp), $inp 2248 sub \$4*32, $inl 2249 jmp seal_avx2_hash 22503: 2251 cmp \$12*32, $inl 2252 ja seal_avx2_tail_512 2253############################################################################### 2254seal_avx2_tail_384:\n"; 2255 &prep_state_avx2(3); $code.=" 22561: \n"; 2257 &poly_add("0($oup)"); 2258 &poly_mul(); $code.=" 2259 lea 2*8($oup), $oup 22602: \n"; 2261 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 2262 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 2263 &poly_add("0*8($oup)"); 2264 &poly_mul(); 2265 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); 2266 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 2267 &poly_add("2*8($oup)"); 2268 &poly_mul(); 2269 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 2270 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 2271 lea 4*8($oup), $oup 2272 dec $itr1 2273 jg 1b 2274 dec $itr2 2275 jge 2b\n"; 2276 &finalize_state_avx2(3); 2277 &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0); 2278 &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0); 2279 &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" 2280 mov \$8*32, $itr1 2281 lea 8*32($inp), $inp 2282 sub \$8*32, $inl 2283 jmp seal_avx2_hash 2284############################################################################### 2285seal_avx2_tail_512:\n"; 2286 &prep_state_avx2(4); $code.=" 22871: \n"; 2288 &poly_add("0($oup)"); 2289 &poly_mul_mulx(); $code.=" 2290 lea 2*8($oup), $oup 22912: \n"; 2292 &emit_body(20); 2293 &poly_add("0*8($oup)"); 2294 &emit_body(20); 2295 &poly_stage1_mulx(); 2296 &emit_body(20); 2297 &poly_stage2_mulx(); 2298 &emit_body(20); 2299 &poly_stage3_mulx(); 2300 &emit_body(20); 2301 &poly_reduce_stage(); 2302 &emit_body(20); 2303 &poly_add("2*8($oup)"); 2304 &emit_body(20); 2305 &poly_stage1_mulx(); 2306 &emit_body(20); 2307 &poly_stage2_mulx(); 2308 &emit_body(20); 2309 &poly_stage3_mulx(); 2310 &emit_body(20); 2311 &poly_reduce_stage(); 2312 foreach $l (@loop_body) {$code.=$l."\n";} 2313 @loop_body = split /\n/, $chacha_body; $code.=" 2314 lea 4*8($oup), $oup 2315 dec $itr1 2316 jg 1b 2317 dec $itr2 2318 jge 2b\n"; 2319 &finalize_state_avx2(4); $code.=" 2320 vmovdqa $A0, $tmp_store\n"; 2321 &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.=" 2322 vmovdqa $tmp_store, $A0\n"; 2323 &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3); 2324 &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3); 2325 &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.=" 2326 mov \$12*32, $itr1 2327 lea 12*32($inp), $inp 2328 sub \$12*32, $inl 2329 jmp seal_avx2_hash 2330################################################################################ 2331seal_avx2_320: 2332 vmovdqa $A0, $A1 2333 vmovdqa $A0, $A2 2334 vmovdqa $B0, $B1 2335 vmovdqa $B0, $B2 2336 vmovdqa $C0, $C1 2337 vmovdqa $C0, $C2 2338 vpaddd .avx2_inc(%rip), $D0, $D1 2339 vpaddd .avx2_inc(%rip), $D1, $D2 2340 vmovdqa $B0, $T1 2341 vmovdqa $C0, $T2 2342 vmovdqa $D0, $ctr0_store 2343 vmovdqa $D1, $ctr1_store 2344 vmovdqa $D2, $ctr2_store 2345 mov \$10, $acc0 23461: \n"; 2347 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 2348 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 2349 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left"); 2350 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 2351 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); 2352 &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.=" 2353 dec $acc0 2354 jne 1b 2355 vpaddd .chacha20_consts(%rip), $A0, $A0 2356 vpaddd .chacha20_consts(%rip), $A1, $A1 2357 vpaddd .chacha20_consts(%rip), $A2, $A2 2358 vpaddd $T1, $B0, $B0 2359 vpaddd $T1, $B1, $B1 2360 vpaddd $T1, $B2, $B2 2361 vpaddd $T2, $C0, $C0 2362 vpaddd $T2, $C1, $C1 2363 vpaddd $T2, $C2, $C2 2364 vpaddd $ctr0_store, $D0, $D0 2365 vpaddd $ctr1_store, $D1, $D1 2366 vpaddd $ctr2_store, $D2, $D2 2367 vperm2i128 \$0x02, $A0, $B0, $T0 2368 # Clamp and store the key 2369 vpand .clamp(%rip), $T0, $T0 2370 vmovdqa $T0, $r_store 2371 # Stream for up to 320 bytes 2372 vperm2i128 \$0x13, $A0, $B0, $A0 2373 vperm2i128 \$0x13, $C0, $D0, $B0 2374 vperm2i128 \$0x02, $A1, $B1, $C0 2375 vperm2i128 \$0x02, $C1, $D1, $D0 2376 vperm2i128 \$0x13, $A1, $B1, $A1 2377 vperm2i128 \$0x13, $C1, $D1, $B1 2378 vperm2i128 \$0x02, $A2, $B2, $C1 2379 vperm2i128 \$0x02, $C2, $D2, $D1 2380 vperm2i128 \$0x13, $A2, $B2, $A2 2381 vperm2i128 \$0x13, $C2, $D2, $B2 2382 jmp seal_avx2_short 2383################################################################################ 2384seal_avx2_192: 2385 vmovdqa $A0, $A1 2386 vmovdqa $A0, $A2 2387 vmovdqa $B0, $B1 2388 vmovdqa $B0, $B2 2389 vmovdqa $C0, $C1 2390 vmovdqa $C0, $C2 2391 vpaddd .avx2_inc(%rip), $D0, $D1 2392 vmovdqa $D0, $T2 2393 vmovdqa $D1, $T3 2394 mov \$10, $acc0 23951: \n"; 2396 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left"); 2397 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); 2398 &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); 2399 &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.=" 2400 dec $acc0 2401 jne 1b 2402 vpaddd $A2, $A0, $A0 2403 vpaddd $A2, $A1, $A1 2404 vpaddd $B2, $B0, $B0 2405 vpaddd $B2, $B1, $B1 2406 vpaddd $C2, $C0, $C0 2407 vpaddd $C2, $C1, $C1 2408 vpaddd $T2, $D0, $D0 2409 vpaddd $T3, $D1, $D1 2410 vperm2i128 \$0x02, $A0, $B0, $T0 2411 # Clamp and store the key 2412 vpand .clamp(%rip), $T0, $T0 2413 vmovdqa $T0, $r_store 2414 # Stream for up to 192 bytes 2415 vperm2i128 \$0x13, $A0, $B0, $A0 2416 vperm2i128 \$0x13, $C0, $D0, $B0 2417 vperm2i128 \$0x02, $A1, $B1, $C0 2418 vperm2i128 \$0x02, $C1, $D1, $D0 2419 vperm2i128 \$0x13, $A1, $B1, $A1 2420 vperm2i128 \$0x13, $C1, $D1, $B1 2421seal_avx2_short: 2422 mov %r8, $itr2 2423 call poly_hash_ad_internal 2424 xor $itr1, $itr1 2425seal_avx2_hash: 2426 cmp \$16, $itr1 2427 jb seal_avx2_short_loop\n"; 2428 &poly_add("0($oup)"); 2429 &poly_mul(); $code.=" 2430 sub \$16, $itr1 2431 add \$16, $oup 2432 jmp seal_avx2_hash 2433seal_avx2_short_loop: 2434 cmp \$32, $inl 2435 jb seal_avx2_short_tail 2436 sub \$32, $inl 2437 # Encrypt 2438 vpxor ($inp), $A0, $A0 2439 vmovdqu $A0, ($oup) 2440 lea 1*32($inp), $inp 2441 # Load + hash\n"; 2442 &poly_add("0*8($oup)"); 2443 &poly_mul(); 2444 &poly_add("2*8($oup)"); 2445 &poly_mul(); $code.=" 2446 lea 1*32($oup), $oup 2447 # Shift stream 2448 vmovdqa $B0, $A0 2449 vmovdqa $C0, $B0 2450 vmovdqa $D0, $C0 2451 vmovdqa $A1, $D0 2452 vmovdqa $B1, $A1 2453 vmovdqa $C1, $B1 2454 vmovdqa $D1, $C1 2455 vmovdqa $A2, $D1 2456 vmovdqa $B2, $A2 2457 jmp seal_avx2_short_loop 2458seal_avx2_short_tail: 2459 cmp \$16, $inl 2460 jb 1f 2461 sub \$16, $inl 2462 vpxor ($inp), $A0x, $A3x 2463 vmovdqu $A3x, ($oup) 2464 lea 1*16($inp), $inp\n"; 2465 &poly_add("0*8($oup)"); 2466 &poly_mul(); $code.=" 2467 lea 1*16($oup), $oup 2468 vextracti128 \$1, $A0, $A0x 24691: 2470 vzeroupper 2471 jmp seal_sse_tail_16 2472.cfi_endproc 2473"; 2474} 2475 2476if (!$win64) { 2477 $code =~ s/\`([^\`]*)\`/eval $1/gem; 2478 print $code; 2479} else { 2480 print <<___; 2481.text 2482.globl dummy_chacha20_poly1305_asm 2483.type dummy_chacha20_poly1305_asm,\@abi-omnipotent 2484dummy_chacha20_poly1305_asm: 2485 ret 2486___ 2487} 2488 2489close STDOUT or die "error closing STDOUT"; 2490