#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that
# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
# function features so called "528B" variant utilizing additional
# 256+16 bytes of per-key storage [+512 bytes shared table].
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	comparison is not completely fair, because C results are
#	for vanilla "256B" implementation, while assembler results
#	are for "528B";-)
# (**)	it's a mystery [to me] why the Core2 result is not the same
#	as for Opteron;

# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter, ghash-x86.pl discusses why it makes less sense to
# increase the aggregate factor. Then why increase it here? The
# critical path consists of 3 independent pclmulqdq instructions,
# Karatsuba post-processing and reduction. "On top" of this we lay
# down aggregated multiplication operations, triplets of independent
# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
# less sense to aggregate more multiplications than it takes to
# perform the remaining non-multiplication operations. 2x is a
# near-optimal coefficient for contemporary Intel CPUs (hence the
# modest improvement coefficients), but not for Bulldozer. The latter
# is because its logical SIMD operations are twice as slow in
# comparison to Intel, so that the critical path is longer. A CPU
# with a higher pclmulqdq issue rate would also benefit from a higher
# aggregate factor...
#
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%) (if system doesn't support AVX)
# Broadwell	0.45(+110%)(if system doesn't support AVX)
# Skylake	0.44(+110%)(if system doesn't support AVX)
# Bulldozer	1.49(+27%)
# Silvermont	2.88(+13%)
# Knights L	2.12(-)    (if system doesn't support AVX)
# Goldmont	1.08(+24%)

# March 2013
#
# ... 8x aggregate factor AVX code path is using reduction algorithm
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally on them in comparison to the above-mentioned version.
# But thanks to Ilya Albrekht and Max Locktyukhin of Intel Corp. we
# knew that it performs in 0.41 cycles per byte on the Haswell
# processor, in 0.29 on Broadwell, and in 0.36 on Skylake.
#
# Knights Landing achieves 1.09 cpb.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.

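# For reference: the operation implemented by this module is multiplication
# in GF(2^128) with GCM's bit-reflected element encoding (NIST SP 800-38D).
# The helper below is a minimal bit-by-bit sketch of that multiplication,
# kept purely as documentation: it is never called by this script, the name
# and the Math::BigInt encoding (blocks as 128-bit big-endian integers) are
# our own choice, and it is neither constant-time nor fast.
sub _gf128_mul_ref {
	my ($x,$y) = @_;		# Math::BigInt values, 0 <= x,y < 2^128
	require Math::BigInt;
	my $R = Math::BigInt->from_hex("0xe1")->blsft(120);	# 0xE1 || 0^120
	my $z = Math::BigInt->bzero();
	my $v = $x->copy();
	for my $i (0..127) {		# bit 0 is the MSB of the first byte
		$z->bxor($v) if $y->copy()->brsft(127-$i)->is_odd();
		my $carry = $v->is_odd();	# "rightmost" bit in GCM's encoding
		$v->brsft(1);
		$v->bxor($R) if $carry;		# fold in the GCM polynomial
	}
	return $z;
}
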
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be
# computed incorrectly.
#
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 1;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$do4xaggr=1;


$code=<<___;
.text
___


######################################################################
# PCLMULQDQ version.

@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

sub clmul64x64_T2 {	# minimal register pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pshufd		\$0b01001110,$Hkey,$T2
	pxor		$Xi,$T1			#
	pxor		$Hkey,$T2
___
} else {
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1			#
___
}
$code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$HK,$T1		#######
	pxor		$Xi,$T1			#
	pxor		$Xhi,$T1		#

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
}

sub reduction_alg9 {	# 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;

$code.=<<___;
	# 1st phase
	movdqa		$Xi,$T2			#
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#

	# 2nd phase
	movdqa		$Xi,$T2
	psrlq		\$1,$Xi
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
___
}

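# reduction_alg9 folds the 256-bit product held in (Xhi:Xi) back to 128 bits
# modulo the GCM polynomial (x^128 + x^7 + x^2 + x + 1 before bit reflection):
# the 1st phase is built from left shifts by 57, 62 and 63 (composed from the
# <<5/<<1/<<57 chain above), the 2nd phase from right shifts by 1, 2 and 7.
# The name appears to refer to the algorithm numbering in Intel's carry-less
# multiplication white paper.
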
{ my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
.L_init_clmul:
___
$code.=<<___ if ($win64);
	sub	\$0x18,%rsp
.seh_stackalloc	0x18
	movaps	%xmm6,(%rsp)
.seh_savexmm	%xmm6, 0
.seh_endprologue
___
$code.=<<___;
	movdqu		($Xip),$Hkey
	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa		$Hkey,$T1
	psllq		\$1,$Hkey
	pxor		$T3,$T3			#
	psrlq		\$63,$T1
	pcmpgtd		$T2,$T3			# broadcast carry bit
	pslldq		\$8,$T1
	por		$T1,$Hkey		# H<<=1

	# magic reduction
	pand		.L0x1c2_polynomial(%rip),$T3
	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd		\$0b01001110,$Hkey,$HK
	movdqa		$Hkey,$Xi
	pxor		$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$Hkey,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$Hkey,$T1		# Karatsuba pre-processing
	movdqu		$Hkey,0x00($Htbl)	# save H
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x10($Htbl)		# save H^2
	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa		$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$T3,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$T3,$T1			# Karatsuba pre-processing
	movdqu		$T3,0x30($Htbl)		# save H^3
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x40($Htbl)		# save H^4
	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_init_clmul,.-gcm_init_clmul
___
}

{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
.cfi_startproc
	_CET_ENDBR
.L_gmult_clmul:
	movdqu		($Xip),$Xi
	movdqa		.Lbswap_mask(%rip),$T3
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$T2
	pshufb		$T3,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
	# experimental alternative. The special thing about it is that there
	# is no dependency between the two multiplications...
	mov		\$`0xE1<<1`,%eax
	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
	mov		\$0x07,%r11d
	movq		%rax,$T1
	movq		%r10,$T2
	movq		%r11,$T3		# borrow $T3
	pand		$Xi,$T3
	pshufb		$T3,$T2			# ($Xi&7)·0xE0
	movq		%rax,$T3
	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
	pxor		$Xi,$T2
	pslldq		\$15,$T2
	paddd		$T2,$T2			# <<(64+56+1)
	pxor		$T2,$Xi
	pclmulqdq	\$0x01,$T3,$Xi
	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
	psrldq		\$1,$T1
	pxor		$T1,$Xhi
	pslldq		\$7,$Xi
	pxor		$Xhi,$Xi
___
$code.=<<___;
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
	ret
.cfi_endproc
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}

{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));

$code.=<<___;
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,\@abi-omnipotent
.align	32
gcm_ghash_clmul:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
.L_ghash_clmul:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
	lea	-0x20(%rax),%rsp
.seh_stackalloc	0x20+0x88
	movaps	%xmm6,-0x20(%rax)
.seh_savexmm	%xmm6, 0x20-0x20
	movaps	%xmm7,-0x10(%rax)
.seh_savexmm	%xmm7, 0x20-0x10
	movaps	%xmm8,0(%rax)
.seh_savexmm	%xmm8, 0x20+0
	movaps	%xmm9,0x10(%rax)
.seh_savexmm	%xmm9, 0x20+0x10
	movaps	%xmm10,0x20(%rax)
.seh_savexmm	%xmm10, 0x20+0x20
	movaps	%xmm11,0x30(%rax)
.seh_savexmm	%xmm11, 0x20+0x30
	movaps	%xmm12,0x40(%rax)
.seh_savexmm	%xmm12, 0x20+0x40
	movaps	%xmm13,0x50(%rax)
.seh_savexmm	%xmm13, 0x20+0x50
	movaps	%xmm14,0x60(%rax)
.seh_savexmm	%xmm14, 0x20+0x60
	movaps	%xmm15,0x70(%rax)
.seh_savexmm	%xmm15, 0x20+0x70
.seh_endprologue
___
$code.=<<___;
	movdqa		.Lbswap_mask(%rip),$T3

	movdqu		($Xip),$Xi
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$HK
	pshufb		$T3,$Xi

	sub		\$0x10,$len
	jz		.Lodd_tail

	movdqu		0x10($Htbl),$Hkey2
___
if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));

$code.=<<___;
	cmp		\$0x30,$len
	jb		.Lskip4x

	sub		\$0x30,$len
	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
	movdqu		0x30($Htbl),$Hkey3
	movdqu		0x40($Htbl),$Hkey4

	#######
	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
	#
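	# (i.e. the GHASH recurrence Xi+1 = H*(Ii+Xi) unrolled four times,
	#  H*(Ii+3 + H*(Ii+2 + H*(Ii+1 + H*(Ii+Xi)))) mod P, with the powers
	#  of H taken from the table prepared by gcm_init_clmul)
	#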
	movdqu		0x30($inp),$Xln
	movdqu		0x20($inp),$Xl
	pshufb		$T3,$Xln
	pshufb		$T3,$Xl
	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey2,$Xl
	pclmulqdq	\$0x11,$Hkey2,$Xh
	pclmulqdq	\$0x10,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	xorps		$Xm,$Xmn

	movdqu		0x10($inp),$Xl
	movdqu		0($inp),$T1
	pshufb		$T3,$Xl
	pshufb		$T3,$T1
	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T1,$Xi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	movdqa		$Xi,$Xhi
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1
	pclmulqdq	\$0x11,$Hkey3,$Xh
	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn

	lea		0x40($inp),$inp
	sub		\$0x40,$len
	jc		.Ltail4x

	jmp		.Lmod4_loop
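	# The loop below interleaves the Karatsuba post-processing and
	# reduction of the previous result with the pclmulqdq's for the next
	# four blocks (see the "December 2012" note at the top of the file).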
.align	32
.Lmod4_loop:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	xorps		$Xm,$Xmn
	movdqu		0x30($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	xorps		$Xln,$Xi
	movdqu		0x20($inp),$Xln
	movdqa		$Xl,$Xh
	pclmulqdq	\$0x10,$HK,$T1
	pshufd		\$0b01001110,$Xl,$Xm
	xorps		$Xhn,$Xhi
	pxor		$Xl,$Xm
	pshufb		$T3,$Xln
	movups		0x20($Htbl),$HK
	xorps		$Xmn,$T1
	pclmulqdq	\$0x00,$Hkey,$Xl
	pshufd		\$0b01001110,$Xln,$Xmn

	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	movdqa		$Xln,$Xhn
	pxor		$Xhi,$T1		#
	pxor		$Xln,$Xmn
	movdqa		$T1,$T2			#
	pclmulqdq	\$0x11,$Hkey,$Xh
	pslldq		\$8,$T1
	psrldq		\$8,$T2			#
	pxor		$T1,$Xi
	movdqa		.L7_mask(%rip),$T1
	pxor		$T2,$Xhi		#
	movq		%rax,$T2

	pand		$Xi,$T1			# 1st phase
	pshufb		$T1,$T2			#
	pxor		$Xi,$T2			#
	pclmulqdq	\$0x00,$HK,$Xm
	psllq		\$57,$T2		#
	movdqa		$T2,$T1			#
	pslldq		\$8,$T2
	pclmulqdq	\$0x00,$Hkey2,$Xln
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#
	movdqu		0($inp),$T1

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhn
	xorps		$Xl,$Xln
	movdqu		0x10($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x10,$HK,$Xmn
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	pshufb		$T3,$T1
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi

	movdqa		$Xl,$Xh
	pxor		$Xm,$Xmn
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T2,$Xi			#
	pxor		$T1,$Xhi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
	movdqa		$Xi,$Xhi
	pclmulqdq	\$0x11,$Hkey3,$Xh
	xorps		$Xl,$Xln
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1

	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xh,$Xhn

	lea		0x40($inp),$inp
	sub		\$0x40,$len
	jnc		.Lmod4_loop

.Ltail4x:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	pclmulqdq	\$0x10,$HK,$T1
	xorps		$Xm,$Xmn
	xorps		$Xln,$Xi
	xorps		$Xhn,$Xhi
	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
	pxor		$Xmn,$T1

	pxor		$Xhi,$T1		#
	pxor		$Xi,$Xhi

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
	&reduction_alg9($Xhi,$Xi);
$code.=<<___;
	add		\$0x40,$len
	jz		.Ldone
	movdqu		0x20($Htbl),$HK
	sub		\$0x10,$len
	jz		.Lodd_tail
.Lskip4x:
___
}
$code.=<<___;
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
	movdqu		($inp),$T1		# Ii
	movdqu		16($inp),$Xln		# Ii+1
	pshufb		$T3,$T1
	pshufb		$T3,$Xln
	pxor		$T1,$Xi			# Ii+Xi

	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	lea		32($inp),$inp		# i+=2
	nop
	sub		\$0x20,$len
	jbe		.Leven_tail
	nop
	jmp		.Lmod_loop

.align	32
.Lmod_loop:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	movdqu		($inp),$T2		# Ii
	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	pshufb		$T3,$T2
	movdqu		16($inp),$Xln		# Ii+1

	pxor		$Xhi,$T1
	pxor		$T2,$Xhi		# "Ii+Xi", consume early
	pxor		$T1,$Xmn
	pshufb		$T3,$Xln
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#

	movdqa		$Xln,$Xhn		#

	movdqa		$Xi,$T2			# 1st phase
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pshufd		\$0b01001110,$Xhn,$Xmn
	pxor		$T1,$Xhi		#
	pxor		$Xhn,$Xmn		#

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	lea		32($inp),$inp
	psrlq		\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	pxor		$Xhi,$Xi		#

	sub		\$0x20,$len
	ja		.Lmod_loop

.Leven_tail:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	pxor		$Xi,$T1
	pxor		$Xhi,$T1
	pxor		$T1,$Xmn
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#
___
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	test		$len,$len
	jnz		.Ldone

.Lodd_tail:
	movdqu		($inp),$T1		# Ii
	pshufb		$T3,$T1
	pxor		$T1,$Xi			# Ii+Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
.Ldone:
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}

$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
my $HK="%xmm6";

$code.=<<___ if ($win64);
	sub	\$0x18,%rsp
.seh_stackalloc	0x18
	movaps	%xmm6,(%rsp)
.seh_savexmm	%xmm6, 0
.seh_endprologue
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Hkey
	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq		\$63,$Hkey,$T1
	vpsllq		\$1,$Hkey,$Hkey
	vpxor		$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
	vpslldq		\$8,$T1,$T1
	vpor		$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa		$Hkey,$Xi
	vpxor		$Hkey,$HK,$HK
	mov		\$4,%r10		# up to H^8
	jmp		.Linit_start_avx
___

sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
} else {
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
}
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq		\$8,$T1,$T2		#
	vpsrldq		\$8,$T1,$T1
	vpxor		$T2,$Xi,$Xi		#
	vpxor		$T1,$Xhi,$Xhi
___
}

sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq		\$57,$Xi,$T1		# 1st phase
	vpsllq		\$62,$Xi,$T2
	vpxor		$T1,$T2,$T2		#
	vpsllq		\$63,$Xi,$T1
	vpxor		$T1,$T2,$T2		#
	vpslldq		\$8,$T2,$T1		#
	vpsrldq		\$8,$T2,$T2
	vpxor		$T1,$Xi,$Xi		#
	vpxor		$T2,$Xhi,$Xhi

	vpsrlq		\$1,$Xi,$T2		# 2nd phase
	vpxor		$Xi,$Xhi,$Xhi
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$5,$T2,$T2
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$1,$Xi,$Xi		#
	vpxor		$Xhi,$Xi,$Xi		#
___
}

$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa		$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
	vpshufd		\$0b01001110,$T3,$T1
	vpshufd		\$0b01001110,$Xi,$T2
	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea		0x30($Htbl),$Htbl
	sub		\$1,%r10
	jnz		.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu		$T3,-0x10($Htbl)

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
___
$code.=<<___;
	ret
.seh_endproc
.cfi_endproc
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
$code.=<<___;
	jmp	.L_init_clmul
.size	gcm_init_avx,.-gcm_init_avx
___
}

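# The table written by gcm_init_avx is laid out in 0x30-byte groups: H^(2i-1)
# at +0x00, H^(2i) at +0x10 and the corresponding Karatsuba "salt" at +0x20 of
# each group, for i=1..4 (H^1..H^8). gcm_ghash_avx below indexes it relative
# to $Htbl+0x40 ("size optimization"), hence the 0xNN-0x40 offsets.
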
$code.=<<___;
.globl	gcm_gmult_avx
.type	gcm_gmult_avx,\@abi-omnipotent
.align	32
gcm_gmult_avx:
.cfi_startproc
	_CET_ENDBR
	jmp	.L_gmult_clmul
.cfi_endproc
.size	gcm_gmult_avx,.-gcm_gmult_avx
___

$code.=<<___;
.globl	gcm_ghash_avx
.type	gcm_ghash_avx,\@abi-omnipotent
.align	32
gcm_ghash_avx:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
my ($Xlo,$Xhi,$Xmi,
    $Zlo,$Zhi,$Zmi,
    $Hkey,$HK,$T1,$T2,
    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));

$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
	lea	-0x20(%rax),%rsp
.seh_stackalloc	0x20+0x88
	movaps	%xmm6,-0x20(%rax)
.seh_savexmm	%xmm6, 0x20-0x20
	movaps	%xmm7,-0x10(%rax)
.seh_savexmm	%xmm7, 0x20-0x10
	movaps	%xmm8,0(%rax)
.seh_savexmm	%xmm8, 0x20+0
	movaps	%xmm9,0x10(%rax)
.seh_savexmm	%xmm9, 0x20+0x10
	movaps	%xmm10,0x20(%rax)
.seh_savexmm	%xmm10, 0x20+0x20
	movaps	%xmm11,0x30(%rax)
.seh_savexmm	%xmm11, 0x20+0x30
	movaps	%xmm12,0x40(%rax)
.seh_savexmm	%xmm12, 0x20+0x40
	movaps	%xmm13,0x50(%rax)
.seh_savexmm	%xmm13, 0x20+0x50
	movaps	%xmm14,0x60(%rax)
.seh_savexmm	%xmm14, 0x20+0x60
	movaps	%xmm15,0x70(%rax)
.seh_savexmm	%xmm15, 0x20+0x70
.seh_endprologue
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Xi		# load $Xi
	lea		.L0x1c2_polynomial(%rip),%r10
	lea		0x40($Htbl),$Htbl	# size optimization
	vmovdqu		.Lbswap_mask(%rip),$bswap
	vpshufb		$bswap,$Xi,$Xi
	cmp		\$0x80,$len
	jb		.Lshort_avx
	sub		\$0x80,$len

	vmovdqu		0x70($inp),$Ii		# I[7]
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpshufb		$bswap,$Ii,$Ii
	vmovdqu		0x20-0x40($Htbl),$HK

	vpunpckhqdq	$Ii,$Ii,$T2
	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Ii,$T2,$T2
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x50($inp),$Ii		# I[5]
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpxor		$Ii,$T2,$T2
	vmovdqu		0x40($inp),$Ij		# I[4]
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK

	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpxor		$Xmi,$Zmi,$Zmi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		($inp),$Ij		# I[0]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi

	lea		0x80($inp),$inp
	cmp		\$0x80,$len
	jb		.Ltail_avx

	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
	sub		\$0x80,$len
	jmp		.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x70($inp),$Ii		# I[7]
	vpxor		$Xlo,$Zlo,$Zlo
	vpxor		$Ij,$T1,$T1
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Tred
	vmovdqu		0x20-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Zlo,$Xi,$Xi		# collect result
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vxorps		$Zhi,$Xo,$Xo
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor		$Zmi,$Tred,$Tred
	vxorps		$Ij,$T1,$T1

	vmovdqu		0x50($inp),$Ii		# I[5]
	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Xo,$Tred,$Tred
	vpslldq		\$8,$Tred,$T2
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vpsrldq		\$8,$Tred,$Tred
	vpxor		$T2, $Xi, $Xi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ii
	vxorps		$Tred,$Xo, $Xo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x40($inp),$Ij		# I[4]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vxorps		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi
	vxorps		$Tred,$Xi,$Xi

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vxorps		$Xo,$Tred,$Tred
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		($inp),$Ij		# I[0]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Tred,$Ij,$Ij
	vpclmulqdq	\$0x10,$HK, $T2,$Xmi
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi

	lea		0x80($inp),$inp
	sub		\$0x80,$len
	jnc		.Loop8x_avx

	add		\$0x80,$len
	jmp		.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu		-0x10($inp,$len),$Ii	# very last word
	lea		($inp,$len),$inp
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vmovdqu		0x20-0x40($Htbl),$HK
	vpshufb		$bswap,$Ii,$Ij

	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
	vmovdqa		$Xhi,$Zhi		# $Zhi and
	vmovdqa		$Xmi,$Zmi		# $Zmi
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x20($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x30($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x50-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x40($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x50($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x80-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x60($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x70($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovq		0xb8-0x40($Htbl),$HK
	sub		\$0x10,$len
	jmp		.Ltail_avx

.align	32
.Ltail_avx:
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu		(%r10),$Tred

	vpxor		$Xlo,$Zlo,$Xi
	vpxor		$Xhi,$Zhi,$Xo
	vpxor		$Xmi,$Zmi,$Zmi

	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor		$Xo, $Zmi,$Zmi
	vpslldq		\$8, $Zmi,$T2
	vpsrldq		\$8, $Zmi,$Zmi
	vpxor		$T2, $Xi, $Xi
	vpxor		$Zmi,$Xo, $Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$Xo,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	cmp		\$0,$len
	jne		.Lshort_avx

	vpshufb		$bswap,$Xi,$Xi
	vmovdqu		$Xi,($Xip)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
$code.=<<___;
	jmp	.L_ghash_clmul
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}

$code.=<<___;
.section .rodata
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
	.long	7,0,7,0
.align	64

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.text
___

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";