#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that
# it uses a 256-byte per-key table [+128 bytes shared table]. The GHASH
# function features a so-called "528B" variant utilizing an additional
# 256+16 bytes of per-key storage [+512 bytes shared table].
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	comparison is not completely fair, because C results are
#	for the vanilla "256B" implementation, while assembler results
#	are for "528B";-)
# (**)	it's a mystery [to me] why the Core2 result is not the same as
#	for Opteron;

# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter: ghash-x86.pl discusses why it makes less sense to
# increase the aggregate factor. Then why increase it here? The critical
# path consists of 3 independent pclmulqdq instructions, Karatsuba
# post-processing and reduction. "On top" of this we lay down aggregated
# multiplication operations, triplets of independent pclmulqdq's. As
# the issue rate for pclmulqdq is limited, it makes less sense to
# aggregate more multiplications than it takes to perform the remaining
# non-multiplication operations. 2x is a near-optimal coefficient for
# contemporary Intel CPUs (hence the modest improvement), but not for
# Bulldozer. The latter is because logical SIMD operations are twice as
# slow there as on Intel, so the critical path is longer. A CPU with a
# higher pclmulqdq issue rate would also benefit from a higher aggregate
# factor...
#
#		Westmere	1.78(+13%)
#		Sandy Bridge	1.80(+8%)
#		Ivy Bridge	1.80(+7%)
#		Haswell		0.55(+93%) (if system doesn't support AVX)
#		Broadwell	0.45(+110%)(if system doesn't support AVX)
#		Skylake		0.44(+110%)(if system doesn't support AVX)
#		Bulldozer	1.49(+27%)
#		Silvermont	2.88(+13%)
#		Knights L	2.12(-)    (if system doesn't support AVX)
#		Goldmont	1.08(+24%)

# March 2013
#
# ... the 8x aggregate factor AVX code path uses the reduction algorithm
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to the above mentioned version. But thanks
# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we know that
# it performs at 0.41 cycles per byte on a Haswell processor, at
# 0.29 on Broadwell, and at 0.36 on Skylake.
#
# Knights Landing achieves 1.09 cpb.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be
# computed incorrectly.
#
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 1;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$do4xaggr=1;


$code=<<___;
.text
.extern	OPENSSL_ia32cap_P
___


######################################################################
# PCLMULQDQ version.

@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

sub clmul64x64_T2 {	# minimal register pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pshufd		\$0b01001110,$Hkey,$T2
	pxor		$Xi,$T1			#
	pxor		$Hkey,$T2
___
} else {
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1			#
___
}
$code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$HK,$T1		#######
	pxor		$Xi,$T1			#
	pxor		$Xhi,$T1		#

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
}

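# clmul64x64_T2 above forms the full 128x128->256-bit carry-less product with
# only three pclmulqdq by Karatsuba: writing X = X1*x^64 + X0 and
# H = H1*x^64 + H0,
#
#	X*H = X1*H1*x^128 + [(X1+X0)*(H1+H0) + X1*H1 + X0*H0]*x^64 + X0*H0
#
# where "+" is addition in GF(2), i.e. XOR; the three pclmulqdq above are
# exactly X0*H0, X1*H1 and (X1+X0)*(H1+H0). The reference model below is an
# illustrative sketch of that identity on small operands only; it is never
# called and has no effect on the generated code.

sub clmul_ref {		# carry-less multiply of two small non-negative integers
my ($a,$b)=@_;
my $r=0;
    for (my $i=0; ($b>>$i); $i++) {
	$r ^= $a<<$i	if (($b>>$i)&1);
    }
    $r;
}

sub karatsuba_clmul_ref {	# same product via three carry-less multiplies
my ($a,$b,$w)=@_;		# $w is the bit width of each half, e.g. 4,
				# so $a,$b are assumed to fit in 2*$w bits
my $mask=(1<<$w)-1;
my ($a1,$a0)=($a>>$w,$a&$mask);
my ($b1,$b0)=($b>>$w,$b&$mask);
my $hi =clmul_ref($a1,$b1);
my $lo =clmul_ref($a0,$b0);
my $mid=clmul_ref($a1^$a0,$b1^$b0)^$hi^$lo;
    ($hi<<(2*$w))^($mid<<$w)^$lo;	# equals clmul_ref($a,$b)
}
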
sub reduction_alg9 {	# 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;

$code.=<<___;
	# 1st phase
	movdqa		$Xi,$T2			#
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#

	# 2nd phase
	movdqa		$Xi,$T2
	psrlq		\$1,$Xi
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
___
}

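# The sequence above reduces the 256-bit carry-less product in $Xhi:$Xi modulo
# the GHASH polynomial x^128 + x^7 + x^2 + x + 1. The code works on the
# bit-reflected representation (note the byte swapping via .Lbswap_mask and
# the .L0x1c2_polynomial constant), and this routine performs the whole
# reduction in two phases with nothing but shifts and XORs, without spending
# additional pclmulqdq on it. The AVX tail code further down takes the other
# route and folds with vpclmulqdq against .L0x1c2_polynomial instead.
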
{ my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

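# gcm_init_clmul precomputes the key-dependent table consumed by
# gcm_gmult_clmul and gcm_ghash_clmul. With $do4xaggr the layout of ($Htbl)
# comes out as:
#
#	0x00	H	0x30	H^3
#	0x10	H^2	0x40	H^4
#	0x20	Karatsuba "salt" for H/H^2 (H.lo^H.hi in the low half,
#		H^2.lo^H^2.hi in the high half)
#	0x50	the corresponding "salt" for H^3/H^4
#
# so the middle Karatsuba multiplication can take its pre-XORed operand
# straight from memory.
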
$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
.L_init_clmul:
___
$code.=<<___ if ($win64);
	sub	\$0x18,%rsp
.seh_allocstack	0x18
	movaps	%xmm6,(%rsp)
.seh_savexmm128	%xmm6, 0
___
$code.=<<___;
	movdqu		($Xip),$Hkey
	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa		$Hkey,$T1
	psllq		\$1,$Hkey
	pxor		$T3,$T3			#
	psrlq		\$63,$T1
	pcmpgtd		$T2,$T3			# broadcast carry bit
	pslldq		\$8,$T1
	por		$T1,$Hkey		# H<<=1

	# magic reduction
	pand		.L0x1c2_polynomial(%rip),$T3
	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd		\$0b01001110,$Hkey,$HK
	movdqa		$Hkey,$Xi
	pxor		$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$Hkey,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$Hkey,$T1		# Karatsuba pre-processing
	movdqu		$Hkey,0x00($Htbl)	# save H
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x10($Htbl)		# save H^2
	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa		$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$T3,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$T3,$T1			# Karatsuba pre-processing
	movdqu		$T3,0x30($Htbl)		# save H^3
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x40($Htbl)		# save H^4
	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_init_clmul,.-gcm_init_clmul
___
}

{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
.cfi_startproc
	_CET_ENDBR
.L_gmult_clmul:
	movdqu		($Xip),$Xi
	movdqa		.Lbswap_mask(%rip),$T3
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$T2
	pshufb		$T3,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
	# experimental alternative. the special thing about it is that there
	# is no dependency between the two multiplications...
	mov		\$`0xE1<<1`,%eax
	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
	mov		\$0x07,%r11d
	movq		%rax,$T1
	movq		%r10,$T2
	movq		%r11,$T3		# borrow $T3
	pand		$Xi,$T3
	pshufb		$T3,$T2			# ($Xi&7)·0xE0
	movq		%rax,$T3
	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
	pxor		$Xi,$T2
	pslldq		\$15,$T2
	paddd		$T2,$T2			# <<(64+56+1)
	pxor		$T2,$Xi
	pclmulqdq	\$0x01,$T3,$Xi
	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
	psrldq		\$1,$T1
	pxor		$T1,$Xhi
	pslldq		\$7,$Xi
	pxor		$Xhi,$Xi
___
$code.=<<___;
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
	ret
.cfi_endproc
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}

{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));

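# gcm_ghash_clmul folds $len bytes at ($inp) into the hash value at ($Xip).
# Shape of the code that follows: a single trailing block is handled at
# .Lodd_tail; otherwise, when at least four blocks remain and the CPU check
# below doesn't rule it out, the 4x-aggregated .Lmod4_loop (using H..H^4 from
# the table) runs first, and whatever is left is finished by the 2x-aggregated
# .Lmod_loop / .Leven_tail plus, for an odd final block, .Lodd_tail.
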
$code.=<<___;
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,\@abi-omnipotent
.align	32
gcm_ghash_clmul:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
.L_ghash_clmul:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
	lea	-0x20(%rax),%rsp
.seh_allocstack	0x20+0x88
	movaps	%xmm6,-0x20(%rax)
.seh_savexmm128	%xmm6, 0x20-0x20
	movaps	%xmm7,-0x10(%rax)
.seh_savexmm128	%xmm7, 0x20-0x10
	movaps	%xmm8,0(%rax)
.seh_savexmm128	%xmm8, 0x20+0
	movaps	%xmm9,0x10(%rax)
.seh_savexmm128	%xmm9, 0x20+0x10
	movaps	%xmm10,0x20(%rax)
.seh_savexmm128	%xmm10, 0x20+0x20
	movaps	%xmm11,0x30(%rax)
.seh_savexmm128	%xmm11, 0x20+0x30
	movaps	%xmm12,0x40(%rax)
.seh_savexmm128	%xmm12, 0x20+0x40
	movaps	%xmm13,0x50(%rax)
.seh_savexmm128	%xmm13, 0x20+0x50
	movaps	%xmm14,0x60(%rax)
.seh_savexmm128	%xmm14, 0x20+0x60
	movaps	%xmm15,0x70(%rax)
.seh_savexmm128	%xmm15, 0x20+0x70
___
$code.=<<___;
	movdqa		.Lbswap_mask(%rip),$T3

	movdqu		($Xip),$Xi
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$HK
	pshufb		$T3,$Xi

	sub		\$0x10,$len
	jz		.Lodd_tail

	movdqu		0x10($Htbl),$Hkey2
___
if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));

$code.=<<___;
	leaq		OPENSSL_ia32cap_P(%rip),%rax
	mov		4(%rax),%eax
	cmp		\$0x30,$len
	jb		.Lskip4x

	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
	je		.Lskip4x

	sub		\$0x30,$len
	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
	movdqu		0x30($Htbl),$Hkey3
	movdqu		0x40($Htbl),$Hkey4

	#######
	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
	#
	movdqu		0x30($inp),$Xln
	movdqu		0x20($inp),$Xl
	pshufb		$T3,$Xln
	pshufb		$T3,$Xl
	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey2,$Xl
	pclmulqdq	\$0x11,$Hkey2,$Xh
	pclmulqdq	\$0x10,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	xorps		$Xm,$Xmn

	movdqu		0x10($inp),$Xl
	movdqu		0($inp),$T1
	pshufb		$T3,$Xl
	pshufb		$T3,$T1
	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T1,$Xi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	movdqa		$Xi,$Xhi
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1
	pclmulqdq	\$0x11,$Hkey3,$Xh
	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn

	lea		0x40($inp),$inp
	sub		\$0x40,$len
	jc		.Ltail4x

	jmp		.Lmod4_loop
.align	32
.Lmod4_loop:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	xorps		$Xm,$Xmn
	movdqu		0x30($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	xorps		$Xln,$Xi
	movdqu		0x20($inp),$Xln
	movdqa		$Xl,$Xh
	pclmulqdq	\$0x10,$HK,$T1
	pshufd		\$0b01001110,$Xl,$Xm
	xorps		$Xhn,$Xhi
	pxor		$Xl,$Xm
	pshufb		$T3,$Xln
	movups		0x20($Htbl),$HK
	xorps		$Xmn,$T1
	pclmulqdq	\$0x00,$Hkey,$Xl
	pshufd		\$0b01001110,$Xln,$Xmn

	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	movdqa		$Xln,$Xhn
	pxor		$Xhi,$T1		#
	pxor		$Xln,$Xmn
	movdqa		$T1,$T2			#
	pclmulqdq	\$0x11,$Hkey,$Xh
	pslldq		\$8,$T1
	psrldq		\$8,$T2			#
	pxor		$T1,$Xi
	movdqa		.L7_mask(%rip),$T1
	pxor		$T2,$Xhi		#
	movq		%rax,$T2

	pand		$Xi,$T1			# 1st phase
	pshufb		$T1,$T2			#
	pxor		$Xi,$T2			#
	pclmulqdq	\$0x00,$HK,$Xm
	psllq		\$57,$T2		#
	movdqa		$T2,$T1			#
	pslldq		\$8,$T2
	pclmulqdq	\$0x00,$Hkey2,$Xln
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#
	movdqu		0($inp),$T1

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhn
	xorps		$Xl,$Xln
	movdqu		0x10($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x10,$HK,$Xmn
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	pshufb		$T3,$T1
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi

	movdqa		$Xl,$Xh
	pxor		$Xm,$Xmn
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T2,$Xi			#
	pxor		$T1,$Xhi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
	movdqa		$Xi,$Xhi
	pclmulqdq	\$0x11,$Hkey3,$Xh
	xorps		$Xl,$Xln
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1

	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xh,$Xhn

	lea		0x40($inp),$inp
	sub		\$0x40,$len
	jnc		.Lmod4_loop

.Ltail4x:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	pclmulqdq	\$0x10,$HK,$T1
	xorps		$Xm,$Xmn
	xorps		$Xln,$Xi
	xorps		$Xhn,$Xhi
	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
	pxor		$Xmn,$T1

	pxor		$Xhi,$T1		#
	pxor		$Xi,$Xhi

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
	&reduction_alg9($Xhi,$Xi);
$code.=<<___;
	add		\$0x40,$len
	jz		.Ldone
	movdqu		0x20($Htbl),$HK
	sub		\$0x10,$len
	jz		.Lodd_tail
.Lskip4x:
___
}
$code.=<<___;
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
	movdqu		($inp),$T1		# Ii
	movdqu		16($inp),$Xln		# Ii+1
	pshufb		$T3,$T1
	pshufb		$T3,$Xln
	pxor		$T1,$Xi			# Ii+Xi

	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	lea		32($inp),$inp		# i+=2
	nop
	sub		\$0x20,$len
	jbe		.Leven_tail
	nop
	jmp		.Lmod_loop

.align	32
.Lmod_loop:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	movdqu		($inp),$T2		# Ii
	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	pshufb		$T3,$T2
	movdqu		16($inp),$Xln		# Ii+1

	pxor		$Xhi,$T1
	pxor		$T2,$Xhi		# "Ii+Xi", consume early
	pxor		$T1,$Xmn
	pshufb		$T3,$Xln
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#

	movdqa		$Xln,$Xhn		#

	movdqa		$Xi,$T2			# 1st phase
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pshufd		\$0b01001110,$Xhn,$Xmn
	pxor		$T1,$Xhi		#
	pxor		$Xhn,$Xmn		#

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	lea		32($inp),$inp
	psrlq		\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	pxor		$Xhi,$Xi		#

	sub		\$0x20,$len
	ja		.Lmod_loop

.Leven_tail:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	pxor		$Xi,$T1
	pxor		$Xhi,$T1
	pxor		$T1,$Xmn
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#
___
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	test		$len,$len
	jnz		.Ldone

.Lodd_tail:
	movdqu		($inp),$T1		# Ii
	pshufb		$T3,$T1
	pxor		$T1,$Xi			# Ii+Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
.Ldone:
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}

$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
.cfi_startproc
	_CET_ENDBR
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
my $HK="%xmm6";

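# The AVX flavour precomputes H through H^8. Each pass of .Linit_loop_avx
# below stores one odd/even pair of powers plus their shared Karatsuba "salt",
# so ($Htbl) ends up holding, for i=1..4, H^(2i-1) at 0x30*(i-1), H^(2i) at
# 0x30*(i-1)+0x10 and the salt at 0x30*(i-1)+0x20; gcm_ghash_avx reads the
# table back with exactly these offsets (biased by 0x40 as a size
# optimization).
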
$code.=<<___ if ($win64);
.seh_startproc
	sub	\$0x18,%rsp
.seh_allocstack	0x18
	movaps	%xmm6,(%rsp)
.seh_savexmm128	%xmm6, 0
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Hkey
	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq		\$63,$Hkey,$T1
	vpsllq		\$1,$Hkey,$Hkey
	vpxor		$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
	vpslldq		\$8,$T1,$T1
	vpor		$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa		$Hkey,$Xi
	vpxor		$Hkey,$HK,$HK
	mov		\$4,%r10		# up to H^8
	jmp		.Linit_start_avx
___

sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
} else {
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
}
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq		\$8,$T1,$T2		#
	vpsrldq		\$8,$T1,$T1
	vpxor		$T2,$Xi,$Xi		#
	vpxor		$T1,$Xhi,$Xhi
___
}

sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq		\$57,$Xi,$T1		# 1st phase
	vpsllq		\$62,$Xi,$T2
	vpxor		$T1,$T2,$T2		#
	vpsllq		\$63,$Xi,$T1
	vpxor		$T1,$T2,$T2		#
	vpslldq		\$8,$T2,$T1		#
	vpsrldq		\$8,$T2,$T2
	vpxor		$T1,$Xi,$Xi		#
	vpxor		$T2,$Xhi,$Xhi

	vpsrlq		\$1,$Xi,$T2		# 2nd phase
	vpxor		$Xi,$Xhi,$Xhi
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$5,$T2,$T2
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$1,$Xi,$Xi		#
	vpxor		$Xhi,$Xi,$Xi		#
___
}

$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa		$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
	vpshufd		\$0b01001110,$T3,$T1
	vpshufd		\$0b01001110,$Xi,$T2
	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea		0x30($Htbl),$Htbl
	sub		\$1,%r10
	jnz		.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu		$T3,-0x10($Htbl)

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
___
$code.=<<___;
	ret
.seh_endproc
.cfi_endproc
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
$code.=<<___;
	jmp	.L_init_clmul
.size	gcm_init_avx,.-gcm_init_avx
___
}

$code.=<<___;
.globl	gcm_ghash_avx
.type	gcm_ghash_avx,\@abi-omnipotent
.align	32
gcm_ghash_avx:
.cfi_startproc
	_CET_ENDBR
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
my ($Xlo,$Xhi,$Xmi,
    $Zlo,$Zhi,$Zmi,
    $Hkey,$HK,$T1,$T2,
    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));

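	#######
	# By analogy with the 4x comment in gcm_ghash_clmul, the 8x-aggregated
	# main loop below computes
	#
	# Xi+8 =[H*Ii+7 + H^2*Ii+6 + ... + H^7*Ii+1 + H^8*(Ii+Xi)] mod P
	#
	# gathering all eight Karatsuba middle terms before a single reduction.
	#######
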
$code.=<<___ if ($win64);
.seh_startproc
	lea	-0x88(%rsp),%rax
	lea	-0x20(%rax),%rsp
.seh_allocstack	0x20+0x88
	movaps	%xmm6,-0x20(%rax)
.seh_savexmm128	%xmm6, 0x20-0x20
	movaps	%xmm7,-0x10(%rax)
.seh_savexmm128	%xmm7, 0x20-0x10
	movaps	%xmm8,0(%rax)
.seh_savexmm128	%xmm8, 0x20+0
	movaps	%xmm9,0x10(%rax)
.seh_savexmm128	%xmm9, 0x20+0x10
	movaps	%xmm10,0x20(%rax)
.seh_savexmm128	%xmm10, 0x20+0x20
	movaps	%xmm11,0x30(%rax)
.seh_savexmm128	%xmm11, 0x20+0x30
	movaps	%xmm12,0x40(%rax)
.seh_savexmm128	%xmm12, 0x20+0x40
	movaps	%xmm13,0x50(%rax)
.seh_savexmm128	%xmm13, 0x20+0x50
	movaps	%xmm14,0x60(%rax)
.seh_savexmm128	%xmm14, 0x20+0x60
	movaps	%xmm15,0x70(%rax)
.seh_savexmm128	%xmm15, 0x20+0x70
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Xi		# load $Xi
	lea		.L0x1c2_polynomial(%rip),%r10
	lea		0x40($Htbl),$Htbl	# size optimization
	vmovdqu		.Lbswap_mask(%rip),$bswap
	vpshufb		$bswap,$Xi,$Xi
	cmp		\$0x80,$len
	jb		.Lshort_avx
	sub		\$0x80,$len

	vmovdqu		0x70($inp),$Ii		# I[7]
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpshufb		$bswap,$Ii,$Ii
	vmovdqu		0x20-0x40($Htbl),$HK

	vpunpckhqdq	$Ii,$Ii,$T2
	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Ii,$T2,$T2
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x50($inp),$Ii		# I[5]
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpxor		$Ii,$T2,$T2
	vmovdqu		0x40($inp),$Ij		# I[4]
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK

	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpxor		$Xmi,$Zmi,$Zmi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		($inp),$Ij		# I[0]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi

	lea		0x80($inp),$inp
	cmp		\$0x80,$len
	jb		.Ltail_avx

	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
	sub		\$0x80,$len
	jmp		.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x70($inp),$Ii		# I[7]
	vpxor		$Xlo,$Zlo,$Zlo
	vpxor		$Ij,$T1,$T1
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Tred
	vmovdqu		0x20-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Zlo,$Xi,$Xi		# collect result
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vxorps		$Zhi,$Xo,$Xo
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Zmi,$Tred,$Tred
	vxorps		$Ij,$T1,$T1

	vmovdqu		0x50($inp),$Ii		# I[5]
	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Xo,$Tred,$Tred
	vpslldq		\$8,$Tred,$T2
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vpsrldq		\$8,$Tred,$Tred
	vpxor		$T2,$Xi,$Xi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ii
	vxorps		$Tred,$Xo,$Xo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x40($inp),$Ij		# I[4]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vxorps		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi
	vxorps		$Tred,$Xi,$Xi

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vxorps		$Xo,$Tred,$Tred
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		($inp),$Ij		# I[0]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Tred,$Ij,$Ij
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi

	lea		0x80($inp),$inp
	sub		\$0x80,$len
	jnc		.Loop8x_avx

	add		\$0x80,$len
	jmp		.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu		-0x10($inp,$len),$Ii	# very last word
	lea		($inp,$len),$inp
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vmovdqu		0x20-0x40($Htbl),$HK
	vpshufb		$bswap,$Ii,$Ij

	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
	vmovdqa		$Xhi,$Zhi		# $Zhi and
	vmovdqa		$Xmi,$Zmi		# $Zmi
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x20($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x30($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x50-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x40($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x50($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x80-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x60($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x70($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovq		0xb8-0x40($Htbl),$HK
	sub		\$0x10,$len
	jmp		.Ltail_avx

.align	32
.Ltail_avx:
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu		(%r10),$Tred

	vpxor		$Xlo,$Zlo,$Xi
	vpxor		$Xhi,$Zhi,$Xo
	vpxor		$Xmi,$Zmi,$Zmi

	vpxor		$Xi,$Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor		$Xo,$Zmi,$Zmi
	vpslldq		\$8,$Zmi,$T2
	vpsrldq		\$8,$Zmi,$Zmi
	vpxor		$T2,$Xi,$Xi
	vpxor		$Zmi,$Xo,$Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$Xo,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	cmp		\$0,$len
	jne		.Lshort_avx

	vpshufb		$bswap,$Xi,$Xi
	vmovdqu		$Xi,($Xip)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
$code.=<<___;
	jmp	.L_ghash_clmul
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}

$code.=<<___;
.section	.rodata
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
	.long	7,0,7,0
.align	64

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.text
___

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";