#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that
# it uses a 256-byte per-key table [+128 bytes shared table]. The GHASH
# function also features a so-called "528B" variant utilizing an additional
# 256+16 bytes of per-key storage [+512 bytes shared table].
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	comparison is not completely fair, because C results are
#	for the vanilla "256B" implementation, while assembler results
#	are for "528B";-)
# (**)	it's a mystery [to me] why the Core2 result is not the same as
#	Opteron's;

# May 2010
#
# Add a PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter, ghash-x86.pl argues that it makes less sense to increase
# the aggregate factor there. Then why increase it here? The critical
# path consists of 3 independent pclmulqdq instructions, Karatsuba post-
# processing and reduction. "On top" of this we lay down aggregated
# multiplication operations, triplets of independent pclmulqdq's. As
# the issue rate for pclmulqdq is limited, it makes less sense to
# aggregate more multiplications than it takes to perform the remaining
# non-multiplication operations. 2x is the near-optimal coefficient for
# contemporary Intel CPUs (hence the modest improvement coefficient),
# but not for Bulldozer, because its logical SIMD operations are twice
# as slow as Intel's, so its critical path is longer. A CPU with a
# higher pclmulqdq issue rate would also benefit from a higher
# aggregate factor...
#
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%)  (if system doesn't support AVX)
# Broadwell	0.45(+110%) (if system doesn't support AVX)
# Skylake	0.44(+110%) (if system doesn't support AVX)
# Bulldozer	1.49(+27%)
# Silvermont	2.88(+13%)
# Knights L	2.12(-)     (if system doesn't support AVX)
# Goldmont	1.08(+24%)

# March 2013
#
# ... 8x aggregate factor AVX code path is using reduction algorithm
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to the above-mentioned version. But thanks
# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we know that
# it performs at 0.41 cycles per byte on a Haswell processor, at
# 0.29 on Broadwell, and at 0.36 on Skylake.
#
# Knights Landing achieves 1.09 cpb.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.
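
# For reference, the operation implemented by all routines below is the
# textbook GHASH from the GCM spec: the hash state Xi absorbs 16-byte blocks
# as Xi = (Xi ^ I)*H in GF(2^128) with the bit-reflected polynomial
# x^128+x^7+x^2+x+1.  The following bit-for-bit model is only a hedged,
# illustrative sketch (the names gf128_mul_ref and ghash_ref are ours and
# nothing below is used by the generated code); it follows the big-endian
# block convention that the pshufb/.Lbswap_mask pairs establish.

use Math::BigInt;

sub gf128_mul_ref {		# Z = X*H in GF(2^128), GCM bit order
my ($X,$H) = @_;		# Math::BigInt, first byte of the block most significant
my $R = Math::BigInt->from_hex("e1000000000000000000000000000000");
my $Z = Math::BigInt->bzero();
my $V = $H->copy();
    for my $i (0..127) {
	$Z->bxor($V) if $X->copy()->brsft(127-$i)->band(1)->is_one();
	my $carry = $V->copy()->band(1)->is_one();
	$V->brsft(1);
	$V->bxor($R) if $carry;
    }
    return $Z;
}

sub ghash_ref {			# what gcm_ghash_* compute, one 16-byte block at a time
my ($Xi,$H,$inp) = @_;		# $Xi,$H are Math::BigInt; length($inp) is a multiple of 16
    for my $blk (unpack("(a16)*",$inp)) {
	my $I = Math::BigInt->from_hex(unpack("H32",$blk));
	$Xi = gf128_mul_ref($Xi->copy()->bxor($I),$H);
    }
    return $Xi;
}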

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be
# computed incorrectly.
#
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 1;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$do4xaggr=1;


$code=<<___;
.text
___


######################################################################
# PCLMULQDQ version.

@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

sub clmul64x64_T2 {	# minimal register pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pshufd		\$0b01001110,$Hkey,$T2
	pxor		$Xi,$T1			#
	pxor		$Hkey,$T2
___
} else {
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1			#
___
}
$code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$HK,$T1		#######
	pxor		$Xi,$T1			#
	pxor		$Xhi,$T1		#

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
}
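
# clmul64x64_T2 produces the 256-bit carry-less product $Xhi:$Xi = $Xi*$Hkey
# with three pclmulqdq instead of four via the Karatsuba identity: writing
# X = Xh:Xl and H = Hh:Hl,
#
#	X*H = (Xh*Hh)<<128 ^ ((Xh^Xl)*(Hh^Hl) ^ Xh*Hh ^ Xl*Hl)<<64 ^ Xl*Hl
#
# The pshufd/pxor pairs (or the caller-supplied $HK, the cached "Karatsuba
# salt") prepare the pre-xored halves Xh^Xl and Hh^Hl, and the trailing
# psrldq/pslldq pair splits the middle term across the two result halves.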

sub reduction_alg9 {	# 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;

$code.=<<___;
	# 1st phase
	movdqa		$Xi,$T2			#
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#

	# 2nd phase
	movdqa		$Xi,$T2
	psrlq		\$1,$Xi
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
___
}
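
# reduction_alg9 folds the 256-bit carry-less product $Xhi:$Xi back to 128
# bits modulo the GCM polynomial x^128+x^7+x^2+x+1, in its bit-reflected
# form.  It works in two shift-and-xor phases: the first is equivalent to
# per-lane left shifts by 63, 62 and 57 (i.e. 64-1, 64-2, 64-7), the second
# to right shifts by 1, 2 and 7, with the pslldq/psrldq byte shifts carrying
# bits across the 64-bit lanes.  reduction_avx further below is the same
# algorithm written with three-operand AVX instructions.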

{ my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
.L_init_clmul:
___
$code.=<<___ if ($win64);
	sub	\$0x18,%rsp
.seh_allocstack	0x18
	movaps	%xmm6,(%rsp)
.seh_savexmm128	%xmm6, 0
___
$code.=<<___;
	movdqu		($Xip),$Hkey
	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa		$Hkey,$T1
	psllq		\$1,$Hkey
	pxor		$T3,$T3			#
	psrlq		\$63,$T1
	pcmpgtd		$T2,$T3			# broadcast carry bit
	pslldq		\$8,$T1
	por		$T1,$Hkey		# H<<=1

	# magic reduction
	pand		.L0x1c2_polynomial(%rip),$T3
	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd		\$0b01001110,$Hkey,$HK
	movdqa		$Hkey,$Xi
	pxor		$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$Hkey,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$Hkey,$T1		# Karatsuba pre-processing
	movdqu		$Hkey,0x00($Htbl)	# save H
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x10($Htbl)		# save H^2
	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa		$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$T3,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$T3,$T1			# Karatsuba pre-processing
	movdqu		$T3,0x30($Htbl)		# save H^3
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x40($Htbl)		# save H^4
	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_init_clmul,.-gcm_init_clmul
___
}
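
# gcm_init_clmul($Htbl,$Xip) computes the hash key powers once per key and
# leaves $Htbl with one 16-byte slot each: 0x00 H, 0x10 H^2, 0x20 the
# Karatsuba "salt" (H.lo^H.hi in the low qword, H^2.lo^H^2.hi in the high
# qword) and, with $do4xaggr, 0x30 H^3, 0x40 H^4, 0x50 their salt.
# gcm_gmult_clmul and gcm_ghash_clmul below rely on exactly this layout.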

{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
.cfi_startproc
	_CET_ENDBR
.L_gmult_clmul:
	movdqu		($Xip),$Xi
	movdqa		.Lbswap_mask(%rip),$T3
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$T2
	pshufb		$T3,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
	# experimental alternative. the special thing about it is that
	# there is no dependency between the two multiplications...
	mov		\$`0xE1<<1`,%eax
	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
	mov		\$0x07,%r11d
	movq		%rax,$T1
	movq		%r10,$T2
	movq		%r11,$T3		# borrow $T3
	pand		$Xi,$T3
	pshufb		$T3,$T2			# ($Xi&7)·0xE0
	movq		%rax,$T3
	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
	pxor		$Xi,$T2
	pslldq		\$15,$T2
	paddd		$T2,$T2			# <<(64+56+1)
	pxor		$T2,$Xi
	pclmulqdq	\$0x01,$T3,$Xi
	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
	psrldq		\$1,$T1
	pxor		$T1,$Xhi
	pslldq		\$7,$Xi
	pxor		$Xhi,$Xi
___
$code.=<<___;
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
	ret
.cfi_endproc
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}

{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));

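# gcm_ghash_clmul($Xip,$Htbl,$inp,$len) folds $len bytes (assumed to be a
# multiple of 16) from $inp into the 16-byte hash state at $Xip, i.e. it
# computes Xi = (Xi^I)*H for every block I using the table prepared by
# gcm_init_clmul.  Blocks are consumed four at a time with H^4..H (see the
# Xi+4 formula below) and a single deferred reduction per group, then two
# at a time, then one.
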
$code.=<<___;
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,\@abi-omnipotent
.align	32
gcm_ghash_clmul:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
.L_ghash_clmul:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
	lea	-0x20(%rax),%rsp
.seh_allocstack	0x20+0x88
	movaps	%xmm6,-0x20(%rax)
.seh_savexmm128	%xmm6, 0x20-0x20
	movaps	%xmm7,-0x10(%rax)
.seh_savexmm128	%xmm7, 0x20-0x10
	movaps	%xmm8,0(%rax)
.seh_savexmm128	%xmm8, 0x20+0
	movaps	%xmm9,0x10(%rax)
.seh_savexmm128	%xmm9, 0x20+0x10
	movaps	%xmm10,0x20(%rax)
.seh_savexmm128	%xmm10, 0x20+0x20
	movaps	%xmm11,0x30(%rax)
.seh_savexmm128	%xmm11, 0x20+0x30
	movaps	%xmm12,0x40(%rax)
.seh_savexmm128	%xmm12, 0x20+0x40
	movaps	%xmm13,0x50(%rax)
.seh_savexmm128	%xmm13, 0x20+0x50
	movaps	%xmm14,0x60(%rax)
.seh_savexmm128	%xmm14, 0x20+0x60
	movaps	%xmm15,0x70(%rax)
.seh_savexmm128	%xmm15, 0x20+0x70
___
$code.=<<___;
	movdqa		.Lbswap_mask(%rip),$T3

	movdqu		($Xip),$Xi
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$HK
	pshufb		$T3,$Xi

	sub		\$0x10,$len
	jz		.Lodd_tail

	movdqu		0x10($Htbl),$Hkey2
___
if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));

$code.=<<___;
	cmp		\$0x30,$len
	jb		.Lskip4x

	sub		\$0x30,$len
	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
	movdqu		0x30($Htbl),$Hkey3
	movdqu		0x40($Htbl),$Hkey4

	#######
	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
	#
	movdqu		0x30($inp),$Xln
	movdqu		0x20($inp),$Xl
	pshufb		$T3,$Xln
	pshufb		$T3,$Xl
	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey2,$Xl
	pclmulqdq	\$0x11,$Hkey2,$Xh
	pclmulqdq	\$0x10,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	xorps		$Xm,$Xmn

	movdqu		0x10($inp),$Xl
	movdqu		0($inp),$T1
	pshufb		$T3,$Xl
	pshufb		$T3,$T1
	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T1,$Xi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	movdqa		$Xi,$Xhi
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1
	pclmulqdq	\$0x11,$Hkey3,$Xh
	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn

	lea		0x40($inp),$inp
	sub		\$0x40,$len
	jc		.Ltail4x

	jmp		.Lmod4_loop
.align	32
.Lmod4_loop:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	xorps		$Xm,$Xmn
	movdqu		0x30($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	xorps		$Xln,$Xi
	movdqu		0x20($inp),$Xln
	movdqa		$Xl,$Xh
	pclmulqdq	\$0x10,$HK,$T1
	pshufd		\$0b01001110,$Xl,$Xm
	xorps		$Xhn,$Xhi
	pxor		$Xl,$Xm
	pshufb		$T3,$Xln
	movups		0x20($Htbl),$HK
	xorps		$Xmn,$T1
	pclmulqdq	\$0x00,$Hkey,$Xl
	pshufd		\$0b01001110,$Xln,$Xmn

	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	movdqa		$Xln,$Xhn
	pxor		$Xhi,$T1		#
	pxor		$Xln,$Xmn
	movdqa		$T1,$T2			#
	pclmulqdq	\$0x11,$Hkey,$Xh
	pslldq		\$8,$T1
	psrldq		\$8,$T2			#
	pxor		$T1,$Xi
	movdqa		.L7_mask(%rip),$T1
	pxor		$T2,$Xhi		#
	movq		%rax,$T2

	pand		$Xi,$T1			# 1st phase
	pshufb		$T1,$T2			#
	pxor		$Xi,$T2			#
	pclmulqdq	\$0x00,$HK,$Xm
	psllq		\$57,$T2		#
	movdqa		$T2,$T1			#
	pslldq		\$8,$T2
	pclmulqdq	\$0x00,$Hkey2,$Xln
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#
	movdqu		0($inp),$T1

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhn
	xorps		$Xl,$Xln
	movdqu		0x10($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x10,$HK,$Xmn
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	pshufb		$T3,$T1
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi

	movdqa		$Xl,$Xh
	pxor		$Xm,$Xmn
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T2,$Xi			#
	pxor		$T1,$Xhi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
	movdqa		$Xi,$Xhi
	pclmulqdq	\$0x11,$Hkey3,$Xh
	xorps		$Xl,$Xln
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1

	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xh,$Xhn

	lea		0x40($inp),$inp
	sub		\$0x40,$len
	jnc		.Lmod4_loop

.Ltail4x:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	pclmulqdq	\$0x10,$HK,$T1
	xorps		$Xm,$Xmn
	xorps		$Xln,$Xi
	xorps		$Xhn,$Xhi
	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
	pxor		$Xmn,$T1

	pxor		$Xhi,$T1		#
	pxor		$Xi,$Xhi

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
	&reduction_alg9($Xhi,$Xi);
$code.=<<___;
	add		\$0x40,$len
	jz		.Ldone
	movdqu		0x20($Htbl),$HK
	sub		\$0x10,$len
	jz		.Lodd_tail
.Lskip4x:
___
}
$code.=<<___;
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
	movdqu		($inp),$T1		# Ii
	movdqu		16($inp),$Xln		# Ii+1
	pshufb		$T3,$T1
	pshufb		$T3,$Xln
	pxor		$T1,$Xi			# Ii+Xi

	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	lea		32($inp),$inp		# i+=2
	nop
	sub		\$0x20,$len
	jbe		.Leven_tail
	nop
	jmp		.Lmod_loop

.align	32
.Lmod_loop:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	movdqu		($inp),$T2		# Ii
	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	pshufb		$T3,$T2
	movdqu		16($inp),$Xln		# Ii+1

	pxor		$Xhi,$T1
	pxor		$T2,$Xhi		# "Ii+Xi", consume early
	pxor		$T1,$Xmn
	pshufb		$T3,$Xln
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#

	movdqa		$Xln,$Xhn		#

	movdqa		$Xi,$T2			# 1st phase
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pshufd		\$0b01001110,$Xhn,$Xmn
	pxor		$T1,$Xhi		#
	pxor		$Xhn,$Xmn		#

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	lea		32($inp),$inp
	psrlq		\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	pxor		$Xhi,$Xi		#

	sub		\$0x20,$len
	ja		.Lmod_loop

.Leven_tail:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	pxor		$Xi,$T1
	pxor		$Xhi,$T1
	pxor		$T1,$Xmn
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#
___
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	test		$len,$len
	jnz		.Ldone

.Lodd_tail:
	movdqu		($inp),$T1		# Ii
	pshufb		$T3,$T1
	pxor		$T1,$Xi			# Ii+Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
.Ldone:
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}
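
# The *_avx entry points below provide the 8x-aggregated AVX code path from
# the March 2013 note above.  gcm_init_avx and gcm_ghash_avx fall back by
# tail-jumping to the CLMUL implementations when the file is generated with
# $avx set to 0; gcm_gmult_avx always defers to gcm_gmult_clmul.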

$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
.cfi_startproc
	_CET_ENDBR
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
my $HK="%xmm6";

$code.=<<___ if ($win64);
.seh_startproc
	sub	\$0x18,%rsp
.seh_allocstack	0x18
	movaps	%xmm6,(%rsp)
.seh_savexmm128	%xmm6, 0
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Hkey
	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq		\$63,$Hkey,$T1
	vpsllq		\$1,$Hkey,$Hkey
	vpxor		$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
	vpslldq		\$8,$T1,$T1
	vpor		$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa		$Hkey,$Xi
	vpxor		$Hkey,$HK,$HK
	mov		\$4,%r10		# up to H^8
	jmp		.Linit_start_avx
___

sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
} else {
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
}
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq		\$8,$T1,$T2		#
	vpsrldq		\$8,$T1,$T1
	vpxor		$T2,$Xi,$Xi		#
	vpxor		$T1,$Xhi,$Xhi
___
}

sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq		\$57,$Xi,$T1		# 1st phase
	vpsllq		\$62,$Xi,$T2
	vpxor		$T1,$T2,$T2		#
	vpsllq		\$63,$Xi,$T1
	vpxor		$T1,$T2,$T2		#
	vpslldq		\$8,$T2,$T1		#
	vpsrldq		\$8,$T2,$T2
	vpxor		$T1,$Xi,$Xi		#
	vpxor		$T2,$Xhi,$Xhi

	vpsrlq		\$1,$Xi,$T2		# 2nd phase
	vpxor		$Xi,$Xhi,$Xhi
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$5,$T2,$T2
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$1,$Xi,$Xi		#
	vpxor		$Xhi,$Xi,$Xi		#
___
}
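
# Four passes through .Linit_start_avx/.Linit_loop_avx below leave $Htbl with
# H^1, H^2, salt(1,2), H^3, H^4, salt(3,4), ..., H^7, H^8, salt(7,8), one
# 16-byte slot each (offsets 0x00-0xb0); the very last salt is written
# flipped.  gcm_ghash_avx addresses the same table relative to $Htbl+0x40 as
# a size optimization.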
$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa		$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
	vpshufd		\$0b01001110,$T3,$T1
	vpshufd		\$0b01001110,$Xi,$T2
	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea		0x30($Htbl),$Htbl
	sub		\$1,%r10
	jnz		.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu		$T3,-0x10($Htbl)

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
___
$code.=<<___;
	ret
.seh_endproc
.cfi_endproc
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
$code.=<<___;
	jmp	.L_init_clmul
.size	gcm_init_avx,.-gcm_init_avx
___
}

$code.=<<___;
.globl	gcm_gmult_avx
.type	gcm_gmult_avx,\@abi-omnipotent
.align	32
gcm_gmult_avx:
.cfi_startproc
	_CET_ENDBR
	jmp	.L_gmult_clmul
.cfi_endproc
.size	gcm_gmult_avx,.-gcm_gmult_avx
___

$code.=<<___;
.globl	gcm_ghash_avx
.type	gcm_ghash_avx,\@abi-omnipotent
.align	32
gcm_ghash_avx:
.cfi_startproc
	_CET_ENDBR
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
my ($Xlo,$Xhi,$Xmi,
    $Zlo,$Zhi,$Zmi,
    $Hkey,$HK,$T1,$T2,
    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));

$code.=<<___ if ($win64);
.seh_startproc
	lea	-0x88(%rsp),%rax
	lea	-0x20(%rax),%rsp
.seh_allocstack	0x20+0x88
	movaps	%xmm6,-0x20(%rax)
.seh_savexmm128	%xmm6, 0x20-0x20
	movaps	%xmm7,-0x10(%rax)
.seh_savexmm128	%xmm7, 0x20-0x10
	movaps	%xmm8,0(%rax)
.seh_savexmm128	%xmm8, 0x20+0
	movaps	%xmm9,0x10(%rax)
.seh_savexmm128	%xmm9, 0x20+0x10
	movaps	%xmm10,0x20(%rax)
.seh_savexmm128	%xmm10, 0x20+0x20
	movaps	%xmm11,0x30(%rax)
.seh_savexmm128	%xmm11, 0x20+0x30
	movaps	%xmm12,0x40(%rax)
.seh_savexmm128	%xmm12, 0x20+0x40
	movaps	%xmm13,0x50(%rax)
.seh_savexmm128	%xmm13, 0x20+0x50
	movaps	%xmm14,0x60(%rax)
.seh_savexmm128	%xmm14, 0x20+0x60
	movaps	%xmm15,0x70(%rax)
.seh_savexmm128	%xmm15, 0x20+0x70
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Xi		# load $Xi
	lea		.L0x1c2_polynomial(%rip),%r10
	lea		0x40($Htbl),$Htbl	# size optimization
	vmovdqu		.Lbswap_mask(%rip),$bswap
	vpshufb		$bswap,$Xi,$Xi
	cmp		\$0x80,$len
	jb		.Lshort_avx
	sub		\$0x80,$len

	vmovdqu		0x70($inp),$Ii		# I[7]
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpshufb		$bswap,$Ii,$Ii
	vmovdqu		0x20-0x40($Htbl),$HK

	vpunpckhqdq	$Ii,$Ii,$T2
	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Ii,$T2,$T2
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x50($inp),$Ii		# I[5]
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpxor		$Ii,$T2,$T2
	vmovdqu		0x40($inp),$Ij		# I[4]
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK

	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpxor		$Xmi,$Zmi,$Zmi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		($inp),$Ij		# I[0]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi

	lea		0x80($inp),$inp
	cmp		\$0x80,$len
	jb		.Ltail_avx

	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
	sub		\$0x80,$len
	jmp		.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x70($inp),$Ii		# I[7]
	vpxor		$Xlo,$Zlo,$Zlo
	vpxor		$Ij,$T1,$T1
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Tred
	vmovdqu		0x20-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Zlo,$Xi,$Xi		# collect result
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vxorps		$Zhi,$Xo,$Xo
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor		$Zmi,$Tred,$Tred
	vxorps		$Ij,$T1,$T1

	vmovdqu		0x50($inp),$Ii		# I[5]
	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Xo,$Tred,$Tred
	vpslldq		\$8,$Tred,$T2
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vpsrldq		\$8,$Tred,$Tred
	vpxor		$T2, $Xi, $Xi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ii
	vxorps		$Tred,$Xo, $Xo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x40($inp),$Ij		# I[4]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vxorps		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi
	vxorps		$Tred,$Xi,$Xi

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vxorps		$Xo,$Tred,$Tred
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		($inp),$Ij		# I[0]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Tred,$Ij,$Ij
	vpclmulqdq	\$0x10,$HK, $T2,$Xmi
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi

	lea		0x80($inp),$inp
	sub		\$0x80,$len
	jnc		.Loop8x_avx

	add		\$0x80,$len
	jmp		.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu		-0x10($inp,$len),$Ii	# very last word
	lea		($inp,$len),$inp
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vmovdqu		0x20-0x40($Htbl),$HK
	vpshufb		$bswap,$Ii,$Ij

	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
	vmovdqa		$Xhi,$Zhi		# $Zhi and
	vmovdqa		$Xmi,$Zmi		# $Zmi
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x20($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x30($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x50-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x40($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x50($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x80-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x60($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x70($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovq		0xb8-0x40($Htbl),$HK
	sub		\$0x10,$len
	jmp		.Ltail_avx

.align	32
.Ltail_avx:
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu		(%r10),$Tred

	vpxor		$Xlo,$Zlo,$Xi
	vpxor		$Xhi,$Zhi,$Xo
	vpxor		$Xmi,$Zmi,$Zmi

	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor		$Xo, $Zmi,$Zmi
	vpslldq		\$8, $Zmi,$T2
	vpsrldq		\$8, $Zmi,$Zmi
	vpxor		$T2, $Xi, $Xi
	vpxor		$Zmi,$Xo, $Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$Xo,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	cmp		\$0,$len
	jne		.Lshort_avx

	vpshufb		$bswap,$Xi,$Xi
	vmovdqu		$Xi,($Xip)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
$code.=<<___;
	jmp	.L_ghash_clmul
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}

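# Constant pool: .Lbswap_mask reverses the byte order of each block on load
# and store, .L0x1c2_polynomial is the reflected GCM polynomial constant used
# by the "<<1 twist" in the init routines and, via %r10, by the vpclmulqdq
# reduction in gcm_ghash_avx, and .L7_mask feeds the table-driven first
# reduction phase of the 4x CLMUL loop.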
$code.=<<___;
.section .rodata
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
	.long	7,0,7,0
.align	64

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.text
___

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";