#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that
# it uses a 256-byte per-key table [+128 bytes shared table]. The GHASH
# function features a so-called "528B" variant utilizing an additional
# 256+16 bytes of per-key storage [+512 bytes shared table].
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	comparison is not completely fair, because C results are
#	for the vanilla "256B" implementation, while assembler results
#	are for "528B";-)
# (**)	it's a mystery [to me] why the Core2 result is not the same as
#	for Opteron;

# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter, ghash-x86.pl discusses why it makes less sense to
# increase the aggregate factor. Then why increase it here? The critical
# path consists of 3 independent pclmulqdq instructions, Karatsuba
# post-processing and reduction. "On top" of this we lay down aggregated
# multiplication operations, triplets of independent pclmulqdq's. As
# the issue rate for pclmulqdq is limited, it makes less sense to
# aggregate more multiplications than it takes to perform the remaining
# non-multiplication operations. 2x is a near-optimal coefficient for
# contemporary Intel CPUs (hence the modest improvement), but not for
# Bulldozer, because its logical SIMD operations are twice as slow as
# Intel's, so the critical path is longer. A CPU with a higher pclmulqdq
# issue rate would also benefit from a higher aggregate factor...
#
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%) (if system doesn't support AVX)
# Broadwell	0.45(+110%)(if system doesn't support AVX)
# Skylake	0.44(+110%)(if system doesn't support AVX)
# Bulldozer	1.49(+27%)
# Silvermont	2.88(+13%)
# Knights L	2.12(-)    (if system doesn't support AVX)
# Goldmont	1.08(+24%)
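
# A rough illustration of what the "aggregate factor" buys, in the
# notation used further down in this file: with 4x aggregation the main
# loop folds four input blocks into the running hash with a single
# reduction,
#
#	Xi+4 = [ H^4*(Ii+Xi) + H^3*Ii+1 + H^2*Ii+2 + H*Ii+3 ] mod P
#
# so one reduction is amortized over four multiplications. This is why
# gcm_init_clmul precomputes H^2..H^4 (and gcm_init_avx goes up to H^8
# for the 8x AVX path).
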
# March 2013
#
# ... 8x aggregate factor AVX code path is using reduction algorithm
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to the above-mentioned version. But thanks
# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we know that
# it performs at 0.41 cycles per byte on a Haswell processor, at
# 0.29 on Broadwell, and at 0.36 on Skylake.
#
# Knights Landing achieves 1.09 cpb.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be
# computed incorrectly.
#
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 1;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$do4xaggr=1;


$code=<<___;
.text
.extern	OPENSSL_ia32cap_P
___


######################################################################
# PCLMULQDQ version.

@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

sub clmul64x64_T2 {	# minimal register pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pshufd		\$0b01001110,$Hkey,$T2
	pxor		$Xi,$T1			#
	pxor		$Hkey,$T2
___
} else {
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1			#
___
}
$code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$HK,$T1		#######
	pxor		$Xi,$T1			#
	pxor		$Xhi,$T1		#

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
}

sub reduction_alg9 {	# 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;

$code.=<<___;
	# 1st phase
	movdqa		$Xi,$T2			#
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#

	# 2nd phase
	movdqa		$Xi,$T2
	psrlq		\$1,$Xi
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
___
}
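
# For reference only: a rough bit-at-a-time model of the GF(2^128)
# multiplication that the routines below implement, following the
# textbook GCM formulation. The helper name and its use of plain
# 16-byte strings (in the natural block byte order, i.e. before the
# .Lbswap_mask shuffle the assembler code applies) are assumptions made
# for this sketch; the sub is never called and only documents the
# intended semantics.
sub gf128_mul_ref {
my ($X, $Y) = @_;			# 16-byte GHASH blocks
my @x = unpack("C16", $X);
my @y = unpack("C16", $Y);
my @z = (0) x 16;			# Z = 0
my @v = @y;				# V = Y

    for my $i (0 .. 127) {
	# bit i of X, counting from the leftmost bit of the first byte
	if (($x[$i >> 3] >> (7 - ($i & 7))) & 1) {
	    $z[$_] ^= $v[$_] for (0 .. 15);	# Z ^= V
	}
	my $lsb = $v[15] & 1;			# rightmost bit of V
	my $carry = 0;				# V >>= 1
	for my $j (0 .. 15) {
	    my $b = $v[$j];
	    $v[$j] = ($b >> 1) | ($carry << 7);
	    $carry = $b & 1;
	}
	$v[0] ^= 0xe1 if ($lsb);		# fold in 0xE1 || 0^120
    }
    return pack("C16", @z);
}
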
{ my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.cfi_startproc
.L_init_clmul:
___
$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	movdqu		($Xip),$Hkey
	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa		$Hkey,$T1
	psllq		\$1,$Hkey
	pxor		$T3,$T3			#
	psrlq		\$63,$T1
	pcmpgtd		$T2,$T3			# broadcast carry bit
	pslldq		\$8,$T1
	por		$T1,$Hkey		# H<<=1

	# magic reduction
	pand		.L0x1c2_polynomial(%rip),$T3
	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd		\$0b01001110,$Hkey,$HK
	movdqa		$Hkey,$Xi
	pxor		$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$Hkey,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$Hkey,$T1		# Karatsuba pre-processing
	movdqu		$Hkey,0x00($Htbl)	# save H
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x10($Htbl)		# save H^2
	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa		$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$T3,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$T3,$T1			# Karatsuba pre-processing
	movdqu		$T3,0x30($Htbl)		# save H^3
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x40($Htbl)		# save H^4
	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
$code.=<<___ if ($win64);
	movaps		(%rsp),%xmm6
	lea		0x18(%rsp),%rsp
.LSEH_end_gcm_init_clmul:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_init_clmul,.-gcm_init_clmul
___
}
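
# A sketch of the per-key table just written by gcm_init_clmul, as it is
# consumed by gcm_gmult_clmul and gcm_ghash_clmul below (offsets into Htbl):
#
#	0x00	H
#	0x10	H^2
#	0x20	Karatsuba "salt" (hi^lo halves) for H and H^2
#	0x30	H^3	(only when $do4xaggr is set)
#	0x40	H^4	(only when $do4xaggr is set)
#	0x50	Karatsuba "salt" for H^3 and H^4
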
{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
.cfi_startproc
.L_gmult_clmul:
	movdqu		($Xip),$Xi
	movdqa		.Lbswap_mask(%rip),$T3
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$T2
	pshufb		$T3,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
	# experimental alternative. the special thing about it is that there
	# is no dependency between the two multiplications...
	mov		\$`0xE1<<1`,%eax
	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
	mov		\$0x07,%r11d
	movq		%rax,$T1
	movq		%r10,$T2
	movq		%r11,$T3		# borrow $T3
	pand		$Xi,$T3
	pshufb		$T3,$T2			# ($Xi&7)·0xE0
	movq		%rax,$T3
	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
	pxor		$Xi,$T2
	pslldq		\$15,$T2
	paddd		$T2,$T2			# <<(64+56+1)
	pxor		$T2,$Xi
	pclmulqdq	\$0x01,$T3,$Xi
	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
	psrldq		\$1,$T1
	pxor		$T1,$Xhi
	pslldq		\$7,$Xi
	pxor		$Xhi,$Xi
___
$code.=<<___;
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
	ret
.cfi_endproc
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}

{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));

$code.=<<___;
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,\@abi-omnipotent
.align	32
gcm_ghash_clmul:
.cfi_startproc
.L_ghash_clmul:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
	movdqa		.Lbswap_mask(%rip),$T3

	movdqu		($Xip),$Xi
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$HK
	pshufb		$T3,$Xi

	sub		\$0x10,$len
	jz		.Lodd_tail

	movdqu		0x10($Htbl),$Hkey2
___
if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));

$code.=<<___;
	leaq		OPENSSL_ia32cap_P(%rip),%rax
	mov		4(%rax),%eax
	cmp		\$0x30,$len
	jb		.Lskip4x

	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
	je		.Lskip4x

	sub		\$0x30,$len
	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
	movdqu		0x30($Htbl),$Hkey3
	movdqu		0x40($Htbl),$Hkey4

	#######
	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
	#
	movdqu		0x30($inp),$Xln
	movdqu		0x20($inp),$Xl
	pshufb		$T3,$Xln
	pshufb		$T3,$Xl
	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey2,$Xl
	pclmulqdq	\$0x11,$Hkey2,$Xh
	pclmulqdq	\$0x10,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	xorps		$Xm,$Xmn

	movdqu		0x10($inp),$Xl
	movdqu		0($inp),$T1
	pshufb		$T3,$Xl
	pshufb		$T3,$T1
	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T1,$Xi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	movdqa		$Xi,$Xhi
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1
	pclmulqdq	\$0x11,$Hkey3,$Xh
	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn

	lea		0x40($inp),$inp
	sub		\$0x40,$len
	jc		.Ltail4x
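
	# Each .Lmod4_loop iteration below folds four more input blocks:
	# the triplets of independent pclmulqdq's for the new blocks are
	# interleaved with the two-phase reduction of the previous
	# four-block result, so the reduction cost is hidden behind the
	# multiplications (the 4x aggregation described in the December
	# 2012 note at the top of this file).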
	jmp		.Lmod4_loop
.align	32
.Lmod4_loop:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	xorps		$Xm,$Xmn
	movdqu		0x30($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	xorps		$Xln,$Xi
	movdqu		0x20($inp),$Xln
	movdqa		$Xl,$Xh
	pclmulqdq	\$0x10,$HK,$T1
	pshufd		\$0b01001110,$Xl,$Xm
	xorps		$Xhn,$Xhi
	pxor		$Xl,$Xm
	pshufb		$T3,$Xln
	movups		0x20($Htbl),$HK
	xorps		$Xmn,$T1
	pclmulqdq	\$0x00,$Hkey,$Xl
	pshufd		\$0b01001110,$Xln,$Xmn

	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	movdqa		$Xln,$Xhn
	pxor		$Xhi,$T1		#
	pxor		$Xln,$Xmn
	movdqa		$T1,$T2			#
	pclmulqdq	\$0x11,$Hkey,$Xh
	pslldq		\$8,$T1
	psrldq		\$8,$T2			#
	pxor		$T1,$Xi
	movdqa		.L7_mask(%rip),$T1
	pxor		$T2,$Xhi		#
	movq		%rax,$T2

	pand		$Xi,$T1			# 1st phase
	pshufb		$T1,$T2			#
	pxor		$Xi,$T2			#
	pclmulqdq	\$0x00,$HK,$Xm
	psllq		\$57,$T2		#
	movdqa		$T2,$T1			#
	pslldq		\$8,$T2
	pclmulqdq	\$0x00,$Hkey2,$Xln
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#
	movdqu		0($inp),$T1

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhn
	xorps		$Xl,$Xln
	movdqu		0x10($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x10,$HK,$Xmn
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	pshufb		$T3,$T1
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi

	movdqa		$Xl,$Xh
	pxor		$Xm,$Xmn
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T2,$Xi			#
	pxor		$T1,$Xhi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
	movdqa		$Xi,$Xhi
	pclmulqdq	\$0x11,$Hkey3,$Xh
	xorps		$Xl,$Xln
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1

	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xh,$Xhn

	lea		0x40($inp),$inp
	sub		\$0x40,$len
	jnc		.Lmod4_loop

.Ltail4x:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	pclmulqdq	\$0x10,$HK,$T1
	xorps		$Xm,$Xmn
	xorps		$Xln,$Xi
	xorps		$Xhn,$Xhi
	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
	pxor		$Xmn,$T1

	pxor		$Xhi,$T1		#
	pxor		$Xi,$Xhi

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
	&reduction_alg9($Xhi,$Xi);
$code.=<<___;
	add		\$0x40,$len
	jz		.Ldone
	movdqu		0x20($Htbl),$HK
	sub		\$0x10,$len
	jz		.Lodd_tail
.Lskip4x:
___
}
$code.=<<___;
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
	movdqu		($inp),$T1		# Ii
	movdqu		16($inp),$Xln		# Ii+1
	pshufb		$T3,$T1
	pshufb		$T3,$Xln
	pxor		$T1,$Xi			# Ii+Xi

	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	lea		32($inp),$inp		# i+=2
	nop
	sub		\$0x20,$len
	jbe		.Leven_tail
	nop
	jmp		.Lmod_loop

.align	32
.Lmod_loop:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	movdqu		($inp),$T2		# Ii
	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	pshufb		$T3,$T2
	movdqu		16($inp),$Xln		# Ii+1

	pxor		$Xhi,$T1
	pxor		$T2,$Xhi		# "Ii+Xi", consume early
	pxor		$T1,$Xmn
	pshufb		$T3,$Xln
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#

	movdqa		$Xln,$Xhn		#

	movdqa		$Xi,$T2			# 1st phase
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pshufd		\$0b01001110,$Xhn,$Xmn
	pxor		$T1,$Xhi		#
	pxor		$Xhn,$Xmn		#

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	lea		32($inp),$inp
	psrlq		\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	pxor		$Xhi,$Xi		#

	sub		\$0x20,$len
	ja		.Lmod_loop

.Leven_tail:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	pxor		$Xi,$T1
	pxor		$Xhi,$T1
	pxor		$T1,$Xmn
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#
___
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	test		$len,$len
	jnz		.Ldone

.Lodd_tail:
	movdqu		($inp),$T1		# Ii
	pshufb		$T3,$T1
	pxor		$T1,$Xi			# Ii+Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
.Ldone:
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
___
$code.=<<___ if ($win64);
	movaps		(%rsp),%xmm6
	movaps		0x10(%rsp),%xmm7
	movaps		0x20(%rsp),%xmm8
	movaps		0x30(%rsp),%xmm9
	movaps		0x40(%rsp),%xmm10
	movaps		0x50(%rsp),%xmm11
	movaps		0x60(%rsp),%xmm12
	movaps		0x70(%rsp),%xmm13
	movaps		0x80(%rsp),%xmm14
	movaps		0x90(%rsp),%xmm15
	lea		0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_clmul:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}

$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
.cfi_startproc
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
my $HK="%xmm6";

$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Hkey
	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq		\$63,$Hkey,$T1
	vpsllq		\$1,$Hkey,$Hkey
	vpxor		$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
	vpslldq		\$8,$T1,$T1
	vpor		$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa		$Hkey,$Xi
	vpxor		$Hkey,$HK,$HK
	mov		\$4,%r10		# up to H^8
	jmp		.Linit_start_avx
___

sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
} else {
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
}
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq		\$8,$T1,$T2		#
	vpsrldq		\$8,$T1,$T1
	vpxor		$T2,$Xi,$Xi		#
	vpxor		$T1,$Xhi,$Xhi
___
}
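
# reduction_avx below is essentially the same two-phase, shift-based
# folding of the 256-bit product as reduction_alg9 above, just written
# with three-operand AVX instructions. Note that the bulk gcm_ghash_avx
# loop further down does not call it for its in-flight reductions;
# there the folding is done with vpclmulqdq against .L0x1c2_polynomial
# (loaded through %r10), i.e. the reduction credited to Shay Gueron in
# the March 2013 note at the top of this file.
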
sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq		\$57,$Xi,$T1		# 1st phase
	vpsllq		\$62,$Xi,$T2
	vpxor		$T1,$T2,$T2		#
	vpsllq		\$63,$Xi,$T1
	vpxor		$T1,$T2,$T2		#
	vpslldq		\$8,$T2,$T1		#
	vpsrldq		\$8,$T2,$T2
	vpxor		$T1,$Xi,$Xi		#
	vpxor		$T2,$Xhi,$Xhi

	vpsrlq		\$1,$Xi,$T2		# 2nd phase
	vpxor		$Xi,$Xhi,$Xhi
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$5,$T2,$T2
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$1,$Xi,$Xi		#
	vpxor		$Xhi,$Xi,$Xi		#
___
}

$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa		$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
	vpshufd		\$0b01001110,$T3,$T1
	vpshufd		\$0b01001110,$Xi,$T2
	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea		0x30($Htbl),$Htbl
	sub		\$1,%r10
	jnz		.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu		$T3,-0x10($Htbl)

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps		(%rsp),%xmm6
	lea		0x18(%rsp),%rsp
.LSEH_end_gcm_init_avx:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
$code.=<<___;
	jmp	.L_init_clmul
.size	gcm_init_avx,.-gcm_init_avx
___
}

$code.=<<___;
.globl	gcm_ghash_avx
.type	gcm_ghash_avx,\@abi-omnipotent
.align	32
gcm_ghash_avx:
.cfi_startproc
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
my ($Xlo,$Xhi,$Xmi,
    $Zlo,$Zhi,$Zmi,
    $Hkey,$HK,$T1,$T2,
    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));

$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Xi		# load $Xi
	lea		.L0x1c2_polynomial(%rip),%r10
	lea		0x40($Htbl),$Htbl	# size optimization
	vmovdqu		.Lbswap_mask(%rip),$bswap
	vpshufb		$bswap,$Xi,$Xi
	cmp		\$0x80,$len
	jb		.Lshort_avx
	sub		\$0x80,$len

	vmovdqu		0x70($inp),$Ii		# I[7]
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpshufb		$bswap,$Ii,$Ii
	vmovdqu		0x20-0x40($Htbl),$HK

	vpunpckhqdq	$Ii,$Ii,$T2
	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Ii,$T2,$T2
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x50($inp),$Ii		# I[5]
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpxor		$Ii,$T2,$T2
	vmovdqu		0x40($inp),$Ij		# I[4]
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK

	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpxor		$Xmi,$Zmi,$Zmi
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	vpxor		$Ij,$T1,$T1

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	vpshufb		$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpxor		$Zmi,$Xmi,$Xmi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		($inp),$Ij		# I[0]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi

	lea		0x80($inp),$inp
	cmp		\$0x80,$len
	jb		.Ltail_avx

	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
	sub		\$0x80,$len
	jmp		.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vmovdqu		0x70($inp),$Ii		# I[7]
	vpxor		$Xlo,$Zlo,$Zlo
	vpxor		$Ij,$T1,$T1
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Tred
	vmovdqu		0x20-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2

	vmovdqu		0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Zlo,$Xi,$Xi		# collect result
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vxorps		$Zhi,$Xo,$Xo
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor		$Zmi,$Tred,$Tred
	vxorps		$Ij,$T1,$T1

	vmovdqu		0x50($inp),$Ii		# I[5]
	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Xo,$Tred,$Tred
	vpslldq		\$8,$Tred,$T2
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vpsrldq		\$8,$Tred,$Tred
	vpxor		$T2, $Xi, $Xi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ii
	vxorps		$Tred,$Xo, $Xo
	vpxor		$Xhi,$Zhi,$Zhi
	vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0x50-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x40($inp),$Ij		# I[4]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vxorps		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi

	vmovdqu		0x30($inp),$Ii		# I[3]
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0x80-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		0x20($inp),$Ij		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Zhi,$Xhi,$Xhi
	vpclmulqdq	\$0x00,$HK, $T2,$Xmi
	vpxor		$Ij,$T1,$T1
	vpxor		$Zmi,$Xmi,$Xmi
	vxorps		$Tred,$Xi,$Xi

	vmovdqu		0x10($inp),$Ii		# I[1]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpshufb		$bswap,$Ii,$Ii
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vxorps		$Xo,$Tred,$Tred
	vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x10,$HK, $T1,$Zmi
	vmovdqu		0xb0-0x40($Htbl),$HK
	vpxor		$Ii,$T2,$T2
	vpxor		$Xmi,$Zmi,$Zmi

	vmovdqu		($inp),$Ij		# I[0]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpshufb		$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vmovdqu		0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Tred,$Ij,$Ij
	vpclmulqdq	\$0x10,$HK, $T2,$Xmi
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi

	lea		0x80($inp),$inp
	sub		\$0x80,$len
	jnc		.Loop8x_avx

	add		\$0x80,$len
	jmp		.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu		-0x10($inp,$len),$Ii	# very last word
	lea		($inp,$len),$inp
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vmovdqu		0x20-0x40($Htbl),$HK
	vpshufb		$bswap,$Ii,$Ij

	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
	vmovdqa		$Xhi,$Zhi		# $Zhi and
	vmovdqa		$Xmi,$Zmi		# $Zmi
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x20($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x30($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x50-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x40($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x50($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x80-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x60($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vmovdqu		-0x70($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpshufb		$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovq		0xb8-0x40($Htbl),$HK
	sub		\$0x10,$len
	jmp		.Ltail_avx

.align	32
.Ltail_avx:
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu		(%r10),$Tred

	vpxor		$Xlo,$Zlo,$Xi
	vpxor		$Xhi,$Zhi,$Xo
	vpxor		$Xmi,$Zmi,$Zmi

	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor		$Xo, $Zmi,$Zmi
	vpslldq		\$8, $Zmi,$T2
	vpsrldq		\$8, $Zmi,$Zmi
	vpxor		$T2, $Xi, $Xi
	vpxor		$Zmi,$Xo, $Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$Xo,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	cmp		\$0,$len
	jne		.Lshort_avx

	vpshufb		$bswap,$Xi,$Xi
	vmovdqu		$Xi,($Xip)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_avx:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
$code.=<<___;
	jmp	.L_ghash_clmul
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}

$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
	.long	7,0,7,0
.align	64

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

if ($win64) {
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_gcm_init_clmul
	.rva	.LSEH_end_gcm_init_clmul
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_clmul
	.rva	.LSEH_end_gcm_ghash_clmul
	.rva	.LSEH_info_gcm_ghash_clmul
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_gcm_init_avx
	.rva	.LSEH_end_gcm_init_avx
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_avx
	.rva	.LSEH_end_gcm_ghash_avx
	.rva	.LSEH_info_gcm_ghash_clmul
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_gcm_init_clmul:
	.byte	0x01,0x08,0x03,0x00
	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
	.byte	0x04,0x22,0x00,0x00	#sub	rsp,0x18
.LSEH_info_gcm_ghash_clmul:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT";