#!/usr/bin/env perl
# Copyright 2024 The BoringSSL Authors
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#------------------------------------------------------------------------------
#
# VAES and VPCLMULQDQ optimized AES-GCM for x86_64 (AVX2 version)
#
# This is similar to aes-gcm-avx10-x86_64.pl, but it uses AVX2 instead of AVX512
# / AVX10. This means it can only use 16 vector registers instead of 32, the
# maximum vector length is 32 bytes, and some instructions such as vpternlogd
# and masked loads/stores are unavailable. However, it is able to run on CPUs
# that have VAES without AVX512 / AVX10, namely AMD Zen 3 (including "Milan"
# server processors) and some Intel client CPUs such as Alder Lake.
#
# This implementation also uses Karatsuba multiplication instead of schoolbook
# multiplication for GHASH in its main loop. This does not help much on Intel,
# but it improves performance by ~5% on AMD Zen 3 which is the main target for
# this implementation. Other factors weighing slightly in favor of Karatsuba
# multiplication in this implementation are the lower maximum vector length
# (which means there is space left in the Htable array to cache the halves of
# the key powers XOR'd together) and the unavailability of the vpternlogd
# instruction (which helped schoolbook a bit more than Karatsuba).

use strict;

my $flavour = shift;
my $output  = shift;
if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; }

my $win64;
my @argregs;
if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) {
    $win64   = 1;
    @argregs = ( "%rcx", "%rdx", "%r8", "%r9" );
}
else {
    $win64   = 0;
    @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" );
}

$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate )
  or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate )
  or die "can't locate x86_64-xlate.pl";

open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;

my $g_cur_func_name;
my $g_cur_func_uses_seh;
my @g_cur_func_saved_gpregs;
my @g_cur_func_saved_xmmregs;

sub _begin_func {
    my ( $funcname, $uses_seh ) = @_;
    $g_cur_func_name          = $funcname;
    $g_cur_func_uses_seh      = $uses_seh;
    @g_cur_func_saved_gpregs  = ();
    @g_cur_func_saved_xmmregs = ();
    return <<___;
.globl $funcname
.type $funcname,\@abi-omnipotent
.align 32
$funcname:
    .cfi_startproc
    @{[ $uses_seh ? ".seh_startproc" : "" ]}
    _CET_ENDBR
___
}

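# A note on the prologue/epilogue helpers below: _save_gpregs must be called
# before _save_xmmregs, because _save_xmmregs decides whether to allocate an
# extra 8 bytes of stack from the number of pushed general purpose registers
# (the return address plus an even number of 8-byte pushes leaves %rsp only
# 8-byte aligned for the 16-byte movdqa saves). A minimal usage sketch, with
# "example_func" being a made-up name; the real functions in this file follow
# the same pattern:
#
#   $code .= _begin_func "example_func", 1;    # 1 => emit SEH unwind directives
#   $code .= <<___;
#   @{[ _save_gpregs "%r12" ]}
#   @{[ _save_xmmregs (6 .. 7) ]}
#   .seh_endprologue
#   ... function body ...
# ___
#   $code .= _end_func;
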
# Push a list of general purpose registers onto the stack.
sub _save_gpregs {
    my @gpregs = @_;
    my $code   = "";
    die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh;
    die "_save_gpregs can only be called once per function"
      if @g_cur_func_saved_gpregs;
    die "Order must be _save_gpregs, then _save_xmmregs"
      if @g_cur_func_saved_xmmregs;
    @g_cur_func_saved_gpregs = @gpregs;
    for my $reg (@gpregs) {
        $code .= "push $reg\n";
        if ($win64) {
            $code .= ".seh_pushreg $reg\n";
        }
        else {
            $code .= ".cfi_push $reg\n";
        }
    }
    return $code;
}

# Push a list of xmm registers onto the stack if the target is Windows.
sub _save_xmmregs {
    my @xmmregs     = @_;
    my $num_xmmregs = scalar @xmmregs;
    my $code        = "";
    die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh;
    die "_save_xmmregs can only be called once per function"
      if @g_cur_func_saved_xmmregs;
    if ( $win64 and $num_xmmregs > 0 ) {
        @g_cur_func_saved_xmmregs = @xmmregs;
        my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
        my $alloc_size    = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 );
        $code .= "sub \$$alloc_size, %rsp\n";
        $code .= ".seh_stackalloc $alloc_size\n";
        for my $i ( 0 .. $num_xmmregs - 1 ) {
            my $reg_num = $xmmregs[$i];
            my $pos     = 16 * $i;
            $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n";
            $code .= ".seh_savexmm %xmm$reg_num, $pos\n";
        }
    }
    return $code;
}

sub _end_func {
    my $code = "";

    # Restore any xmm registers that were saved earlier.
    my $num_xmmregs = scalar @g_cur_func_saved_xmmregs;
    if ( $win64 and $num_xmmregs > 0 ) {
        my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
        my $alloc_size     = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 );
        for my $i ( 0 .. $num_xmmregs - 1 ) {
            my $reg_num = $g_cur_func_saved_xmmregs[$i];
            my $pos     = 16 * $i;
            $code .= "movdqa $pos(%rsp), %xmm$reg_num\n";
        }
        $code .= "add \$$alloc_size, %rsp\n";
    }

    # Restore any general purpose registers that were saved earlier.
    for my $reg ( reverse @g_cur_func_saved_gpregs ) {
        $code .= "pop $reg\n";
        if ( !$win64 ) {
            $code .= ".cfi_pop $reg\n";
        }
    }

    $code .= <<___;
    ret
    @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]}
    .cfi_endproc
    .size $g_cur_func_name, . - $g_cur_func_name
___
    return $code;
}

my $code = <<___;
.section .rodata
.align 16

    # A shuffle mask that reflects the bytes of 16-byte blocks
.Lbswap_mask:
    .quad   0x08090a0b0c0d0e0f, 0x0001020304050607

    # This is the GHASH reducing polynomial without its constant term, i.e.
    # x^128 + x^7 + x^2 + x, represented using the backwards mapping
    # between bits and polynomial coefficients.
    #
    # Alternatively, it can be interpreted as the naturally-ordered
    # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
    # "reversed" GHASH reducing polynomial without its x^128 term.
.Lgfpoly:
    .quad   1, 0xc200000000000000

    # Same as above, but with the (1 << 64) bit set.
.Lgfpoly_and_internal_carrybit:
    .quad   1, 0xc200000000000001

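    # Note: .Lgfpoly_and_internal_carrybit above is used only when multiplying
    # H by x in gcm_init_vpclmulqdq_avx2 below: the 0xc2...00 bits and the low
    # quadword 1 are selected when the top bit of H (bit 127) is shifted out,
    # reducing modulo the polynomial, while the extra bit at position 64
    # carries the bit shifted out of the low half (bit 63) into the high half.
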
.align 32
    # The below constants are used for incrementing the counter blocks.
.Lctr_pattern:
    .quad   0, 0
    .quad   1, 0
.Linc_2blocks:
    .quad   2, 0
    .quad   2, 0

.text
___

# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the
# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication)
# in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15].
my $NUM_H_POWERS            = 8;
my $OFFSETOFEND_H_POWERS    = $NUM_H_POWERS * 16;
my $OFFSETOF_H_POWERS_XORED = $OFFSETOFEND_H_POWERS;

# Offset to 'rounds' in AES_KEY struct
my $OFFSETOF_AES_ROUNDS = 240;

# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
# the reduced products in \dst. Uses schoolbook multiplication.
sub _ghash_mul {
    my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
    return <<___;
    vpclmulqdq      \$0x00, $a, $b, $t0        # LO = a_L * b_L
    vpclmulqdq      \$0x01, $a, $b, $t1        # MI_0 = a_L * b_H
    vpclmulqdq      \$0x10, $a, $b, $t2        # MI_1 = a_H * b_L
    vpxor           $t2, $t1, $t1              # MI = MI_0 + MI_1
    vpclmulqdq      \$0x01, $t0, $gfpoly, $t2  # LO_L*(x^63 + x^62 + x^57)
    vpshufd         \$0x4e, $t0, $t0           # Swap halves of LO
    vpxor           $t0, $t1, $t1              # Fold LO into MI (part 1)
    vpxor           $t2, $t1, $t1              # Fold LO into MI (part 2)
    vpclmulqdq      \$0x11, $a, $b, $dst       # HI = a_H * b_H
    vpclmulqdq      \$0x01, $t1, $gfpoly, $t0  # MI_L*(x^63 + x^62 + x^57)
    vpshufd         \$0x4e, $t1, $t1           # Swap halves of MI
    vpxor           $t1, $dst, $dst            # Fold MI into HI (part 1)
    vpxor           $t0, $dst, $dst            # Fold MI into HI (part 2)
___
}

# void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]);
#
# Initialize |Htable| with powers of the GHASH subkey |H|.
#
# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the
# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication)
# in the order 8,6,7,5,4,2,3,1. We do not use Htable[12..15].
$code .= _begin_func "gcm_init_vpclmulqdq_avx2", 1;
{
    my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ];
    my ( $TMP0,   $TMP0_XMM )   = ( "%ymm0", "%xmm0" );
    my ( $TMP1,   $TMP1_XMM )   = ( "%ymm1", "%xmm1" );
    my ( $TMP2,   $TMP2_XMM )   = ( "%ymm2", "%xmm2" );
    my ( $H_CUR,  $H_CUR_XMM )  = ( "%ymm3", "%xmm3" );
    my ( $H_CUR2, $H_CUR2_XMM ) = ( "%ymm4", "%xmm4" );
    my ( $H_INC,  $H_INC_XMM )  = ( "%ymm5", "%xmm5" );
    my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm6", "%xmm6" );

    $code .= <<___;
    @{[ _save_xmmregs (6) ]}
    .seh_endprologue

    # Load the byte-reflected hash subkey. BoringSSL provides it in
    # byte-reflected form except the two halves are in the wrong order.
    vpshufd         \$0x4e, ($H_PTR), $H_CUR_XMM

    # Finish preprocessing the byte-reflected hash subkey by multiplying it by
    # x^-1 ("standard" interpretation of polynomial coefficients) or
    # equivalently x^1 (natural interpretation). This gets the key into a
    # format that avoids having to bit-reflect the data blocks later.
    vpshufd         \$0xd3, $H_CUR_XMM, $TMP0_XMM
    vpsrad          \$31, $TMP0_XMM, $TMP0_XMM
    vpaddq          $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM
    vpand           .Lgfpoly_and_internal_carrybit(%rip), $TMP0_XMM, $TMP0_XMM
    vpxor           $TMP0_XMM, $H_CUR_XMM, $H_CUR_XMM

    vbroadcasti128  .Lgfpoly(%rip), $GFPOLY

    # Square H^1 to get H^2.
    @{[ _ghash_mul $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM,
                   $TMP0_XMM, $TMP1_XMM, $TMP2_XMM ]}

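    # Note: from here on, each _ghash_mul by H_INC = [H^2, H^2] advances both
    # 128-bit lanes by two powers of H, so the three remaining vector
    # multiplications below are enough to produce [H^4, H^3], [H^6, H^5], and
    # [H^8, H^7].
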
    # Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2].
    vinserti128     \$1, $H_CUR_XMM, $H_INC, $H_CUR
    vinserti128     \$1, $H_INC_XMM, $H_INC, $H_INC

    # Compute H_CUR2 = [H^4, H^3].
    @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}

    # Store [H^2, H^1] and [H^4, H^3].
    vmovdqu         $H_CUR, 3*32($HTABLE)
    vmovdqu         $H_CUR2, 2*32($HTABLE)

    # For Karatsuba multiplication: compute and store the two 64-bit halves of
    # each key power XOR'd together. Order is 4,2,3,1.
    vpunpcklqdq     $H_CUR, $H_CUR2, $TMP0
    vpunpckhqdq     $H_CUR, $H_CUR2, $TMP1
    vpxor           $TMP1, $TMP0, $TMP0
    vmovdqu         $TMP0, $OFFSETOF_H_POWERS_XORED+32($HTABLE)

    # Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7].
    @{[ _ghash_mul $H_INC, $H_CUR2, $H_CUR, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}
    @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}
    vmovdqu         $H_CUR, 1*32($HTABLE)
    vmovdqu         $H_CUR2, 0*32($HTABLE)

    # Again, compute and store the two 64-bit halves of each key power XOR'd
    # together. Order is 8,6,7,5.
    vpunpcklqdq     $H_CUR, $H_CUR2, $TMP0
    vpunpckhqdq     $H_CUR, $H_CUR2, $TMP1
    vpxor           $TMP1, $TMP0, $TMP0
    vmovdqu         $TMP0, $OFFSETOF_H_POWERS_XORED($HTABLE)

    vzeroupper
___
}
$code .= _end_func;

# Do one step of the GHASH update of four vectors of data blocks.
# $i: the step to do, 0 through 9
# $ghashdata_ptr: pointer to the data blocks (ciphertext or AAD)
# $htable: pointer to the Htable for the key
# $bswap_mask: mask for reflecting the bytes of blocks
# $h_pow[2-1]_xored: XOR'd key powers cached from Htable
# $tmp[0-2]: temporary registers. $tmp[1-2] must be preserved across steps.
# $lo, $mi: working state for this macro that must be preserved across steps
# $ghash_acc: the GHASH accumulator (input/output)
sub _ghash_step_4x {
    my (
        $i,            $ghashdata_ptr, $htable, $bswap_mask,
        $h_pow2_xored, $h_pow1_xored,  $tmp0,   $tmp0_xmm,
        $tmp1,         $tmp2,          $lo,     $mi,
        $ghash_acc,    $ghash_acc_xmm
    ) = @_;
    my ( $hi, $hi_xmm ) = ( $ghash_acc, $ghash_acc_xmm );    # alias
    if ( $i == 0 ) {
        return <<___;
    # First vector
    vmovdqu         0*32($ghashdata_ptr), $tmp1
    vpshufb         $bswap_mask, $tmp1, $tmp1
    vmovdqu         0*32($htable), $tmp2
    vpxor           $ghash_acc, $tmp1, $tmp1
    vpclmulqdq      \$0x00, $tmp2, $tmp1, $lo
    vpclmulqdq      \$0x11, $tmp2, $tmp1, $hi
    vpunpckhqdq     $tmp1, $tmp1, $tmp0
    vpxor           $tmp1, $tmp0, $tmp0
    vpclmulqdq      \$0x00, $h_pow2_xored, $tmp0, $mi
___
    }
    elsif ( $i == 1 ) {
        return <<___;
___
    }
    elsif ( $i == 2 ) {
        return <<___;
    # Second vector
    vmovdqu         1*32($ghashdata_ptr), $tmp1
    vpshufb         $bswap_mask, $tmp1, $tmp1
    vmovdqu         1*32($htable), $tmp2
    vpclmulqdq      \$0x00, $tmp2, $tmp1, $tmp0
    vpxor           $tmp0, $lo, $lo
    vpclmulqdq      \$0x11, $tmp2, $tmp1, $tmp0
    vpxor           $tmp0, $hi, $hi
    vpunpckhqdq     $tmp1, $tmp1, $tmp0
    vpxor           $tmp1, $tmp0, $tmp0
    vpclmulqdq      \$0x10, $h_pow2_xored, $tmp0, $tmp0
    vpxor           $tmp0, $mi, $mi
___
    }
    elsif ( $i == 3 ) {
        return <<___;
    # Third vector
    vmovdqu         2*32($ghashdata_ptr), $tmp1
    vpshufb         $bswap_mask, $tmp1, $tmp1
    vmovdqu         2*32($htable), $tmp2
___
    }
    elsif ( $i == 4 ) {
        return <<___;
    vpclmulqdq      \$0x00, $tmp2, $tmp1, $tmp0
    vpxor           $tmp0, $lo, $lo
    vpclmulqdq      \$0x11, $tmp2, $tmp1, $tmp0
    vpxor           $tmp0, $hi, $hi
___
    }
    elsif ( $i == 5 ) {
        return <<___;
    vpunpckhqdq     $tmp1, $tmp1, $tmp0
    vpxor           $tmp1, $tmp0, $tmp0
    vpclmulqdq      \$0x00, $h_pow1_xored, $tmp0, $tmp0
    vpxor           $tmp0, $mi, $mi

    # Fourth vector
    vmovdqu         3*32($ghashdata_ptr), $tmp1
    vpshufb         $bswap_mask, $tmp1, $tmp1
___
    }
    elsif ( $i == 6 ) {
        return <<___;
    vmovdqu         3*32($htable), $tmp2
    vpclmulqdq      \$0x00, $tmp2, $tmp1, $tmp0
    vpxor           $tmp0, $lo, $lo
    vpclmulqdq      \$0x11, $tmp2, $tmp1, $tmp0
    vpxor           $tmp0, $hi, $hi
    vpunpckhqdq     $tmp1, $tmp1, $tmp0
    vpxor           $tmp1, $tmp0, $tmp0
    vpclmulqdq      \$0x10, $h_pow1_xored, $tmp0, $tmp0
    vpxor           $tmp0, $mi, $mi
___
    }
    elsif ( $i == 7 ) {
        return <<___;
    # Finalize 'mi' following Karatsuba multiplication.
    vpxor           $lo, $mi, $mi
    vpxor           $hi, $mi, $mi

    # Fold lo into mi.
    vbroadcasti128  .Lgfpoly(%rip), $tmp2
    vpclmulqdq      \$0x01, $lo, $tmp2, $tmp0
    vpshufd         \$0x4e, $lo, $lo
    vpxor           $lo, $mi, $mi
    vpxor           $tmp0, $mi, $mi
___
    }
    elsif ( $i == 8 ) {
        return <<___;
    # Fold mi into hi.
    vpclmulqdq      \$0x01, $mi, $tmp2, $tmp0
    vpshufd         \$0x4e, $mi, $mi
    vpxor           $mi, $hi, $hi
    vpxor           $tmp0, $hi, $hi
___
    }
    elsif ( $i == 9 ) {
        return <<___;
    vextracti128    \$1, $hi, $tmp0_xmm
    vpxor           $tmp0_xmm, $hi_xmm, $ghash_acc_xmm
___
    }
}

sub _ghash_4x {
    my $code = "";
    for my $i ( 0 .. 9 ) {
        $code .= _ghash_step_4x $i, @_;
    }
    return $code;
}

# void gcm_gmult_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16]);
$code .= _begin_func "gcm_gmult_vpclmulqdq_avx2", 1;
{
    my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
    my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
      map( "%xmm$_", ( 0 .. 6 ) );

    $code .= <<___;
    @{[ _save_xmmregs (6) ]}
    .seh_endprologue

    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC
    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK
    vmovdqu         $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
    vmovdqu         .Lgfpoly(%rip), $GFPOLY
    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC

    @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}

    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
    vmovdqu         $GHASH_ACC, ($GHASH_ACC_PTR)
___
}
$code .= _end_func;

# void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16],
#                                const uint8_t *in, size_t len);
#
# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
# by |in| and |len|. |len| must be a multiple of 16.
#
# This function handles large amounts of AAD efficiently, while also keeping the
# overhead low for small amounts of AAD which is the common case. TLS uses less
# than one block of AAD, but (uncommonly) other use cases may use much more.
$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2", 1;
{
    # Function arguments
    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];

    # Additional local variables
    my ( $TMP0,       $TMP0_XMM )       = ( "%ymm0", "%xmm0" );
    my ( $TMP1,       $TMP1_XMM )       = ( "%ymm1", "%xmm1" );
    my ( $TMP2,       $TMP2_XMM )       = ( "%ymm2", "%xmm2" );
    my ( $LO,         $LO_XMM )         = ( "%ymm3", "%xmm3" );
    my ( $MI,         $MI_XMM )         = ( "%ymm4", "%xmm4" );
    my ( $GHASH_ACC,  $GHASH_ACC_XMM )  = ( "%ymm5", "%xmm5" );
    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm6", "%xmm6" );
    my ( $GFPOLY,     $GFPOLY_XMM )     = ( "%ymm7", "%xmm7" );
    my $H_POW2_XORED = "%ymm8";
    my $H_POW1_XORED = "%ymm9";

    $code .= <<___;
    @{[ _save_xmmregs (6 .. 9) ]}
    .seh_endprologue

    vbroadcasti128  .Lbswap_mask(%rip), $BSWAP_MASK
    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    vbroadcasti128  .Lgfpoly(%rip), $GFPOLY

    # Optimize for AADLEN < 32 by checking for AADLEN < 32 before AADLEN < 128.
    cmp             \$32, $AADLEN
    jb              .Lghash_lastblock

    cmp             \$127, $AADLEN
    jbe             .Lghash_loop_1x

    # Update GHASH with 128 bytes of AAD at a time.
    vmovdqu         $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED
    vmovdqu         $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED
.Lghash_loop_4x:
    @{[ _ghash_4x $AAD, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, $H_POW1_XORED,
                  $TMP0, $TMP0_XMM, $TMP1, $TMP2, $LO, $MI, $GHASH_ACC,
                  $GHASH_ACC_XMM ]}
    sub             \$-128, $AAD    # 128 is 4 bytes, -128 is 1 byte
    add             \$-128, $AADLEN
    cmp             \$127, $AADLEN
    ja              .Lghash_loop_4x

    # Update GHASH with 32 bytes of AAD at a time.
    cmp             \$32, $AADLEN
    jb              .Lghash_loop_1x_done
.Lghash_loop_1x:
    vmovdqu         ($AAD), $TMP0
    vpshufb         $BSWAP_MASK, $TMP0, $TMP0
    vpxor           $TMP0, $GHASH_ACC, $GHASH_ACC
    vmovdqu         $OFFSETOFEND_H_POWERS-32($HTABLE), $TMP0
    @{[ _ghash_mul $TMP0, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $TMP1, $TMP2, $LO ]}
    vextracti128    \$1, $GHASH_ACC, $TMP0_XMM
    vpxor           $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    add             \$32, $AAD
    sub             \$32, $AADLEN
    cmp             \$32, $AADLEN
    jae             .Lghash_loop_1x
.Lghash_loop_1x_done:
    # Issue the vzeroupper that is needed after using ymm registers. Do it here
    # instead of at the end, to minimize overhead for small AADLEN.
    vzeroupper

    # Update GHASH with the remaining 16-byte block if any.
.Lghash_lastblock:
    test            $AADLEN, $AADLEN
    jz              .Lghash_done
    vmovdqu         ($AAD), $TMP0_XMM
    vpshufb         $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM
    vpxor           $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    vmovdqu         $OFFSETOFEND_H_POWERS-16($HTABLE), $TMP0_XMM
    @{[ _ghash_mul $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
                   $TMP1_XMM, $TMP2_XMM, $LO_XMM ]}

.Lghash_done:
    # Store the updated GHASH accumulator back to memory.
    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
___
}
$code .= _end_func;

sub _vaesenc_4x {
    my ( $round_key, $aesdata0, $aesdata1, $aesdata2, $aesdata3 ) = @_;
    return <<___;
    vaesenc         $round_key, $aesdata0, $aesdata0
    vaesenc         $round_key, $aesdata1, $aesdata1
    vaesenc         $round_key, $aesdata2, $aesdata2
    vaesenc         $round_key, $aesdata3, $aesdata3
___
}

sub _ctr_begin_4x {
    my (
        $le_ctr,   $bswap_mask, $rndkey0,  $aesdata0,
        $aesdata1, $aesdata2,   $aesdata3, $tmp
    ) = @_;
    return <<___;
    # Increment le_ctr four times to generate four vectors of little-endian
    # counter blocks, swap each to big-endian, and store them in aesdata[0-3].
    vmovdqu         .Linc_2blocks(%rip), $tmp
    vpshufb         $bswap_mask, $le_ctr, $aesdata0
    vpaddd          $tmp, $le_ctr, $le_ctr
    vpshufb         $bswap_mask, $le_ctr, $aesdata1
    vpaddd          $tmp, $le_ctr, $le_ctr
    vpshufb         $bswap_mask, $le_ctr, $aesdata2
    vpaddd          $tmp, $le_ctr, $le_ctr
    vpshufb         $bswap_mask, $le_ctr, $aesdata3
    vpaddd          $tmp, $le_ctr, $le_ctr

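    # Note: .Linc_2blocks adds 2 to the low 32-bit word of each 128-bit lane,
    # and the two lanes of le_ctr hold consecutive counter values, so each of
    # aesdata0-3 above receives two consecutive counter blocks and the four
    # registers together cover eight blocks (128 bytes). Only the low 32-bit
    # word of each counter changes, as GCM requires.
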
    # AES "round zero": XOR in the zero-th round key.
    vpxor           $rndkey0, $aesdata0, $aesdata0
    vpxor           $rndkey0, $aesdata1, $aesdata1
    vpxor           $rndkey0, $aesdata2, $aesdata2
    vpxor           $rndkey0, $aesdata3, $aesdata3
___
}

# Do the last AES round for four vectors of counter blocks, XOR four vectors of
# source data with the resulting keystream blocks, and write the result to the
# destination buffer. The implementation differs slightly as it takes advantage
# of the property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) to reduce
# latency, but it has the same effect.
sub _aesenclast_and_xor_4x {
    my (
        $src,      $dst,      $rndkeylast, $aesdata0,
        $aesdata1, $aesdata2, $aesdata3,   $t0,
        $t1,       $t2,       $t3
    ) = @_;
    return <<___;
    vpxor           0*32($src), $rndkeylast, $t0
    vpxor           1*32($src), $rndkeylast, $t1
    vpxor           2*32($src), $rndkeylast, $t2
    vpxor           3*32($src), $rndkeylast, $t3
    vaesenclast     $t0, $aesdata0, $aesdata0
    vaesenclast     $t1, $aesdata1, $aesdata1
    vaesenclast     $t2, $aesdata2, $aesdata2
    vaesenclast     $t3, $aesdata3, $aesdata3
    vmovdqu         $aesdata0, 0*32($dst)
    vmovdqu         $aesdata1, 1*32($dst)
    vmovdqu         $aesdata2, 2*32($dst)
    vmovdqu         $aesdata3, 3*32($dst)
___
}

my $g_update_macro_expansion_count = 0;

# void aes_gcm_{enc,dec}_update_vaes_avx2(const uint8_t *in, uint8_t *out,
#                                         size_t len, const AES_KEY *key,
#                                         const uint8_t ivec[16],
#                                         const u128 Htable[16],
#                                         uint8_t Xi[16]);
#
# This macro generates a GCM encryption or decryption update function with the
# above prototype (with \enc selecting which one). The function computes the
# next portion of the CTR keystream, XOR's it with |len| bytes from |in|, and
# writes the resulting encrypted or decrypted data to |out|. It also updates
# the GHASH accumulator |Xi| using the next |len| ciphertext bytes.
#
# |len| must be a multiple of 16. The caller must do any buffering needed to
# ensure this. Both in-place and out-of-place en/decryption are supported.
#
# |ivec| must give the current counter in big-endian format. This function
# loads the counter from |ivec| and increments the loaded counter as needed, but
# it does *not* store the updated counter back to |ivec|. The caller must
# update |ivec| if any more data segments follow. Internally, only the low
# 32-bit word of the counter is incremented, following the GCM standard.
sub _aes_gcm_update {
    my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count;
    my ($enc) = @_;
    my $code = "";

    # Function arguments
    my ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR )
      = $win64
      ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" )
      : ( @argregs[ 0 .. 5 ], "%r12" );

    # Additional local variables.
    # %rax is used as a temporary register. BE_CTR_PTR is also available as a
    # temporary register after the counter is loaded.

    # AES key length in bytes
    my ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" );

    # Pointer to the last AES round key for the chosen AES variant
    my $RNDKEYLAST_PTR = "%r11";

    # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
    # using vpshufb, copied to all 128-bit lanes.
    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm0", "%xmm0" );

    # GHASH_ACC is the accumulator variable for GHASH. When fully reduced,
    # only the lowest 128-bit lane can be nonzero. When not fully reduced,
    # more than one lane may be used, and they need to be XOR'd together.
    my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm1", "%xmm1" );

    # TMP[0-2] are temporary registers.
    my ( $TMP0, $TMP0_XMM ) = ( "%ymm2", "%xmm2" );
    my ( $TMP1, $TMP1_XMM ) = ( "%ymm3", "%xmm3" );
    my ( $TMP2, $TMP2_XMM ) = ( "%ymm4", "%xmm4" );

    # LO and MI are used to accumulate unreduced GHASH products.
    my ( $LO, $LO_XMM ) = ( "%ymm5", "%xmm5" );
    my ( $MI, $MI_XMM ) = ( "%ymm6", "%xmm6" );

    # Cached key powers from Htable
    my ( $H_POW2_XORED, $H_POW2_XORED_XMM ) = ( "%ymm7", "%xmm7" );
    my ( $H_POW1_XORED, $H_POW1_XORED_XMM ) = ( "%ymm8", "%xmm8" );

    # RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one.
    my $RNDKEY0    = "%ymm9";
    my $RNDKEYLAST = "%ymm10";

    # LE_CTR contains the next set of little-endian counter blocks.
    my $LE_CTR = "%ymm11";

    # AESDATA[0-3] hold the counter blocks that are being encrypted by AES.
    my ( $AESDATA0, $AESDATA0_XMM ) = ( "%ymm12", "%xmm12" );
    my ( $AESDATA1, $AESDATA1_XMM ) = ( "%ymm13", "%xmm13" );
    my ( $AESDATA2, $AESDATA2_XMM ) = ( "%ymm14", "%xmm14" );
    my ( $AESDATA3, $AESDATA3_XMM ) = ( "%ymm15", "%xmm15" );
    my @AESDATA = ( $AESDATA0, $AESDATA1, $AESDATA2, $AESDATA3 );

    my @ghash_4x_args = (
        $enc ? $DST : $SRC, $HTABLE, $BSWAP_MASK, $H_POW2_XORED,
        $H_POW1_XORED, $TMP0, $TMP0_XMM, $TMP1,
        $TMP2, $LO, $MI, $GHASH_ACC,
        $GHASH_ACC_XMM
    );

    if ($win64) {
        $code .= <<___;
    @{[ _save_gpregs $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ]}
    mov             64(%rsp), $BE_CTR_PTR     # arg5
    mov             72(%rsp), $HTABLE         # arg6
    mov             80(%rsp), $GHASH_ACC_PTR  # arg7
    @{[ _save_xmmregs (6 .. 15) ]}
    .seh_endprologue
___
    }
    else {
        $code .= <<___;
    @{[ _save_gpregs $GHASH_ACC_PTR ]}
    mov             16(%rsp), $GHASH_ACC_PTR  # arg7
___
    }

    if ($enc) {
        $code .= <<___;
#ifdef BORINGSSL_DISPATCH_TEST
    .extern BORINGSSL_function_hit
    movb            \$1,BORINGSSL_function_hit+8(%rip)
#endif
___
    }
    $code .= <<___;
    vbroadcasti128  .Lbswap_mask(%rip), $BSWAP_MASK

    # Load the GHASH accumulator and the starting counter.
    # BoringSSL passes these values in big endian format.
    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    vbroadcasti128  ($BE_CTR_PTR), $LE_CTR
    vpshufb         $BSWAP_MASK, $LE_CTR, $LE_CTR

    # Load the AES key length in bytes. BoringSSL stores number of rounds
    # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20.
    movl            $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN
    lea             -20(,$AESKEYLEN,4), $AESKEYLEN

    # Make RNDKEYLAST_PTR point to the last AES round key. This is the
    # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
    # respectively. Then load the zero-th and last round keys.
    lea             6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR
    vbroadcasti128  ($AESKEY), $RNDKEY0
    vbroadcasti128  ($RNDKEYLAST_PTR), $RNDKEYLAST

    # Finish initializing LE_CTR by adding 1 to the second block.
    vpaddd          .Lctr_pattern(%rip), $LE_CTR, $LE_CTR

    # If there are at least 128 bytes of data, then continue into the loop that
    # processes 128 bytes of data at a time. Otherwise skip it.
    cmp             \$127, $DATALEN
    jbe             .Lcrypt_loop_4x_done$local_label_suffix

    vmovdqu         $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED
    vmovdqu         $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED
___

    # Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time.
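    # For encryption, the ciphertext that GHASH must process is produced by
    # this same function, so the first 128 bytes are encrypted before entering
    # the loop; each iteration then hashes the previous iteration's output
    # while encrypting the next counter blocks, and the final 128 bytes of
    # ciphertext are hashed after the loop. For decryption the ciphertext is
    # the input, so no such pipelining is needed and each iteration hashes the
    # 128 input bytes it is currently decrypting.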

    if ($enc) {
        $code .= <<___;
    # Encrypt the first 4 vectors of plaintext blocks.
    @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]}
    lea             16($AESKEY), %rax
.Lvaesenc_loop_first_4_vecs$local_label_suffix:
    vbroadcasti128  (%rax), $TMP0
    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
    add             \$16, %rax
    cmp             %rax, $RNDKEYLAST_PTR
    jne             .Lvaesenc_loop_first_4_vecs$local_label_suffix
    @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA,
                               $TMP0, $TMP1, $LO, $MI ]}
    sub             \$-128, $SRC    # 128 is 4 bytes, -128 is 1 byte
    add             \$-128, $DATALEN
    cmp             \$127, $DATALEN
    jbe             .Lghash_last_ciphertext_4x$local_label_suffix
___
    }

    $code .= <<___;
.align 16
.Lcrypt_loop_4x$local_label_suffix:

    # Start the AES encryption of the counter blocks.
    @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]}
    cmp             \$24, $AESKEYLEN
    jl              .Laes128$local_label_suffix
    je              .Laes192$local_label_suffix
    # AES-256
    vbroadcasti128  -13*16($RNDKEYLAST_PTR), $TMP0
    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
    vbroadcasti128  -12*16($RNDKEYLAST_PTR), $TMP0
    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
.Laes192$local_label_suffix:
    vbroadcasti128  -11*16($RNDKEYLAST_PTR), $TMP0
    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
    vbroadcasti128  -10*16($RNDKEYLAST_PTR), $TMP0
    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
.Laes128$local_label_suffix:
___

    # Finish the AES encryption of the counter blocks in AESDATA[0-3],
    # interleaved with the GHASH update of the ciphertext blocks.
    for my $i ( reverse 1 .. 9 ) {
        $code .= <<___;
    @{[ _ghash_step_4x 9-$i, @ghash_4x_args ]}
    vbroadcasti128  -$i*16($RNDKEYLAST_PTR), $TMP0
    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
___
    }
    $code .= <<___;
    @{[ _ghash_step_4x 9, @ghash_4x_args ]}

    @{[ $enc ? "sub \$-128, $DST" : "" ]}    # 128 is 4 bytes, -128 is 1 byte
    @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA,
                               $TMP0, $TMP1, $LO, $MI ]}
    sub             \$-128, $SRC
    @{[ !$enc ? "sub \$-128, $DST" : "" ]}
    add             \$-128, $DATALEN
    cmp             \$127, $DATALEN
    ja              .Lcrypt_loop_4x$local_label_suffix
___

    if ($enc) {

        # Update GHASH with the last set of ciphertext blocks.
        $code .= <<___;
.Lghash_last_ciphertext_4x$local_label_suffix:
    @{[ _ghash_4x @ghash_4x_args ]}
    sub             \$-128, $DST
___
    }

    my $POWERS_PTR = $BE_CTR_PTR;    # BE_CTR_PTR is free to be reused.
    my ( $HI, $HI_XMM ) = ( $H_POW2_XORED, $H_POW2_XORED_XMM );    # reuse

    $code .= <<___;
.Lcrypt_loop_4x_done$local_label_suffix:
    # Check whether any data remains.
    test            $DATALEN, $DATALEN
    jz              .Ldone$local_label_suffix

    # DATALEN is in [16, 32, 48, 64, 80, 96, 112].

    # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
    # is the number of blocks that remain.
    lea             $OFFSETOFEND_H_POWERS($HTABLE), $POWERS_PTR
    sub             $DATALEN, $POWERS_PTR

    # Start collecting the unreduced GHASH intermediate value LO, MI, HI.
    vpxor           $LO_XMM, $LO_XMM, $LO_XMM
    vpxor           $MI_XMM, $MI_XMM, $MI_XMM
    vpxor           $HI_XMM, $HI_XMM, $HI_XMM

    cmp             \$64, $DATALEN
    jb              .Llessthan64bytes$local_label_suffix

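    # The tail below accumulates unreduced products: POWERS_PTR points at H^N
    # where N is the number of remaining blocks, so the first remaining block
    # gets multiplied by H^N, the next by H^(N-1), and so on down to H^1. All
    # partial products are XOR'd into LO, MI, and HI, and a single reduction
    # at .Lreduce then folds them into the GHASH accumulator.
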
    # DATALEN is in [64, 80, 96, 112]. Encrypt two vectors of counter blocks.
    vpshufb         $BSWAP_MASK, $LE_CTR, $AESDATA0
    vpaddd          .Linc_2blocks(%rip), $LE_CTR, $LE_CTR
    vpshufb         $BSWAP_MASK, $LE_CTR, $AESDATA1
    vpaddd          .Linc_2blocks(%rip), $LE_CTR, $LE_CTR
    vpxor           $RNDKEY0, $AESDATA0, $AESDATA0
    vpxor           $RNDKEY0, $AESDATA1, $AESDATA1
    lea             16($AESKEY), %rax
.Lvaesenc_loop_tail_1$local_label_suffix:
    vbroadcasti128  (%rax), $TMP0
    vaesenc         $TMP0, $AESDATA0, $AESDATA0
    vaesenc         $TMP0, $AESDATA1, $AESDATA1
    add             \$16, %rax
    cmp             %rax, $RNDKEYLAST_PTR
    jne             .Lvaesenc_loop_tail_1$local_label_suffix
    vaesenclast     $RNDKEYLAST, $AESDATA0, $AESDATA0
    vaesenclast     $RNDKEYLAST, $AESDATA1, $AESDATA1

    # XOR the data with the two vectors of keystream blocks.
    vmovdqu         0($SRC), $TMP0
    vmovdqu         32($SRC), $TMP1
    vpxor           $TMP0, $AESDATA0, $AESDATA0
    vpxor           $TMP1, $AESDATA1, $AESDATA1
    vmovdqu         $AESDATA0, 0($DST)
    vmovdqu         $AESDATA1, 32($DST)

    # Update GHASH with two vectors of ciphertext blocks, without reducing.
    vpshufb         $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0
    vpshufb         $BSWAP_MASK, @{[ $enc ? $AESDATA1 : $TMP1 ]}, $AESDATA1
    vpxor           $GHASH_ACC, $AESDATA0, $AESDATA0
    vmovdqu         ($POWERS_PTR), $TMP0
    vmovdqu         32($POWERS_PTR), $TMP1
    vpclmulqdq      \$0x00, $TMP0, $AESDATA0, $LO
    vpclmulqdq      \$0x01, $TMP0, $AESDATA0, $MI
    vpclmulqdq      \$0x10, $TMP0, $AESDATA0, $TMP2
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x11, $TMP0, $AESDATA0, $HI
    vpclmulqdq      \$0x00, $TMP1, $AESDATA1, $TMP2
    vpxor           $TMP2, $LO, $LO
    vpclmulqdq      \$0x01, $TMP1, $AESDATA1, $TMP2
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x10, $TMP1, $AESDATA1, $TMP2
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x11, $TMP1, $AESDATA1, $TMP2
    vpxor           $TMP2, $HI, $HI

    add             \$64, $POWERS_PTR
    add             \$64, $SRC
    add             \$64, $DST
    sub             \$64, $DATALEN
    jz              .Lreduce$local_label_suffix

    vpxor           $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM

    # DATALEN is in [16, 32, 48]. Encrypt the last two vectors of counter blocks.
.Llessthan64bytes$local_label_suffix:
    vpshufb         $BSWAP_MASK, $LE_CTR, $AESDATA0
    vpaddd          .Linc_2blocks(%rip), $LE_CTR, $LE_CTR
    vpshufb         $BSWAP_MASK, $LE_CTR, $AESDATA1
    vpxor           $RNDKEY0, $AESDATA0, $AESDATA0
    vpxor           $RNDKEY0, $AESDATA1, $AESDATA1
    lea             16($AESKEY), %rax
.Lvaesenc_loop_tail_2$local_label_suffix:
    vbroadcasti128  (%rax), $TMP0
    vaesenc         $TMP0, $AESDATA0, $AESDATA0
    vaesenc         $TMP0, $AESDATA1, $AESDATA1
    add             \$16, %rax
    cmp             %rax, $RNDKEYLAST_PTR
    jne             .Lvaesenc_loop_tail_2$local_label_suffix
    vaesenclast     $RNDKEYLAST, $AESDATA0, $AESDATA0
    vaesenclast     $RNDKEYLAST, $AESDATA1, $AESDATA1

    # XOR the remaining data with the keystream blocks, and update GHASH with
    # the remaining ciphertext blocks without reducing.

    cmp             \$32, $DATALEN
    jb              .Lxor_one_block$local_label_suffix
    je              .Lxor_two_blocks$local_label_suffix

.Lxor_three_blocks$local_label_suffix:
    vmovdqu         0($SRC), $TMP0
    vmovdqu         32($SRC), $TMP1_XMM
    vpxor           $TMP0, $AESDATA0, $AESDATA0
    vpxor           $TMP1_XMM, $AESDATA1_XMM, $AESDATA1_XMM
    vmovdqu         $AESDATA0, 0($DST)
    vmovdqu         $AESDATA1_XMM, 32($DST)

    vpshufb         $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0
    vpshufb         $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA1_XMM : $TMP1_XMM ]}, $AESDATA1_XMM
    vpxor           $GHASH_ACC, $AESDATA0, $AESDATA0
    vmovdqu         ($POWERS_PTR), $TMP0
    vmovdqu         32($POWERS_PTR), $TMP1_XMM
    vpclmulqdq      \$0x00, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
    vpxor           $TMP2, $LO, $LO
    vpclmulqdq      \$0x01, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x10, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x11, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
    vpxor           $TMP2, $HI, $HI
    jmp             .Lghash_mul_one_vec_unreduced$local_label_suffix

.Lxor_two_blocks$local_label_suffix:
    vmovdqu         ($SRC), $TMP0
    vpxor           $TMP0, $AESDATA0, $AESDATA0
    vmovdqu         $AESDATA0, ($DST)
    vpshufb         $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0
    vpxor           $GHASH_ACC, $AESDATA0, $AESDATA0
    vmovdqu         ($POWERS_PTR), $TMP0
    jmp             .Lghash_mul_one_vec_unreduced$local_label_suffix

.Lxor_one_block$local_label_suffix:
    vmovdqu         ($SRC), $TMP0_XMM
    vpxor           $TMP0_XMM, $AESDATA0_XMM, $AESDATA0_XMM
    vmovdqu         $AESDATA0_XMM, ($DST)
    vpshufb         $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA0_XMM : $TMP0_XMM ]}, $AESDATA0_XMM
    vpxor           $GHASH_ACC_XMM, $AESDATA0_XMM, $AESDATA0_XMM
    vmovdqu         ($POWERS_PTR), $TMP0_XMM

.Lghash_mul_one_vec_unreduced$local_label_suffix:
    vpclmulqdq      \$0x00, $TMP0, $AESDATA0, $TMP2
    vpxor           $TMP2, $LO, $LO
    vpclmulqdq      \$0x01, $TMP0, $AESDATA0, $TMP2
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x10, $TMP0, $AESDATA0, $TMP2
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x11, $TMP0, $AESDATA0, $TMP2
    vpxor           $TMP2, $HI, $HI

.Lreduce$local_label_suffix:
    # Finally, do the GHASH reduction.
    vbroadcasti128  .Lgfpoly(%rip), $TMP0
    vpclmulqdq      \$0x01, $LO, $TMP0, $TMP1
    vpshufd         \$0x4e, $LO, $LO
    vpxor           $LO, $MI, $MI
    vpxor           $TMP1, $MI, $MI
    vpclmulqdq      \$0x01, $MI, $TMP0, $TMP1
    vpshufd         \$0x4e, $MI, $MI
    vpxor           $MI, $HI, $HI
    vpxor           $TMP1, $HI, $HI
    vextracti128    \$1, $HI, $GHASH_ACC_XMM
    vpxor           $HI_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM

.Ldone$local_label_suffix:
    # Store the updated GHASH accumulator back to memory.
    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)

    vzeroupper
___
    return $code;
}

$code .= _begin_func "aes_gcm_enc_update_vaes_avx2", 1;
$code .= _aes_gcm_update 1;
$code .= _end_func;

$code .= _begin_func "aes_gcm_dec_update_vaes_avx2", 1;
$code .= _aes_gcm_update 0;
$code .= _end_func;

print $code;
close STDOUT or die "error closing STDOUT: $!";
exit 0;