#!/usr/bin/env perl
# Copyright 2024 The BoringSSL Authors
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#------------------------------------------------------------------------------
#
# VAES and VPCLMULQDQ optimized AES-GCM for x86_64
#
# This file is based on aes-gcm-avx10-x86_64.S from the Linux kernel
# (https://git.kernel.org/linus/b06affb1cb580e13). The following notable
# changes have been made:
#
# - Relicensed under BoringSSL's preferred license.
#
# - Converted from GNU assembler to "perlasm". This was necessary for
#   compatibility with BoringSSL's Windows builds which use NASM instead of the
#   GNU assembler. It was also necessary for compatibility with the 'delocate'
#   tool used in BoringSSL's FIPS builds.
#
# - Added support for the Windows ABI.
#
# - Changed function prototypes to be compatible with what BoringSSL wants.
#
# - Removed the optimized finalization function, as BoringSSL doesn't want it.
#
# - Added a single-block GHASH multiplication function, as BoringSSL needs this.
#
# - Added optimization for large amounts of AAD.
#
#------------------------------------------------------------------------------
#
# This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
# support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and
# either AVX512 or AVX10. Some of the functions, notably the encryption and
# decryption update functions which are the most performance-critical, are
# provided in two variants generated from a macro: one using 256-bit vectors
# (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The
# other, "shared" functions (vaes_avx10) use at most 256-bit vectors.
#
# The functions that use 512-bit vectors are intended for CPUs that support
# 512-bit vectors *and* where using them doesn't cause significant
# downclocking. They require the following CPU features:
#
#       VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512)
#
# The other functions require the following CPU features:
#
#       VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256)
#
# Note that we use "avx10" in the names of the functions as a shorthand to
# really mean "AVX10 or a certain set of AVX512 features". Due to Intel's
# introduction of AVX512 and then its replacement by AVX10, there doesn't seem
# to be a simple way to name things that makes sense on all CPUs.
#
# Note that the macros that support both 256-bit and 512-bit vectors could
# fairly easily be changed to support 128-bit too. However, this would *not*
# be sufficient to allow the code to run on CPUs without AVX512 or AVX10,
# because the code heavily uses several features of these extensions other than
# the vector length: the increase in the number of SIMD registers from 16 to
# 32, masking support, and new instructions such as vpternlogd (which can do a
# three-argument XOR). These features are very useful for AES-GCM.

$flavour = shift;
$output  = shift;
if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; }

if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) {
    $win64   = 1;
    @argregs = ( "%rcx", "%rdx", "%r8", "%r9" );
}
else {
    $win64   = 0;
    @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" );
}

$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate )
  or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate )
  or die "can't locate x86_64-xlate.pl";

open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;

sub _begin_func {
    my ( $funcname, $uses_seh ) = @_;
    $g_cur_func_name          = $funcname;
    $g_cur_func_uses_seh      = $uses_seh;
    @g_cur_func_saved_gpregs  = ();
    @g_cur_func_saved_xmmregs = ();
    return <<___;
.globl $funcname
.type $funcname,\@abi-omnipotent
.align 32
$funcname:
    .cfi_startproc
    @{[ $uses_seh ? ".seh_startproc" : "" ]}
    _CET_ENDBR
___
}

# Push a list of general purpose registers onto the stack.
sub _save_gpregs {
    my @gpregs = @_;
    my $code   = "";
    die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh;
    die "_save_gpregs can only be called once per function"
      if @g_cur_func_saved_gpregs;
    die "Order must be _save_gpregs, then _save_xmmregs"
      if @g_cur_func_saved_xmmregs;
    @g_cur_func_saved_gpregs = @gpregs;
    for my $reg (@gpregs) {
        $code .= "push $reg\n";
        if ($win64) {
            $code .= ".seh_pushreg $reg\n";
        }
        else {
            $code .= ".cfi_push $reg\n";
        }
    }
    return $code;
}

# Push a list of xmm registers onto the stack if the target is Windows.
sub _save_xmmregs {
    my @xmmregs     = @_;
    my $num_xmmregs = scalar @xmmregs;
    my $code        = "";
    die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh;
    die "_save_xmmregs can only be called once per function"
      if @g_cur_func_saved_xmmregs;
    if ( $win64 and $num_xmmregs > 0 ) {
        @g_cur_func_saved_xmmregs = @xmmregs;
        my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
        my $alloc_size    = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 );
        $code .= "sub \$$alloc_size, %rsp\n";
        $code .= ".seh_stackalloc $alloc_size\n";
        for my $i ( 0 .. $num_xmmregs - 1 ) {
            my $reg_num = $xmmregs[$i];
            my $pos     = 16 * $i;
            $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n";
            $code .= ".seh_savexmm %xmm$reg_num, $pos\n";
        }
    }
    return $code;
}

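# Note on the stack allocation in _save_xmmregs above: on entry %rsp is 8 mod
# 16 (the call pushed only the return address), and each saved GP register
# toggles that alignment. So when an even number of GP registers has been
# pushed, %rsp is still misaligned and 8 extra bytes are allocated so that the
# movdqa/.seh_savexmm slots are 16-byte aligned.
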
sub _end_func {
    my $code = "";

    # Restore any xmm registers that were saved earlier.
    my $num_xmmregs = scalar @g_cur_func_saved_xmmregs;
    if ( $win64 and $num_xmmregs > 0 ) {
        my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
        my $alloc_size     = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 );
        for my $i ( 0 .. $num_xmmregs - 1 ) {
            my $reg_num = $g_cur_func_saved_xmmregs[$i];
            my $pos     = 16 * $i;
            $code .= "movdqa $pos(%rsp), %xmm$reg_num\n";
        }
        $code .= "add \$$alloc_size, %rsp\n";
    }

    # Restore any general purpose registers that were saved earlier.
    for my $reg ( reverse @g_cur_func_saved_gpregs ) {
        $code .= "pop $reg\n";
        if ( !$win64 ) {
            $code .= ".cfi_pop $reg\n";
        }
    }

    $code .= <<___;
    ret
    @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]}
    .cfi_endproc
    .size $g_cur_func_name, . - $g_cur_func_name
___
    return $code;
}

$code = <<___;
.section .rodata
.align 64

    # A shuffle mask that reflects the bytes of 16-byte blocks
.Lbswap_mask:
    .quad 0x08090a0b0c0d0e0f, 0x0001020304050607

    # This is the GHASH reducing polynomial without its constant term, i.e.
    # x^128 + x^7 + x^2 + x, represented using the backwards mapping
    # between bits and polynomial coefficients.
    #
    # Alternatively, it can be interpreted as the naturally-ordered
    # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
    # "reversed" GHASH reducing polynomial without its x^128 term.
.Lgfpoly:
    .quad 1, 0xc200000000000000

    # Same as above, but with the (1 << 64) bit set.
.Lgfpoly_and_internal_carrybit:
    .quad 1, 0xc200000000000001

    # The below constants are used for incrementing the counter blocks.
    # ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
    # inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
    # 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks.
.Lctr_pattern:
    .quad 0, 0
    .quad 1, 0
.Linc_2blocks:
    .quad 2, 0
    .quad 3, 0
.Linc_4blocks:
    .quad 4, 0

.text
___

# Number of powers of the hash key stored in the key struct. The powers are
# stored from highest (H^NUM_H_POWERS) to lowest (H^1).
$NUM_H_POWERS = 16;

$OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16;

# Offset to 'rounds' in AES_KEY struct
$OFFSETOF_AES_ROUNDS = 240;

# The current vector length in bytes
undef $VL;

# Set the vector length in bytes. This sets the VL variable and defines
# register aliases V0-V31 that map to the ymm or zmm registers.
sub _set_veclen {
    ($VL) = @_;
    foreach my $i ( 0 .. 31 ) {
        if ( $VL == 32 ) {
            ${"V${i}"} = "%ymm${i}";
        }
        elsif ( $VL == 64 ) {
            ${"V${i}"} = "%zmm${i}";
        }
        else {
            die "Unsupported vector length";
        }
    }
}

# The _ghash_mul_step macro does one step of GHASH multiplication of the
# 128-bit lanes of \a by the corresponding 128-bit lanes of \b and stores the
# reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the
# same size as \a and \b. To complete all steps, this must be invoked with
# \i=0 through \i=9. The division into steps allows users of this macro to
# optionally interleave the computation with other instructions. Users of this
# macro must preserve the parameter registers across steps.
#
# The multiplications are done in GHASH's representation of the finite field
# GF(2^128). Elements of GF(2^128) are represented as binary polynomials
# (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial
# G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is
# just XOR, while multiplication is more complex and has two parts: (a) do
# carryless multiplication of two 128-bit input polynomials to get a 256-bit
# intermediate product polynomial, and (b) reduce the intermediate product to
# 128 bits by adding multiples of G that cancel out terms in it. (Adding
# multiples of G doesn't change which field element the polynomial represents.)
#
# Unfortunately, the GCM specification maps bits to/from polynomial
# coefficients backwards from the natural order. In each byte it specifies the
# highest bit to be the lowest order polynomial coefficient, *not* the highest!
# This makes it nontrivial to work with the GHASH polynomials. We could
# reflect the bits, but x86 doesn't have an instruction that does that.
#
# Instead, we operate on the values without bit-reflecting them. This *mostly*
# just works, since XOR and carryless multiplication are symmetric with respect
# to bit order, but it has some consequences. First, due to GHASH's byte
# order, by skipping bit reflection, *byte* reflection becomes necessary to
# give the polynomial terms a consistent order. E.g., considering an N-bit
# value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0
# through N-1 of the byte-reflected value represent the coefficients of x^(N-1)
# through x^0, whereas bits 0 through N-1 of the non-byte-reflected value
# represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked
# with. Fortunately, x86's vpshufb instruction can do byte reflection.
#
# Second, forgoing the bit reflection causes an extra multiple of x (still
# using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each
# multiplication. This is because an M-bit by N-bit carryless multiplication
# really produces an (M+N-1)-bit product, but in practice it's zero-extended to
# M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits
# to polynomial coefficients backwards, this zero-extension actually changes
# the product by introducing an extra factor of x. Therefore, users of this
# macro must ensure that one of the inputs has an extra factor of x^-1, i.e.
# the multiplicative inverse of x, to cancel out the extra x.
#
# Third, the backwards coefficients convention is just confusing to work with,
# since it makes "low" and "high" in the polynomial math mean the opposite of
# their normal meaning in computer programming. This can be solved by using an
# alternative interpretation: the polynomial coefficients are understood to be
# in the natural order, and the multiplication is actually \a * \b * x^-128 mod
# x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs,
# or the implementation at all; it just changes the mathematical interpretation
# of what each instruction is doing. Starting from here, we'll use this
# alternative interpretation, as it's easier to understand the code that way.
#
# Moving on to the implementation, the vpclmulqdq instruction does 64 x 64 =>
# 128-bit carryless multiplication, so we break the 128 x 128 multiplication
# into parts as follows (the _L and _H suffixes denote low and high 64 bits):
#
#     LO = a_L * b_L
#     MI = (a_L * b_H) + (a_H * b_L)
#     HI = a_H * b_H
#
# The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit.
# Note that MI "overlaps" with LO and HI. We don't consolidate MI into LO and
# HI right away, since the way the reduction works makes that unnecessary.
#
# For the reduction, we cancel out the low 128 bits by adding multiples of G =
# x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of
# which cancels out the next lowest 64 bits. Consider a value x^64*A + B,
# where A and B are 128-bit. Adding B_L*G to that value gives:
#
#       x^64*A + B + B_L*G
#     = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1)
#     = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L
#     = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L
#     = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57))
#
# So: if we sum A, B with its halves swapped, and the low half of B times x^63
# + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the
# original value x^64*A + B. I.e., the low 64 bits got canceled out.
#
# We just need to apply this twice: first to fold LO into MI, and second to
# fold the updated MI into HI.
#
# The needed three-argument XORs are done using the vpternlogd instruction with
# immediate 0x96, since this is faster than two vpxord instructions.
#
# A potential optimization, assuming that b is fixed per-key (if a is fixed
# per-key it would work the other way around), is to use one iteration of the
# reduction described above to precompute a value c such that x^64*c = b mod G,
# and then multiply a_L by c (and implicitly by x^64) instead of by b:
#
#     MI = (a_L * c_L) + (a_H * b_L)
#     HI = (a_L * c_H) + (a_H * b_H)
#
# This would eliminate the LO part of the intermediate product, which would
# eliminate the need to fold LO into MI. This would save two instructions,
# including a vpclmulqdq. However, we currently don't use this optimization
# because it would require twice as many per-key precomputed values.
#
# Using Karatsuba multiplication instead of "schoolbook" multiplication
# similarly would save a vpclmulqdq but does not seem to be worth it.
sub _ghash_mul_step {
    my ( $i, $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
    if ( $i == 0 ) {
        return "vpclmulqdq \$0x00, $a, $b, $t0\n" .   # LO = a_L * b_L
               "vpclmulqdq \$0x01, $a, $b, $t1\n";    # MI_0 = a_L * b_H
    }
    elsif ( $i == 1 ) {
        return "vpclmulqdq \$0x10, $a, $b, $t2\n";    # MI_1 = a_H * b_L
    }
    elsif ( $i == 2 ) {
        return "vpxord $t2, $t1, $t1\n";              # MI = MI_0 + MI_1
    }
    elsif ( $i == 3 ) {
        return
          "vpclmulqdq \$0x01, $t0, $gfpoly, $t2\n";   # LO_L*(x^63 + x^62 + x^57)
    }
    elsif ( $i == 4 ) {
        return "vpshufd \$0x4e, $t0, $t0\n";          # Swap halves of LO
    }
    elsif ( $i == 5 ) {
        return "vpternlogd \$0x96, $t2, $t0, $t1\n";  # Fold LO into MI
    }
    elsif ( $i == 6 ) {
        return "vpclmulqdq \$0x11, $a, $b, $dst\n";   # HI = a_H * b_H
    }
    elsif ( $i == 7 ) {
        return
          "vpclmulqdq \$0x01, $t1, $gfpoly, $t0\n";   # MI_L*(x^63 + x^62 + x^57)
    }
    elsif ( $i == 8 ) {
        return "vpshufd \$0x4e, $t1, $t1\n";          # Swap halves of MI
    }
    elsif ( $i == 9 ) {
        return "vpternlogd \$0x96, $t0, $t1, $dst\n"; # Fold MI into HI
    }
}
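# For reference, restating the convention the per-step comments above rely on:
# with the AT&T operand order used here, i.e. 'vpclmulqdq $imm, $a, $b, $dst',
# bit 4 of the immediate selects the low or high qword of $a and bit 0 selects
# the low or high qword of $b. Hence 0x00 => a_L*b_L, 0x01 => a_L*b_H,
# 0x10 => a_H*b_L, and 0x11 => a_H*b_H.
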
# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
# the reduced products in \dst. See _ghash_mul_step for full explanation.
sub _ghash_mul {
    my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
    my $code = "";
    for my $i ( 0 .. 9 ) {
        $code .= _ghash_mul_step $i, $a, $b, $dst, $gfpoly, $t0, $t1, $t2;
    }
    return $code;
}

# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
# *unreduced* products to \lo, \mi, and \hi.
sub _ghash_mul_noreduce {
    my ( $a, $b, $lo, $mi, $hi, $t0, $t1, $t2, $t3 ) = @_;
    return <<___;
    vpclmulqdq \$0x00, $a, $b, $t0    # a_L * b_L
    vpclmulqdq \$0x01, $a, $b, $t1    # a_L * b_H
    vpclmulqdq \$0x10, $a, $b, $t2    # a_H * b_L
    vpclmulqdq \$0x11, $a, $b, $t3    # a_H * b_H
    vpxord $t0, $lo, $lo
    vpternlogd \$0x96, $t2, $t1, $mi
    vpxord $t3, $hi, $hi
___
}

# Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
# reduced products in \hi. See _ghash_mul_step for explanation of reduction.
sub _ghash_reduce {
    my ( $lo, $mi, $hi, $gfpoly, $t0 ) = @_;
    return <<___;
    vpclmulqdq \$0x01, $lo, $gfpoly, $t0
    vpshufd \$0x4e, $lo, $lo
    vpternlogd \$0x96, $t0, $lo, $mi
    vpclmulqdq \$0x01, $mi, $gfpoly, $t0
    vpshufd \$0x4e, $mi, $mi
    vpternlogd \$0x96, $t0, $mi, $hi
___
}

$g_init_macro_expansion_count = 0;

# void gcm_init_##suffix(u128 Htable[16], const uint64_t H[2]);
#
# Initialize |Htable| with powers of the GHASH subkey |H|.
#
# The powers are stored in the order H^NUM_H_POWERS to H^1.
#
# This macro supports both VL=32 and VL=64. _set_veclen must have been invoked
# with the desired length. In the VL=32 case, the function computes twice as
# many key powers as are actually used by the VL=32 GCM update functions.
# This is done to keep the key format the same regardless of vector length.
sub _aes_gcm_init {
    my $local_label_suffix = "__func" . ++$g_init_macro_expansion_count;

    # Function arguments
    my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ];

    # Additional local variables. V0-V2 and %rax are used as temporaries.
    my $POWERS_PTR     = "%r8";
    my $RNDKEYLAST_PTR = "%r9";
    my ( $H_CUR, $H_CUR_YMM, $H_CUR_XMM )    = ( "$V3", "%ymm3", "%xmm3" );
    my ( $H_INC, $H_INC_YMM, $H_INC_XMM )    = ( "$V4", "%ymm4", "%xmm4" );
    my ( $GFPOLY, $GFPOLY_YMM, $GFPOLY_XMM ) = ( "$V5", "%ymm5", "%xmm5" );

    my $code = <<___;
    # Get pointer to lowest set of key powers (located at end of array).
    lea $OFFSETOFEND_H_POWERS-$VL($HTABLE), $POWERS_PTR

    # Load the byte-reflected hash subkey. BoringSSL provides it in
    # byte-reflected form except the two halves are in the wrong order.
    vpshufd \$0x4e, ($H_PTR), $H_CUR_XMM

    # Finish preprocessing the first key power, H^1. Since this GHASH
    # implementation operates directly on values with the backwards bit
    # order specified by the GCM standard, it's necessary to preprocess the
    # raw key as follows. First, reflect its bytes. Second, multiply it
    # by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards
    # interpretation of polynomial coefficients), which can also be
    # interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121
    # + 1 using the alternative, natural interpretation of polynomial
    # coefficients. For details, see the comment above _ghash_mul_step.
    #
    # Either way, for the multiplication the concrete operation performed
    # is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
    # << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit
    # wide shift instruction, so instead double each of the two 64-bit
    # halves and incorporate the internal carry bit into the value XOR'd.
    vpshufd \$0xd3, $H_CUR_XMM, %xmm0
    vpsrad \$31, %xmm0, %xmm0
    vpaddq $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM
    # H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit
    vpternlogd \$0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, $H_CUR_XMM

    # Load the gfpoly constant.
    vbroadcasti32x4 .Lgfpoly(%rip), $GFPOLY

    # Square H^1 to get H^2.
    #
    # Note that as with H^1, all higher key powers also need an extra
    # factor of x^-1 (or x using the natural interpretation). Nothing
    # special needs to be done to make this happen, though: H^1 * H^1 would
    # end up with two factors of x^-1, but the multiplication consumes one.
    # So the product H^2 ends up with the desired one factor of x^-1.
    @{[ _ghash_mul $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM,
                   "%xmm0", "%xmm1", "%xmm2" ]}

    # Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
    vinserti128 \$1, $H_CUR_XMM, $H_INC_YMM, $H_CUR_YMM
    vinserti128 \$1, $H_INC_XMM, $H_INC_YMM, $H_INC_YMM
___

    if ( $VL == 64 ) {

        # Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
        $code .= <<___;
    @{[ _ghash_mul $H_INC_YMM, $H_CUR_YMM, $H_INC_YMM, $GFPOLY_YMM,
                   "%ymm0", "%ymm1", "%ymm2" ]}
    vinserti64x4 \$1, $H_CUR_YMM, $H_INC, $H_CUR
    vshufi64x2 \$0, $H_INC, $H_INC, $H_INC
___
    }

    $code .= <<___;
    # Store the lowest set of key powers.
    vmovdqu8 $H_CUR, ($POWERS_PTR)

    # Compute and store the remaining key powers. With VL=32, repeatedly
    # multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)].
    # With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
    # [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
    mov \$@{[ $NUM_H_POWERS*16/$VL - 1 ]}, %eax
.Lprecompute_next$local_label_suffix:
    sub \$$VL, $POWERS_PTR
    @{[ _ghash_mul $H_INC, $H_CUR, $H_CUR, $GFPOLY, $V0, $V1, $V2 ]}
    vmovdqu8 $H_CUR, ($POWERS_PTR)
    dec %eax
    jnz .Lprecompute_next$local_label_suffix

    vzeroupper    # This is needed after using ymm or zmm registers.
___
    return $code;
}

# XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
# the result in \dst_xmm. This implicitly zeroizes the other lanes of dst.
sub _horizontal_xor {
    my ( $src, $src_xmm, $dst_xmm, $t0_xmm, $t1_xmm, $t2_xmm ) = @_;
    if ( $VL == 32 ) {
        return <<___;
    vextracti32x4 \$1, $src, $t0_xmm
    vpxord $t0_xmm, $src_xmm, $dst_xmm
___
    }
    elsif ( $VL == 64 ) {
        return <<___;
    vextracti32x4 \$1, $src, $t0_xmm
    vextracti32x4 \$2, $src, $t1_xmm
    vextracti32x4 \$3, $src, $t2_xmm
    vpxord $t0_xmm, $src_xmm, $dst_xmm
    vpternlogd \$0x96, $t1_xmm, $t2_xmm, $dst_xmm
___
    }
    else {
        die "Unsupported vector length";
    }
}

# Do one step of the GHASH update of the data blocks given in the vector
# registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The
# division into steps allows users of this macro to optionally interleave the
# computation with other instructions. This macro uses the vector register
# GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered;
# H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and
# GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the
# data blocks. The parameter registers must be preserved across steps.
#
# The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) +
# H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the
# operations are vectorized operations on vectors of 16-byte blocks. E.g.,
# with VL=32 there are 2 blocks per vector and the vectorized terms correspond
# to the following non-vectorized terms:
#
#     H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0)
#     H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3
#     H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5
#     H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7
#
# With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15.
#
# More concretely, this code does:
#   - Do vectorized "schoolbook" multiplications to compute the intermediate
#     256-bit product of each block and its corresponding hash key power.
#     There are 4*VL/16 of these intermediate products.
#   - Sum (XOR) the intermediate 256-bit products across vectors. This leaves
#     VL/16 256-bit intermediate values.
#   - Do a vectorized reduction of these 256-bit intermediate values to
#     128-bits each. This leaves VL/16 128-bit intermediate values.
#   - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
#
# See _ghash_mul_step for the full explanation of the operations performed for
# each individual finite field multiplication and reduction.
sub _ghash_step_4x {
    my ($i) = @_;
    if ( $i == 0 ) {
        return <<___;
    vpshufb $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
    vpxord $GHASH_ACC, $GHASHDATA0, $GHASHDATA0
    vpshufb $BSWAP_MASK, $GHASHDATA1, $GHASHDATA1
    vpshufb $BSWAP_MASK, $GHASHDATA2, $GHASHDATA2
___
    }
    elsif ( $i == 1 ) {
        return <<___;
    vpshufb $BSWAP_MASK, $GHASHDATA3, $GHASHDATA3
    vpclmulqdq \$0x00, $H_POW4, $GHASHDATA0, $GHASH_ACC    # LO_0
    vpclmulqdq \$0x00, $H_POW3, $GHASHDATA1, $GHASHTMP0    # LO_1
    vpclmulqdq \$0x00, $H_POW2, $GHASHDATA2, $GHASHTMP1    # LO_2
___
    }
    elsif ( $i == 2 ) {
        return <<___;
    vpxord $GHASHTMP0, $GHASH_ACC, $GHASH_ACC              # sum(LO_{1,0})
    vpclmulqdq \$0x00, $H_POW1, $GHASHDATA3, $GHASHTMP2    # LO_3
    vpternlogd \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASH_ACC  # LO = sum(LO_{3,2,1,0})
    vpclmulqdq \$0x01, $H_POW4, $GHASHDATA0, $GHASHTMP0    # MI_0
___
    }
    elsif ( $i == 3 ) {
        return <<___;
    vpclmulqdq \$0x01, $H_POW3, $GHASHDATA1, $GHASHTMP1    # MI_1
    vpclmulqdq \$0x01, $H_POW2, $GHASHDATA2, $GHASHTMP2    # MI_2
    vpternlogd \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0  # sum(MI_{2,1,0})
    vpclmulqdq \$0x01, $H_POW1, $GHASHDATA3, $GHASHTMP1    # MI_3
___
    }
    elsif ( $i == 4 ) {
        return <<___;
    vpclmulqdq \$0x10, $H_POW4, $GHASHDATA0, $GHASHTMP2    # MI_4
    vpternlogd \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0  # sum(MI_{4,3,2,1,0})
    vpclmulqdq \$0x10, $H_POW3, $GHASHDATA1, $GHASHTMP1    # MI_5
    vpclmulqdq \$0x10, $H_POW2, $GHASHDATA2, $GHASHTMP2    # MI_6
___
    }
    elsif ( $i == 5 ) {
        return <<___;
    vpternlogd \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0  # sum(MI_{6,5,4,3,2,1,0})
    vpclmulqdq \$0x01, $GHASH_ACC, $GFPOLY, $GHASHTMP2     # LO_L*(x^63 + x^62 + x^57)
    vpclmulqdq \$0x10, $H_POW1, $GHASHDATA3, $GHASHTMP1    # MI_7
    vpxord $GHASHTMP1, $GHASHTMP0, $GHASHTMP0              # MI = sum(MI_{7,6,5,4,3,2,1,0})
___
    }
    elsif ( $i == 6 ) {
        return <<___;
    vpshufd \$0x4e, $GHASH_ACC, $GHASH_ACC                 # Swap halves of LO
    vpclmulqdq \$0x11, $H_POW4, $GHASHDATA0, $GHASHDATA0   # HI_0
    vpclmulqdq \$0x11, $H_POW3, $GHASHDATA1, $GHASHDATA1   # HI_1
    vpclmulqdq \$0x11, $H_POW2, $GHASHDATA2, $GHASHDATA2   # HI_2
___
    }
    elsif ( $i == 7 ) {
        return <<___;
    vpternlogd \$0x96, $GHASHTMP2, $GHASH_ACC, $GHASHTMP0  # Fold LO into MI
    vpclmulqdq \$0x11, $H_POW1, $GHASHDATA3, $GHASHDATA3   # HI_3
    vpternlogd \$0x96, $GHASHDATA2, $GHASHDATA1, $GHASHDATA0 # sum(HI_{2,1,0})
    vpclmulqdq \$0x01, $GHASHTMP0, $GFPOLY, $GHASHTMP1     # MI_L*(x^63 + x^62 + x^57)
___
    }
    elsif ( $i == 8 ) {
        return <<___;
    vpxord $GHASHDATA3, $GHASHDATA0, $GHASH_ACC            # HI = sum(HI_{3,2,1,0})
    vpshufd \$0x4e, $GHASHTMP0, $GHASHTMP0                 # Swap halves of MI
    vpternlogd \$0x96, $GHASHTMP1, $GHASHTMP0, $GHASH_ACC  # Fold MI into HI
___
    }
    elsif ( $i == 9 ) {
        return _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
          $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM;
    }
}

# Update GHASH with the blocks given in GHASHDATA[0-3].
# See _ghash_step_4x for full explanation.
sub _ghash_4x {
    my $code = "";
    for my $i ( 0 .. 9 ) {
        $code .= _ghash_step_4x $i;
    }
    return $code;
}

$g_ghash_macro_expansion_count = 0;

# void gcm_ghash_##suffix(uint8_t Xi[16], const u128 Htable[16],
#                         const uint8_t *in, size_t len);
#
# This macro generates the body of a GHASH update function with the above
# prototype. This macro supports both VL=32 and VL=64. _set_veclen must have
# been invoked with the desired length.
#
# The generated function processes the AAD (Additional Authenticated Data) in
# GCM. Using the key |Htable|, it updates the GHASH accumulator |Xi| with the
# data given by |in| and |len|. On the first call, |Xi| must be all zeroes.
# |len| must be a multiple of 16.
#
# This function handles large amounts of AAD efficiently, while also keeping the
# overhead low for small amounts of AAD, which is the common case. TLS uses less
# than one block of AAD, but (uncommonly) other use cases may use much more.
sub _ghash_update {
    my $local_label_suffix = "__func" . ++$g_ghash_macro_expansion_count;
    my $code = "";

    # Function arguments
    my ( $GHASH_ACC_PTR, $H_POWERS, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];

    # Additional local variables
    ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V0, "%xmm0" );
    ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V1, "%xmm1" );
    ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V2, "%xmm2" );
    ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V3, "%xmm3" );
    ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( $V4, "%xmm4" );
    ( $GHASH_ACC, $GHASH_ACC_XMM )   = ( $V5, "%xmm5" );
    ( $H_POW4, $H_POW3, $H_POW2 )    = ( $V6, $V7, $V8 );
    ( $H_POW1, $H_POW1_XMM )         = ( $V9, "%xmm9" );
    ( $GFPOLY, $GFPOLY_XMM )         = ( $V10, "%xmm10" );
    ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V11, $V12, $V13 );

    $code .= <<___;
    @{[ _save_xmmregs (6 .. 13) ]}
    .seh_endprologue

    # Load the bswap_mask and gfpoly constants. Since AADLEN is usually small,
    # usually only 128-bit vectors will be used. So as an optimization, don't
    # broadcast these constants to all 128-bit lanes quite yet.
    vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK_XMM
    vmovdqu .Lgfpoly(%rip), $GFPOLY_XMM

    # Load the GHASH accumulator.
    vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
    vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM

    # Optimize for AADLEN < VL by checking for AADLEN < VL before AADLEN < 4*VL.
    cmp \$$VL, $AADLEN
    jb .Laad_blockbyblock$local_label_suffix

    # AADLEN >= VL, so we'll operate on full vectors. Broadcast bswap_mask and
    # gfpoly to all 128-bit lanes.
    vshufi64x2 \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
    vshufi64x2 \$0, $GFPOLY, $GFPOLY, $GFPOLY

    # Load the lowest set of key powers.
    vmovdqu8 $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1

    cmp \$4*$VL-1, $AADLEN
    jbe .Laad_loop_1x$local_label_suffix

    # AADLEN >= 4*VL. Load the higher key powers.
    vmovdqu8 $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
    vmovdqu8 $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
    vmovdqu8 $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2

    # Update GHASH with 4*VL bytes of AAD at a time.
.Laad_loop_4x$local_label_suffix:
    vmovdqu8 0*$VL($AAD), $GHASHDATA0
    vmovdqu8 1*$VL($AAD), $GHASHDATA1
    vmovdqu8 2*$VL($AAD), $GHASHDATA2
    vmovdqu8 3*$VL($AAD), $GHASHDATA3
    @{[ _ghash_4x ]}
    sub \$-4*$VL, $AAD    # shorter than 'add 4*VL' when VL=32
    add \$-4*$VL, $AADLEN
    cmp \$4*$VL-1, $AADLEN
    ja .Laad_loop_4x$local_label_suffix

    # Update GHASH with VL bytes of AAD at a time.
    cmp \$$VL, $AADLEN
    jb .Laad_large_done$local_label_suffix
.Laad_loop_1x$local_label_suffix:
    vmovdqu8 ($AAD), $GHASHDATA0
    vpshufb $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
    vpxord $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
    @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
                   $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
    @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
                        $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
    add \$$VL, $AAD
    sub \$$VL, $AADLEN
    cmp \$$VL, $AADLEN
    jae .Laad_loop_1x$local_label_suffix

.Laad_large_done$local_label_suffix:
    # Issue the vzeroupper that is needed after using ymm or zmm registers.
    # Do it here instead of at the end, to minimize overhead for small AADLEN.
    vzeroupper

    # GHASH the remaining data 16 bytes at a time, using xmm registers only.
.Laad_blockbyblock$local_label_suffix:
    test $AADLEN, $AADLEN
    jz .Laad_done$local_label_suffix
    vmovdqu $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1_XMM
.Laad_loop_blockbyblock$local_label_suffix:
    vmovdqu ($AAD), $GHASHDATA0_XMM
    vpshufb $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM
    vpxor $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    @{[ _ghash_mul $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
                   $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
    add \$16, $AAD
    sub \$16, $AADLEN
    jnz .Laad_loop_blockbyblock$local_label_suffix

.Laad_done$local_label_suffix:
    # Store the updated GHASH accumulator back to memory.
    vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
___
    return $code;
}

# Do one non-last round of AES encryption on the counter blocks in V0-V3 using
# the round key that has been broadcast to all 128-bit lanes of \round_key.
sub _vaesenc_4x {
    my ($round_key) = @_;
    return <<___;
    vaesenc $round_key, $V0, $V0
    vaesenc $round_key, $V1, $V1
    vaesenc $round_key, $V2, $V2
    vaesenc $round_key, $V3, $V3
___
}

# Start the AES encryption of four vectors of counter blocks.
sub _ctr_begin_4x {
    return <<___;
    # Increment LE_CTR four times to generate four vectors of little-endian
    # counter blocks, swap each to big-endian, and store them in V0-V3.
    vpshufb $BSWAP_MASK, $LE_CTR, $V0
    vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
    vpshufb $BSWAP_MASK, $LE_CTR, $V1
    vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
    vpshufb $BSWAP_MASK, $LE_CTR, $V2
    vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
    vpshufb $BSWAP_MASK, $LE_CTR, $V3
    vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR

    # AES "round zero": XOR in the zero-th round key.
    vpxord $RNDKEY0, $V0, $V0
    vpxord $RNDKEY0, $V1, $V1
    vpxord $RNDKEY0, $V2, $V2
    vpxord $RNDKEY0, $V3, $V3
___
}

# Do the last AES round for four vectors of counter blocks V0-V3, XOR source
# data with the resulting keystream, and write the result to DST and
# GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.)
sub _aesenclast_and_xor_4x {
    return <<___;
    # XOR the source data with the last round key, saving the result in
    # GHASHDATA[0-3]. This reduces latency by taking advantage of the
    # property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
    vpxord 0*$VL($SRC), $RNDKEYLAST, $GHASHDATA0
    vpxord 1*$VL($SRC), $RNDKEYLAST, $GHASHDATA1
    vpxord 2*$VL($SRC), $RNDKEYLAST, $GHASHDATA2
    vpxord 3*$VL($SRC), $RNDKEYLAST, $GHASHDATA3

    # Do the last AES round. This handles the XOR with the source data
    # too, as per the optimization described above.
    vaesenclast $GHASHDATA0, $V0, $GHASHDATA0
    vaesenclast $GHASHDATA1, $V1, $GHASHDATA1
    vaesenclast $GHASHDATA2, $V2, $GHASHDATA2
    vaesenclast $GHASHDATA3, $V3, $GHASHDATA3

    # Store the en/decrypted data to DST.
    vmovdqu8 $GHASHDATA0, 0*$VL($DST)
    vmovdqu8 $GHASHDATA1, 1*$VL($DST)
    vmovdqu8 $GHASHDATA2, 2*$VL($DST)
    vmovdqu8 $GHASHDATA3, 3*$VL($DST)
___
}

$g_update_macro_expansion_count = 0;

# void aes_gcm_{enc,dec}_update_##suffix(const uint8_t *in, uint8_t *out,
#                                        size_t len, const AES_KEY *key,
#                                        const uint8_t ivec[16],
#                                        const u128 Htable[16],
#                                        uint8_t Xi[16]);
#
# This macro generates a GCM encryption or decryption update function with the
# above prototype (with \enc selecting which one). This macro supports both
# VL=32 and VL=64. _set_veclen must have been invoked with the desired length.
#
# This function computes the next portion of the CTR keystream, XOR's it with
# |len| bytes from |in|, and writes the resulting encrypted or decrypted data
# to |out|. It also updates the GHASH accumulator |Xi| using the next |len|
# ciphertext bytes.
#
# |len| must be a multiple of 16, except on the last call where it can be any
# length. The caller must do any buffering needed to ensure this. Both
# in-place and out-of-place en/decryption are supported.
#
# |ivec| must give the current counter in big-endian format. This function
# loads the counter from |ivec| and increments the loaded counter as needed, but
# it does *not* store the updated counter back to |ivec|. The caller must
# update |ivec| if any more data segments follow. Internally, only the low
# 32-bit word of the counter is incremented, following the GCM standard.
sub _aes_gcm_update {
    my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count;

    my ($enc) = @_;

    my $code = "";

    # Function arguments
    ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $H_POWERS, $GHASH_ACC_PTR ) =
      $win64
      ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" )
      : ( @argregs[ 0 .. 5 ], "%r12" );

    # Additional local variables

    # %rax, %k1, and %k2 are used as temporary registers. BE_CTR_PTR is
    # also available as a temporary register after the counter is loaded.

    # AES key length in bytes
    ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" );

    # Pointer to the last AES round key for the chosen AES variant
    $RNDKEYLAST_PTR = "%r11";

    # In the main loop, V0-V3 are used as AES input and output. Elsewhere
    # they are used as temporary registers.

    # GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
    ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V4, "%xmm4" );
    ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V5, "%xmm5" );
    ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V6, "%xmm6" );
    ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V7, "%xmm7" );

    # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
    # using vpshufb, copied to all 128-bit lanes.
    ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( $V8, "%xmm8" );

    # RNDKEY temporarily holds the next AES round key.
    $RNDKEY = $V9;

    # GHASH_ACC is the accumulator variable for GHASH. When fully reduced,
    # only the lowest 128-bit lane can be nonzero. When not fully reduced,
    # more than one lane may be used, and they need to be XOR'd together.
    ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( $V10, "%xmm10" );

    # LE_CTR_INC is the vector of 32-bit words that need to be added to a
    # vector of little-endian counter blocks to advance it forwards.
    $LE_CTR_INC = $V11;

    # LE_CTR contains the next set of little-endian counter blocks.
    $LE_CTR = $V12;

    # RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
    # copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
    # RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
    (
        $RNDKEY0,   $RNDKEYLAST, $RNDKEY_M9, $RNDKEY_M8,
        $RNDKEY_M7, $RNDKEY_M6,  $RNDKEY_M5, $RNDKEY_M4,
        $RNDKEY_M3, $RNDKEY_M2,  $RNDKEY_M1
    ) = ( $V13, $V14, $V15, $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23 );

    # GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These
    # cannot coincide with anything used for AES encryption, since for
    # performance reasons GHASH and AES encryption are interleaved.
    ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V24, $V25, $V26 );

    # H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The
    # descending numbering reflects the order of the key powers.
    ( $H_POW4, $H_POW3, $H_POW2, $H_POW1 ) = ( $V27, $V28, $V29, $V30 );

    # GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
    $GFPOLY = $V31;

    if ($win64) {
        $code .= <<___;
    @{[ _save_gpregs $BE_CTR_PTR, $H_POWERS, $GHASH_ACC_PTR ]}
    mov 64(%rsp), $BE_CTR_PTR         # arg5
    mov 72(%rsp), $H_POWERS           # arg6
    mov 80(%rsp), $GHASH_ACC_PTR      # arg7
    @{[ _save_xmmregs (6 .. 15) ]}
    .seh_endprologue
___
    }
    else {
        $code .= <<___;
    @{[ _save_gpregs $GHASH_ACC_PTR ]}
    mov 16(%rsp), $GHASH_ACC_PTR      # arg7
___
    }

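    # A note on the stack offsets used above: on Windows, stack arguments 5-7
    # are at 40, 48, and 56(%rsp) on entry (past the 32-byte shadow space and
    # the return address), and the three pushes done by _save_gpregs move them
    # to 64, 72, and 80(%rsp). On SysV, argument 7 starts at 8(%rsp) and the
    # single push moves it to 16(%rsp). These loads happen before
    # _save_xmmregs adjusts %rsp any further.
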
    if ($enc) {
        $code .= <<___;
#ifdef BORINGSSL_DISPATCH_TEST
    .extern BORINGSSL_function_hit
    movb \$1,BORINGSSL_function_hit+@{[ $VL < 64 ? 6 : 7 ]}(%rip)
#endif
___
    }
    $code .= <<___;
    # Load some constants.
    vbroadcasti32x4 .Lbswap_mask(%rip), $BSWAP_MASK
    vbroadcasti32x4 .Lgfpoly(%rip), $GFPOLY

    # Load the GHASH accumulator and the starting counter.
    # BoringSSL passes these values in big endian format.
    vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC_XMM
    vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    vbroadcasti32x4 ($BE_CTR_PTR), $LE_CTR
    vpshufb $BSWAP_MASK, $LE_CTR, $LE_CTR

    # Load the AES key length in bytes. BoringSSL stores number of rounds
    # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20.
    movl $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN
    lea -20(,$AESKEYLEN,4), $AESKEYLEN

    # Make RNDKEYLAST_PTR point to the last AES round key. This is the
    # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
    # respectively. Then load the zero-th and last round keys.
    lea 6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR
    vbroadcasti32x4 ($AESKEY), $RNDKEY0
    vbroadcasti32x4 ($RNDKEYLAST_PTR), $RNDKEYLAST

    # Finish initializing LE_CTR by adding [0, 1, ...] to its low words.
    vpaddd .Lctr_pattern(%rip), $LE_CTR, $LE_CTR

    # Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
    vbroadcasti32x4 .Linc_@{[ $VL / 16 ]}blocks(%rip), $LE_CTR_INC

    # If there are at least 4*VL bytes of data, then continue into the loop
    # that processes 4*VL bytes of data at a time. Otherwise skip it.
    cmp \$4*$VL-1, $DATALEN
    jbe .Lcrypt_loop_4x_done$local_label_suffix

    # Load powers of the hash key.
    vmovdqu8 $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
    vmovdqu8 $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
    vmovdqu8 $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2
    vmovdqu8 $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1
___

    # Main loop: en/decrypt and hash 4 vectors at a time.
    #
    # When possible, interleave the AES encryption of the counter blocks
    # with the GHASH update of the ciphertext blocks. This improves
    # performance on many CPUs because the execution ports used by the VAES
    # instructions often differ from those used by vpclmulqdq and other
    # instructions used in GHASH. For example, many Intel CPUs dispatch
    # vaesenc to ports 0 and 1 and vpclmulqdq to port 5.
    #
    # The interleaving is easiest to do during decryption, since during
    # decryption the ciphertext blocks are immediately available. For
    # encryption, instead encrypt the first set of blocks, then hash those
    # blocks while encrypting the next set of blocks, repeat that as
    # needed, and finally hash the last set of blocks.

    if ($enc) {
        $code .= <<___;
    # Encrypt the first 4 vectors of plaintext blocks. Leave the resulting
    # ciphertext in GHASHDATA[0-3] for GHASH.
    @{[ _ctr_begin_4x ]}
    lea 16($AESKEY), %rax
.Lvaesenc_loop_first_4_vecs$local_label_suffix:
    vbroadcasti32x4 (%rax), $RNDKEY
    @{[ _vaesenc_4x $RNDKEY ]}
    add \$16, %rax
    cmp %rax, $RNDKEYLAST_PTR
    jne .Lvaesenc_loop_first_4_vecs$local_label_suffix
    @{[ _aesenclast_and_xor_4x ]}
    sub \$-4*$VL, $SRC    # shorter than 'add 4*VL' when VL=32
    sub \$-4*$VL, $DST
    add \$-4*$VL, $DATALEN
    cmp \$4*$VL-1, $DATALEN
    jbe .Lghash_last_ciphertext_4x$local_label_suffix
___
    }

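    # Note: RNDKEY0, RNDKEYLAST, and RNDKEY_M9-M1 together cache 11 broadcast
    # round keys, which covers every round of AES-128. The extra middle
    # rounds of AES-192 and AES-256 don't fit in the remaining vector
    # registers, so the main loop re-broadcasts them from memory via the
    # .Laes192/.Laes128 branches below.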
    # Cache as many additional AES round keys as possible.
    for my $i ( reverse 1 .. 9 ) {
        $code .= <<___;
    vbroadcasti32x4 -$i*16($RNDKEYLAST_PTR), ${"RNDKEY_M$i"}
___
    }

    $code .= <<___;
.Lcrypt_loop_4x$local_label_suffix:
___

    # If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If
    # encrypting, GHASHDATA[0-3] already contain the previous ciphertext.
    if ( !$enc ) {
        $code .= <<___;
    vmovdqu8 0*$VL($SRC), $GHASHDATA0
    vmovdqu8 1*$VL($SRC), $GHASHDATA1
    vmovdqu8 2*$VL($SRC), $GHASHDATA2
    vmovdqu8 3*$VL($SRC), $GHASHDATA3
___
    }

    $code .= <<___;
    # Start the AES encryption of the counter blocks.
    @{[ _ctr_begin_4x ]}
    cmp \$24, $AESKEYLEN
    jl .Laes128$local_label_suffix
    je .Laes192$local_label_suffix
    # AES-256
    vbroadcasti32x4 -13*16($RNDKEYLAST_PTR), $RNDKEY
    @{[ _vaesenc_4x $RNDKEY ]}
    vbroadcasti32x4 -12*16($RNDKEYLAST_PTR), $RNDKEY
    @{[ _vaesenc_4x $RNDKEY ]}
.Laes192$local_label_suffix:
    vbroadcasti32x4 -11*16($RNDKEYLAST_PTR), $RNDKEY
    @{[ _vaesenc_4x $RNDKEY ]}
    vbroadcasti32x4 -10*16($RNDKEYLAST_PTR), $RNDKEY
    @{[ _vaesenc_4x $RNDKEY ]}
.Laes128$local_label_suffix:
___

    # Finish the AES encryption of the counter blocks in V0-V3, interleaved
    # with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
    for my $i ( reverse 1 .. 9 ) {
        $code .= <<___;
    @{[ _ghash_step_4x (9 - $i) ]}
    @{[ _vaesenc_4x ${"RNDKEY_M$i"} ]}
___
    }
    $code .= <<___;
    @{[ _ghash_step_4x 9 ]}
    @{[ _aesenclast_and_xor_4x ]}
    sub \$-4*$VL, $SRC    # shorter than 'add 4*VL' when VL=32
    sub \$-4*$VL, $DST
    add \$-4*$VL, $DATALEN
    cmp \$4*$VL-1, $DATALEN
    ja .Lcrypt_loop_4x$local_label_suffix
___

    if ($enc) {

        # Update GHASH with the last set of ciphertext blocks.
        $code .= <<___;
.Lghash_last_ciphertext_4x$local_label_suffix:
    @{[ _ghash_4x ]}
___
    }

    my $POWERS_PTR = $BE_CTR_PTR;    # BE_CTR_PTR is free to be reused.

    $code .= <<___;
.Lcrypt_loop_4x_done$local_label_suffix:
    # Check whether any data remains.
    test $DATALEN, $DATALEN
    jz .Ldone$local_label_suffix

    # The data length isn't a multiple of 4*VL. Process the remaining data
    # of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time.
    # Going one vector at a time may seem inefficient compared to having
    # separate code paths for each possible number of vectors remaining.
    # However, using a loop keeps the code size down, and it performs
    # surprisingly well; modern CPUs will start executing the next iteration
    # before the previous one finishes and also predict the number of loop
    # iterations. For a similar reason, we roll up the AES rounds.
    #
    # On the last iteration, the remaining length may be less than VL.
    # Handle this using masking.
    #
    # Since there are enough key powers available for all remaining data,
    # there is no need to do a GHASH reduction after each iteration.
    # Instead, multiply each remaining block by its own key power, and only
    # do a GHASH reduction at the very end.

    # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
    # is the number of blocks that remain.
    mov $DATALEN, %rax
    neg %rax
    and \$-16, %rax    # -round_up(DATALEN, 16)
    lea $OFFSETOFEND_H_POWERS($H_POWERS,%rax), $POWERS_PTR
___

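    # E.g., if DATALEN is 48 (three blocks remain), %rax becomes -48 and
    # POWERS_PTR ends up at H_POWERS + OFFSETOFEND_H_POWERS - 48, i.e.
    # pointing at the last three key powers [H^3, H^2, H^1].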
    # Start collecting the unreduced GHASH intermediate value LO, MI, HI.
    my ( $LO, $LO_XMM ) = ( $GHASHDATA0, $GHASHDATA0_XMM );
    my ( $MI, $MI_XMM ) = ( $GHASHDATA1, $GHASHDATA1_XMM );
    my ( $HI, $HI_XMM ) = ( $GHASHDATA2, $GHASHDATA2_XMM );
    $code .= <<___;
    vpxor $LO_XMM, $LO_XMM, $LO_XMM
    vpxor $MI_XMM, $MI_XMM, $MI_XMM
    vpxor $HI_XMM, $HI_XMM, $HI_XMM

    cmp \$$VL, $DATALEN
    jb .Lpartial_vec$local_label_suffix

.Lcrypt_loop_1x$local_label_suffix:
    # Process a full vector of length VL.

    # Encrypt a vector of counter blocks.
    vpshufb $BSWAP_MASK, $LE_CTR, $V0
    vpaddd $LE_CTR_INC, $LE_CTR, $LE_CTR
    vpxord $RNDKEY0, $V0, $V0
    lea 16($AESKEY), %rax
.Lvaesenc_loop_tail_full_vec$local_label_suffix:
    vbroadcasti32x4 (%rax), $RNDKEY
    vaesenc $RNDKEY, $V0, $V0
    add \$16, %rax
    cmp %rax, $RNDKEYLAST_PTR
    jne .Lvaesenc_loop_tail_full_vec$local_label_suffix
    vaesenclast $RNDKEYLAST, $V0, $V0

    # XOR the data with the vector of keystream blocks.
    vmovdqu8 ($SRC), $V1
    vpxord $V1, $V0, $V0
    vmovdqu8 $V0, ($DST)

    # Update GHASH with the ciphertext blocks, without reducing.
    vmovdqu8 ($POWERS_PTR), $H_POW1
    vpshufb $BSWAP_MASK, @{[ $enc ? $V0 : $V1 ]}, $V0
    vpxord $GHASH_ACC, $V0, $V0
    @{[ _ghash_mul_noreduce $H_POW1, $V0, $LO, $MI, $HI, $GHASHDATA3,
                            $V1, $V2, $V3 ]}
    vpxor $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM

    add \$$VL, $POWERS_PTR
    add \$$VL, $SRC
    add \$$VL, $DST
    sub \$$VL, $DATALEN
    cmp \$$VL, $DATALEN
    jae .Lcrypt_loop_1x$local_label_suffix

    test $DATALEN, $DATALEN
    jz .Lreduce$local_label_suffix

.Lpartial_vec$local_label_suffix:
    # Process a partial vector of length 1 <= DATALEN < VL.

    # Set the data mask %k1 to DATALEN 1's.
    # Set the key powers mask %k2 to round_up(DATALEN, 16) 1's.
    mov \$-1, %rax
    bzhi $DATALEN, %rax, %rax
    @{[ $VL < 64 ? "kmovd %eax, %k1" : "kmovq %rax, %k1" ]}
    add \$15, $DATALEN
    and \$-16, $DATALEN
    mov \$-1, %rax
    bzhi $DATALEN, %rax, %rax
    @{[ $VL < 64 ? "kmovd %eax, %k2" : "kmovq %rax, %k2" ]}

    # Encrypt one last vector of counter blocks. This does not need to be
    # masked. The counter does not need to be incremented here.
    vpshufb $BSWAP_MASK, $LE_CTR, $V0
    vpxord $RNDKEY0, $V0, $V0
    lea 16($AESKEY), %rax
.Lvaesenc_loop_tail_partialvec$local_label_suffix:
    vbroadcasti32x4 (%rax), $RNDKEY
    vaesenc $RNDKEY, $V0, $V0
    add \$16, %rax
    cmp %rax, $RNDKEYLAST_PTR
    jne .Lvaesenc_loop_tail_partialvec$local_label_suffix
    vaesenclast $RNDKEYLAST, $V0, $V0

    # XOR the data with the appropriate number of keystream bytes.
    vmovdqu8 ($SRC), $V1\{%k1}{z}
    vpxord $V1, $V0, $V0
    vmovdqu8 $V0, ($DST){%k1}

    # Update GHASH with the ciphertext block(s), without reducing.
    #
    # In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
    # (If decrypting, it's done by the above masked load. If encrypting,
    # it's done by the below masked register-to-register move.) Note that
    # if DATALEN <= VL - 16, there will be additional padding beyond the
    # padding of the last block specified by GHASH itself; i.e., there may
    # be whole block(s) that get processed by the GHASH multiplication and
    # reduction instructions but should not actually be included in the
    # GHASH. However, any such blocks are all-zeroes, and the values that
    # they're multiplied with are also all-zeroes. Therefore they just add
    # 0 * 0 = 0 to the final GHASH result, which makes no difference.
    vmovdqu8 ($POWERS_PTR), $H_POW1\{%k2}{z}
    @{[ $enc ? "vmovdqu8 $V0, $V1\{%k1}{z}" : "" ]}
    vpshufb $BSWAP_MASK, $V1, $V0
    vpxord $GHASH_ACC, $V0, $V0
    @{[ _ghash_mul_noreduce $H_POW1, $V0, $LO, $MI, $HI, $GHASHDATA3,
                            $V1, $V2, $V3 ]}

.Lreduce$local_label_suffix:
    # Finally, do the GHASH reduction.
    @{[ _ghash_reduce $LO, $MI, $HI, $GFPOLY, $V0 ]}
    @{[ _horizontal_xor $HI, $HI_XMM, $GHASH_ACC_XMM,
                        "%xmm0", "%xmm1", "%xmm2" ]}

.Ldone$local_label_suffix:
    # Store the updated GHASH accumulator back to memory.
    vpshufb $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    vmovdqu $GHASH_ACC_XMM, ($GHASH_ACC_PTR)

    vzeroupper    # This is needed after using ymm or zmm registers.
___
    return $code;
}

# void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
$code .= _begin_func "gcm_gmult_vpclmulqdq_avx10", 1;
{
    my ( $GHASH_ACC_PTR, $H_POWERS ) = @argregs[ 0 .. 1 ];
    my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
      map( "%xmm$_", ( 0 .. 6 ) );

    $code .= <<___;
    @{[ _save_xmmregs (6) ]}
    .seh_endprologue

    vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC
    vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK
    vmovdqu $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1
    vmovdqu .Lgfpoly(%rip), $GFPOLY
    vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC

    @{[ _ghash_mul $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}

    vpshufb $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
    vmovdqu $GHASH_ACC, ($GHASH_ACC_PTR)
___
}
$code .= _end_func;

# Disabled until significant deployment of AVX10/256 is seen. The separate
# *_vaes_avx2 implementation provides the only 256-bit support for now.
#
# $code .= _begin_func "gcm_init_vpclmulqdq_avx10_256", 0;
# $code .= _aes_gcm_init;
# $code .= _end_func;
#
# $code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_256", 1;
# $code .= _ghash_update;
# $code .= _end_func;
#
# $code .= _begin_func "aes_gcm_enc_update_vaes_avx10_256", 1;
# $code .= _aes_gcm_update 1;
# $code .= _end_func;
#
# $code .= _begin_func "aes_gcm_dec_update_vaes_avx10_256", 1;
# $code .= _aes_gcm_update 0;
# $code .= _end_func;

_set_veclen 64;

$code .= _begin_func "gcm_init_vpclmulqdq_avx10_512", 0;
$code .= _aes_gcm_init;
$code .= _end_func;

$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512", 1;
$code .= _ghash_update;
$code .= _end_func;

$code .= _begin_func "aes_gcm_enc_update_vaes_avx10_512", 1;
$code .= _aes_gcm_update 1;
$code .= _end_func;

$code .= _begin_func "aes_gcm_dec_update_vaes_avx10_512", 1;
$code .= _aes_gcm_update 0;
$code .= _end_func;

print $code;
close STDOUT or die "error closing STDOUT: $!";
exit 0;