1#!/usr/bin/env perl
2# Copyright 2024 The BoringSSL Authors
3#
4# Permission to use, copy, modify, and/or distribute this software for any
5# purpose with or without fee is hereby granted, provided that the above
6# copyright notice and this permission notice appear in all copies.
7#
8# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
11# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
13# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
14# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15#
16#------------------------------------------------------------------------------
17#
18# VAES and VPCLMULQDQ optimized AES-GCM for x86_64
19#
20# This file is based on aes-gcm-avx10-x86_64.S from the Linux kernel
21# (https://git.kernel.org/linus/b06affb1cb580e13).  The following notable
22# changes have been made:
23#
24# - Relicensed under BoringSSL's preferred license.
25#
26# - Converted from GNU assembler to "perlasm".  This was necessary for
27#   compatibility with BoringSSL's Windows builds which use NASM instead of the
28#   GNU assembler.  It was also necessary for compatibility with the 'delocate'
29#   tool used in BoringSSL's FIPS builds.
30#
31# - Added support for the Windows ABI.
32#
33# - Changed function prototypes to be compatible with what BoringSSL wants.
34#
35# - Removed the optimized finalization function, as BoringSSL doesn't want it.
36#
37# - Added a single-block GHASH multiplication function, as BoringSSL needs this.
38#
39# - Added optimization for large amounts of AAD.
40#
41#------------------------------------------------------------------------------
42#
43# This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
44# support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and
45# either AVX512 or AVX10.  Some of the functions, notably the encryption and
46# decryption update functions which are the most performance-critical, are
47# provided in two variants generated from a macro: one using 256-bit vectors
48# (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512).  The
49# other, "shared" functions (vaes_avx10) use at most 256-bit vectors.
50#
51# The functions that use 512-bit vectors are intended for CPUs that support
52# 512-bit vectors *and* where using them doesn't cause significant
53# downclocking.  They require the following CPU features:
54#
55#       VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512)
56#
57# The other functions require the following CPU features:
58#
59#       VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256)
60#
61# Note that we use "avx10" in the names of the functions as a shorthand to
62# really mean "AVX10 or a certain set of AVX512 features".  Due to Intel's
63# introduction of AVX512 and then its replacement by AVX10, there doesn't seem
64# to be a simple way to name things that makes sense on all CPUs.
65#
66# Note that the macros that support both 256-bit and 512-bit vectors could
67# fairly easily be changed to support 128-bit too.  However, this would *not*
68# be sufficient to allow the code to run on CPUs without AVX512 or AVX10,
69# because the code heavily uses several features of these extensions other than
70# the vector length: the increase in the number of SIMD registers from 16 to
71# 32, masking support, and new instructions such as vpternlogd (which can do a
72# three-argument XOR).  These features are very useful for AES-GCM.
73
74$flavour = shift;
75$output  = shift;
76if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; }
77
78if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) {
79    $win64   = 1;
80    @argregs = ( "%rcx", "%rdx", "%r8", "%r9" );
81}
82else {
83    $win64   = 0;
84    @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" );
85}
86
87$0 =~ m/(.*[\/\\])[^\/\\]+$/;
88$dir = $1;
89( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate )
90  or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate )
91  or die "can't locate x86_64-xlate.pl";
92
93open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
94*STDOUT = *OUT;
95
96sub _begin_func {
97    my ( $funcname, $uses_seh ) = @_;
98    $g_cur_func_name          = $funcname;
99    $g_cur_func_uses_seh      = $uses_seh;
100    @g_cur_func_saved_gpregs  = ();
101    @g_cur_func_saved_xmmregs = ();
102    return <<___;
103.globl $funcname
104.type $funcname,\@abi-omnipotent
105.align 32
106$funcname:
107    .cfi_startproc
108    @{[ $uses_seh ? ".seh_startproc" : "" ]}
109    _CET_ENDBR
110___
111}
112
113# Push a list of general purpose registers onto the stack.
114sub _save_gpregs {
115    my @gpregs = @_;
116    my $code   = "";
117    die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh;
118    die "_save_gpregs can only be called once per function"
119      if @g_cur_func_saved_gpregs;
120    die "Order must be _save_gpregs, then _save_xmmregs"
121      if @g_cur_func_saved_xmmregs;
122    @g_cur_func_saved_gpregs = @gpregs;
123    for my $reg (@gpregs) {
124        $code .= "push $reg\n";
125        if ($win64) {
126            $code .= ".seh_pushreg $reg\n";
127        }
128        else {
129            $code .= ".cfi_push $reg\n";
130        }
131    }
132    return $code;
133}
134
135# Push a list of xmm registers onto the stack if the target is Windows.
136sub _save_xmmregs {
137    my @xmmregs     = @_;
138    my $num_xmmregs = scalar @xmmregs;
139    my $code        = "";
140    die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh;
141    die "_save_xmmregs can only be called once per function"
142      if @g_cur_func_saved_xmmregs;
143    if ( $win64 and $num_xmmregs > 0 ) {
144        @g_cur_func_saved_xmmregs = @xmmregs;
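        # At function entry, %rsp is 8 mod 16 (the 16-byte-aligned caller
        # pushed an 8-byte return address), and each push in _save_gpregs
        # toggles that.  So after an even number of pushes the stack is still
        # misaligned, and an extra 8 bytes must be allocated so that the
        # 16-byte-aligned movdqa saves below are valid.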
145        my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
146        my $alloc_size    = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 );
147        $code .= "sub \$$alloc_size, %rsp\n";
148        $code .= ".seh_stackalloc $alloc_size\n";
149        for my $i ( 0 .. $num_xmmregs - 1 ) {
150            my $reg_num = $xmmregs[$i];
151            my $pos     = 16 * $i;
152            $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n";
153            $code .= ".seh_savexmm %xmm$reg_num, $pos\n";
154        }
155    }
156    return $code;
157}
158
159sub _end_func {
160    my $code = "";
161
162    # Restore any xmm registers that were saved earlier.
163    my $num_xmmregs = scalar @g_cur_func_saved_xmmregs;
164    if ( $win64 and $num_xmmregs > 0 ) {
165        my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
166        my $alloc_size     = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 );
167        for my $i ( 0 .. $num_xmmregs - 1 ) {
168            my $reg_num = $g_cur_func_saved_xmmregs[$i];
169            my $pos     = 16 * $i;
170            $code .= "movdqa $pos(%rsp), %xmm$reg_num\n";
171        }
172        $code .= "add \$$alloc_size, %rsp\n";
173    }
174
175    # Restore any general purpose registers that were saved earlier.
176    for my $reg ( reverse @g_cur_func_saved_gpregs ) {
177        $code .= "pop $reg\n";
178        if ( !$win64 ) {
179            $code .= ".cfi_pop $reg\n";
180        }
181    }
182
183    $code .= <<___;
184    ret
185    @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]}
186    .cfi_endproc
187    .size   $g_cur_func_name, . - $g_cur_func_name
188___
189    return $code;
190}
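
# The three helpers above are composed per function roughly as follows (an
# illustrative sketch; "example_func" is hypothetical, but the real functions
# later in this file follow the same pattern):
#
#     $code .= _begin_func "example_func", 1;   # 1 => emit SEH directives
#     $code .= _save_gpregs( "%r12" );          # optional; before xmm saves
#     $code .= _save_xmmregs( 6 .. 7 );         # has an effect on Windows only
#     $code .= ".seh_endprologue\n";
#     # ... emit the function body ...
#     $code .= _end_func;                       # restores registers, emits ret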
191
192$code = <<___;
193.section .rodata
194.align 64
195
196    # A shuffle mask that reflects the bytes of 16-byte blocks
197.Lbswap_mask:
198    .quad   0x08090a0b0c0d0e0f, 0x0001020304050607
199
200    # This is the GHASH reducing polynomial without its constant term, i.e.
201    # x^128 + x^7 + x^2 + x, represented using the backwards mapping
202    # between bits and polynomial coefficients.
203    #
204    # Alternatively, it can be interpreted as the naturally-ordered
205    # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
206    # "reversed" GHASH reducing polynomial without its x^128 term.
207.Lgfpoly:
208    .quad   1, 0xc200000000000000
209
210    # Same as above, but with the (1 << 64) bit set.
211.Lgfpoly_and_internal_carrybit:
212    .quad   1, 0xc200000000000001
213
214    # The below constants are used for incrementing the counter blocks.
215    # ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
216    # inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
217    # 4.  Note that the same '2' is reused in ctr_pattern and inc_2blocks.
218.Lctr_pattern:
219    .quad   0, 0
220    .quad   1, 0
221.Linc_2blocks:
222    .quad   2, 0
223    .quad   3, 0
224.Linc_4blocks:
225    .quad   4, 0
226
227.text
228___
229
230# Number of powers of the hash key stored in the key struct.  The powers are
231# stored from highest (H^NUM_H_POWERS) to lowest (H^1).
232$NUM_H_POWERS = 16;
233
234$OFFSETOFEND_H_POWERS = $NUM_H_POWERS * 16;
235
236# Offset to 'rounds' in AES_KEY struct
237$OFFSETOF_AES_ROUNDS = 240;
238
239# The current vector length in bytes
240undef $VL;
241
242# Set the vector length in bytes.  This sets the VL variable and defines
243# register aliases V0-V31 that map to the ymm or zmm registers.
244sub _set_veclen {
245    ($VL) = @_;
246    foreach my $i ( 0 .. 31 ) {
247        if ( $VL == 32 ) {
248            ${"V${i}"} = "%ymm${i}";
249        }
250        elsif ( $VL == 64 ) {
251            ${"V${i}"} = "%zmm${i}";
252        }
253        else {
254            die "Unsupported vector length";
255        }
256    }
257}
258
259# The _ghash_mul_step macro does one step of GHASH multiplication of the
260# 128-bit lanes of \a by the corresponding 128-bit lanes of \b, storing the
261# reduced products in \dst.  \t0, \t1, and \t2 are temporary registers of the
262# same size as \a and \b.  To complete all steps, this must be invoked with \i=0
263# through \i=9.  The division into steps allows users of this macro to
264# optionally interleave the computation with other instructions.  Users of this
265# macro must preserve the parameter registers across steps.
266#
267# The multiplications are done in GHASH's representation of the finite field
268# GF(2^128).  Elements of GF(2^128) are represented as binary polynomials
269# (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial
270# G.  The GCM specification uses G = x^128 + x^7 + x^2 + x + 1.  Addition is
271# just XOR, while multiplication is more complex and has two parts: (a) do
272# carryless multiplication of two 128-bit input polynomials to get a 256-bit
273# intermediate product polynomial, and (b) reduce the intermediate product to
274# 128 bits by adding multiples of G that cancel out terms in it.  (Adding
275# multiples of G doesn't change which field element the polynomial represents.)
276#
277# Unfortunately, the GCM specification maps bits to/from polynomial
278# coefficients backwards from the natural order.  In each byte it specifies the
279# highest bit to be the lowest order polynomial coefficient, *not* the highest!
280# This makes it nontrivial to work with the GHASH polynomials.  We could
281# reflect the bits, but x86 doesn't have an instruction that does that.
282#
283# Instead, we operate on the values without bit-reflecting them.  This *mostly*
284# just works, since XOR and carryless multiplication are symmetric with respect
285# to bit order, but it has some consequences.  First, due to GHASH's byte
286# order, by skipping bit reflection, *byte* reflection becomes necessary to
287# give the polynomial terms a consistent order.  E.g., considering an N-bit
288# value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0
289# through N-1 of the byte-reflected value represent the coefficients of x^(N-1)
290# through x^0, whereas bits 0 through N-1 of the non-byte-reflected value
291# represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8), which can't be worked
292# with.  Fortunately, x86's vpshufb instruction can do byte reflection.
293#
294# Second, forgoing the bit reflection causes an extra multiple of x (still
295# using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each
296# multiplication.  This is because an M-bit by N-bit carryless multiplication
297# really produces an (M+N-1)-bit product, but in practice it's zero-extended to
298# M+N bits.  In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits
299# to polynomial coefficients backwards, this zero-extension actually changes
300# the product by introducing an extra factor of x.  Therefore, users of this
301# macro must ensure that one of the inputs has an extra factor of x^-1, i.e.
302# the multiplicative inverse of x, to cancel out the extra x.
303#
304# Third, the backwards coefficients convention is just confusing to work with,
305# since it makes "low" and "high" in the polynomial math mean the opposite of
306# their normal meaning in computer programming.  This can be solved by using an
307# alternative interpretation: the polynomial coefficients are understood to be
308# in the natural order, and the multiplication is actually \a * \b * x^-128 mod
309# x^128 + x^127 + x^126 + x^121 + 1.  This doesn't change the inputs, outputs,
310# or the implementation at all; it just changes the mathematical interpretation
311# of what each instruction is doing.  Starting from here, we'll use this
312# alternative interpretation, as it's easier to understand the code that way.
313#
314# Moving on to the implementation, the vpclmulqdq instruction does 64 x 64 =>
315# 128-bit carryless multiplication, so we break the 128 x 128 multiplication
316# into parts as follows (the _L and _H suffixes denote low and high 64 bits):
317#
318#     LO = a_L * b_L
319#     MI = (a_L * b_H) + (a_H * b_L)
320#     HI = a_H * b_H
321#
322# The 256-bit product is x^128*HI + x^64*MI + LO.  LO, MI, and HI are 128-bit.
323# Note that MI "overlaps" with LO and HI.  We don't consolidate MI into LO and
324# HI right away, since the way the reduction works makes that unnecessary.
325#
326# For the reduction, we cancel out the low 128 bits by adding multiples of G =
327# x^128 + x^127 + x^126 + x^121 + 1.  This is done by two iterations, each of
328# which cancels out the next lowest 64 bits.  Consider a value x^64*A + B,
329# where A and B are 128-bit.  Adding B_L*G to that value gives:
330#
331#       x^64*A + B + B_L*G
332#     = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1)
333#     = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L
334#     = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L
335#     = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57))
336#
337# So: if we sum A, B with its halves swapped, and the low half of B times x^63
338# + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the
339# original value x^64*A + B.  I.e., the low 64 bits got canceled out.
340#
341# We just need to apply this twice: first to fold LO into MI, and second to
342# fold the updated MI into HI.
343#
344# The needed three-argument XORs are done using the vpternlogd instruction with
345# immediate 0x96, since this is faster than two vpxord instructions.
346#
347# A potential optimization, assuming that b is fixed per-key (if a is fixed
348# per-key it would work the other way around), is to use one iteration of the
349# reduction described above to precompute a value c such that x^64*c = b mod G,
350# and then multiply a_L by c (and implicitly by x^64) instead of by b:
351#
352#     MI = (a_L * c_L) + (a_H * b_L)
353#     HI = (a_L * c_H) + (a_H * b_H)
354#
355# This would eliminate the LO part of the intermediate product, which would
356# eliminate the need to fold LO into MI.  This would save two instructions,
357# including a vpclmulqdq.  However, we currently don't use this optimization
358# because it would require twice as many per-key precomputed values.
359#
360# Using Karatsuba multiplication instead of "schoolbook" multiplication
361# similarly would save a vpclmulqdq but does not seem to be worth it.
362sub _ghash_mul_step {
363    my ( $i, $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
364    if ( $i == 0 ) {
365        return "vpclmulqdq \$0x00, $a, $b, $t0\n" .    # LO = a_L * b_L
366          "vpclmulqdq \$0x01, $a, $b, $t1\n";          # MI_0 = a_L * b_H
367    }
368    elsif ( $i == 1 ) {
369        return "vpclmulqdq \$0x10, $a, $b, $t2\n";     # MI_1 = a_H * b_L
370    }
371    elsif ( $i == 2 ) {
372        return "vpxord $t2, $t1, $t1\n";               # MI = MI_0 + MI_1
373    }
374    elsif ( $i == 3 ) {
375        return
376          "vpclmulqdq \$0x01, $t0, $gfpoly, $t2\n";  # LO_L*(x^63 + x^62 + x^57)
377    }
378    elsif ( $i == 4 ) {
379        return "vpshufd \$0x4e, $t0, $t0\n";         # Swap halves of LO
380    }
381    elsif ( $i == 5 ) {
382        return "vpternlogd \$0x96, $t2, $t0, $t1\n";    # Fold LO into MI
383    }
384    elsif ( $i == 6 ) {
385        return "vpclmulqdq \$0x11, $a, $b, $dst\n";     # HI = a_H * b_H
386    }
387    elsif ( $i == 7 ) {
388        return
389          "vpclmulqdq \$0x01, $t1, $gfpoly, $t0\n";  # MI_L*(x^63 + x^62 + x^57)
390    }
391    elsif ( $i == 8 ) {
392        return "vpshufd \$0x4e, $t1, $t1\n";         # Swap halves of MI
393    }
394    elsif ( $i == 9 ) {
395        return "vpternlogd \$0x96, $t0, $t1, $dst\n";    # Fold MI into HI
396    }
397}
398
399# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
400# the reduced products in \dst.  See _ghash_mul_step for full explanation.
401sub _ghash_mul {
402    my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
403    my $code = "";
404    for my $i ( 0 .. 9 ) {
405        $code .= _ghash_mul_step $i, $a, $b, $dst, $gfpoly, $t0, $t1, $t2;
406    }
407    return $code;
408}
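
# As a cross-check of the semantics described above _ghash_mul_step (each
# 128-bit lane of dst = a * b * x^-128 mod x^128 + x^127 + x^126 + x^121 + 1,
# with bit i of a lane read as the coefficient of x^i), here is a minimal
# bit-level reference model of one lane.  It is only a sketch: it is kept
# commented out so it has no effect on this script or on the generated code,
# the names are hypothetical, and it assumes Math::BigInt inputs.
#
#   sub clmul {    # carryless multiplication of two 128-bit polynomials
#       my ( $a, $b ) = @_;
#       my $r = Math::BigInt->bzero();
#       for my $i ( 0 .. 127 ) {
#           $r->bxor( $b->copy()->blsft($i) )
#             if $a->copy()->brsft($i)->band( Math::BigInt->bone() )->is_one();
#       }
#       return $r;
#   }
#
#   sub ghash_mul_model {    # a * b * x^-128 mod x^128+x^127+x^126+x^121+1
#       my ( $a, $b ) = @_;
#       # Build G = x^128 + x^127 + x^126 + x^121 + 1.
#       my $g = Math::BigInt->bone()->blsft(128);
#       $g->bxor( Math::BigInt->bone()->blsft($_) ) for ( 127, 126, 121, 0 );
#       my $p = clmul( $a, $b );
#       # Cancel the low 128 bits by adding multiples of G, then divide by
#       # x^128; this is what the two folding steps above accomplish.
#       for my $i ( 0 .. 127 ) {
#           $p->bxor( $g->copy()->blsft($i) )
#             if $p->copy()->brsft($i)->band( Math::BigInt->bone() )->is_one();
#       }
#       return $p->brsft(128);
#   }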
409
410# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
411# *unreduced* products to \lo, \mi, and \hi.
412sub _ghash_mul_noreduce {
413    my ( $a, $b, $lo, $mi, $hi, $t0, $t1, $t2, $t3 ) = @_;
414    return <<___;
415    vpclmulqdq      \$0x00, $a, $b, $t0      # a_L * b_L
416    vpclmulqdq      \$0x01, $a, $b, $t1      # a_L * b_H
417    vpclmulqdq      \$0x10, $a, $b, $t2      # a_H * b_L
418    vpclmulqdq      \$0x11, $a, $b, $t3      # a_H * b_H
419    vpxord          $t0, $lo, $lo
420    vpternlogd      \$0x96, $t2, $t1, $mi
421    vpxord          $t3, $hi, $hi
422___
423}
424
425# Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
426# reduced products in \hi.  See _ghash_mul_step for explanation of reduction.
427sub _ghash_reduce {
428    my ( $lo, $mi, $hi, $gfpoly, $t0 ) = @_;
429    return <<___;
430    vpclmulqdq      \$0x01, $lo, $gfpoly, $t0
431    vpshufd         \$0x4e, $lo, $lo
432    vpternlogd      \$0x96, $t0, $lo, $mi
433    vpclmulqdq      \$0x01, $mi, $gfpoly, $t0
434    vpshufd         \$0x4e, $mi, $mi
435    vpternlogd      \$0x96, $t0, $mi, $hi
436___
437}
438
439$g_init_macro_expansion_count = 0;
440
441# void gcm_init_##suffix(u128 Htable[16], const uint64_t H[2]);
442#
443# Initialize |Htable| with powers of the GHASH subkey |H|.
444#
445# The powers are stored in the order H^NUM_H_POWERS to H^1.
446#
447# This macro supports both VL=32 and VL=64.  _set_veclen must have been invoked
448# with the desired length.  In the VL=32 case, the function computes twice as
449# many key powers as are actually used by the VL=32 GCM update functions.
450# This is done to keep the key format the same regardless of vector length.
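#
# For example (a layout illustration derived from the loop below, using the
# same [high lane, ..., low lane] notation as the other comments), with VL=64
# the resulting |Htable| contents by byte offset are:
#
#       0:    [H^16, H^15, H^14, H^13]
#       64:   [H^12, H^11, H^10, H^9 ]
#       128:  [H^8,  H^7,  H^6,  H^5 ]
#       192:  [H^4,  H^3,  H^2,  H^1 ]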
451sub _aes_gcm_init {
452    my $local_label_suffix = "__func" . ++$g_init_macro_expansion_count;
453
454    # Function arguments
455    my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ];
456
457    # Additional local variables.  V0-V2 and %rax are used as temporaries.
458    my $POWERS_PTR     = "%r8";
459    my $RNDKEYLAST_PTR = "%r9";
460    my ( $H_CUR, $H_CUR_YMM, $H_CUR_XMM )    = ( "$V3", "%ymm3", "%xmm3" );
461    my ( $H_INC, $H_INC_YMM, $H_INC_XMM )    = ( "$V4", "%ymm4", "%xmm4" );
462    my ( $GFPOLY, $GFPOLY_YMM, $GFPOLY_XMM ) = ( "$V5", "%ymm5", "%xmm5" );
463
464    my $code = <<___;
465    # Get pointer to lowest set of key powers (located at end of array).
466    lea             $OFFSETOFEND_H_POWERS-$VL($HTABLE), $POWERS_PTR
467
468    # Load the byte-reflected hash subkey.  BoringSSL provides it in
469    # byte-reflected form except the two halves are in the wrong order.
470    vpshufd         \$0x4e, ($H_PTR), $H_CUR_XMM
471
472    # Finish preprocessing the first key power, H^1.  Since this GHASH
473    # implementation operates directly on values with the backwards bit
474    # order specified by the GCM standard, it's necessary to preprocess the
475    # raw key as follows.  First, reflect its bytes.  Second, multiply it
476    # by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards
477    # interpretation of polynomial coefficients), which can also be
478    # interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121
479    # + 1 using the alternative, natural interpretation of polynomial
480    # coefficients.  For details, see the comment above _ghash_mul_step.
481    #
482    # Either way, for the multiplication the concrete operation performed
483    # is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
484    # << 120) | 1 if a 1 bit was carried out.  However, there's no 128-bit
485    # wide shift instruction, so instead double each of the two 64-bit
486    # halves and incorporate the internal carry bit into the value XOR'd.
487    vpshufd         \$0xd3, $H_CUR_XMM, %xmm0
488    vpsrad          \$31, %xmm0, %xmm0
489    vpaddq          $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM
490    # H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit
491    vpternlogd      \$0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, $H_CUR_XMM
492
493    # Load the gfpoly constant.
494    vbroadcasti32x4 .Lgfpoly(%rip), $GFPOLY
495
496    # Square H^1 to get H^2.
497    #
498    # Note that as with H^1, all higher key powers also need an extra
499    # factor of x^-1 (or x using the natural interpretation).  Nothing
500    # special needs to be done to make this happen, though: H^1 * H^1 would
501    # end up with two factors of x^-1, but the multiplication consumes one.
502    # So the product H^2 ends up with the desired one factor of x^-1.
503    @{[ _ghash_mul  $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM,
504                    "%xmm0", "%xmm1", "%xmm2" ]}
505
506    # Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
507    vinserti128     \$1, $H_CUR_XMM, $H_INC_YMM, $H_CUR_YMM
508    vinserti128     \$1, $H_INC_XMM, $H_INC_YMM, $H_INC_YMM
509___
510
511    if ( $VL == 64 ) {
512
513        # Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
514        $code .= <<___;
515        @{[ _ghash_mul  $H_INC_YMM, $H_CUR_YMM, $H_INC_YMM, $GFPOLY_YMM,
516                        "%ymm0", "%ymm1", "%ymm2" ]}
517        vinserti64x4    \$1, $H_CUR_YMM, $H_INC, $H_CUR
518        vshufi64x2      \$0, $H_INC, $H_INC, $H_INC
519___
520    }
521
522    $code .= <<___;
523    # Store the lowest set of key powers.
524    vmovdqu8        $H_CUR, ($POWERS_PTR)
525
526    # Compute and store the remaining key powers.  With VL=32, repeatedly
527    # multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)].
528    # With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
529    # [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
530    mov             \$@{[ $NUM_H_POWERS*16/$VL - 1 ]}, %eax
531.Lprecompute_next$local_label_suffix:
532    sub             \$$VL, $POWERS_PTR
533    @{[ _ghash_mul  $H_INC, $H_CUR, $H_CUR, $GFPOLY, $V0, $V1, $V2 ]}
534    vmovdqu8        $H_CUR, ($POWERS_PTR)
535    dec             %eax
536    jnz             .Lprecompute_next$local_label_suffix
537
538    vzeroupper      # This is needed after using ymm or zmm registers.
539___
540    return $code;
541}
542
543# XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
544# the result in \dst_xmm.  This implicitly zeroizes the other lanes of dst.
545sub _horizontal_xor {
546    my ( $src, $src_xmm, $dst_xmm, $t0_xmm, $t1_xmm, $t2_xmm ) = @_;
547    if ( $VL == 32 ) {
548        return <<___;
549        vextracti32x4   \$1, $src, $t0_xmm
550        vpxord          $t0_xmm, $src_xmm, $dst_xmm
551___
552    }
553    elsif ( $VL == 64 ) {
554        return <<___;
555        vextracti32x4   \$1, $src, $t0_xmm
556        vextracti32x4   \$2, $src, $t1_xmm
557        vextracti32x4   \$3, $src, $t2_xmm
558        vpxord          $t0_xmm, $src_xmm, $dst_xmm
559        vpternlogd      \$0x96, $t1_xmm, $t2_xmm, $dst_xmm
560___
561    }
562    else {
563        die "Unsupported vector length";
564    }
565}
566
567# Do one step of the GHASH update of the data blocks given in the vector
568# registers GHASHDATA[0-3].  \i specifies the step to do, 0 through 9.  The
569# division into steps allows users of this macro to optionally interleave the
570# computation with other instructions.  This macro uses the vector register
571# GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered;
572# H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and
573# GHASHTMP[0-2] as temporaries.  This macro handles the byte-reflection of the
574# data blocks.  The parameter registers must be preserved across steps.
575#
576# The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) +
577# H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the
578# operations are vectorized operations on vectors of 16-byte blocks.  E.g.,
579# with VL=32 there are 2 blocks per vector and the vectorized terms correspond
580# to the following non-vectorized terms:
581#
582#       H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0)
583#       H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3
584#       H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5
585#       H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7
586#
587# With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15.
588#
589# More concretely, this code does:
590#   - Do vectorized "schoolbook" multiplications to compute the intermediate
591#     256-bit product of each block and its corresponding hash key power.
592#     There are 4*VL/16 of these intermediate products.
593#   - Sum (XOR) the intermediate 256-bit products across vectors.  This leaves
594#     VL/16 256-bit intermediate values.
595#   - Do a vectorized reduction of these 256-bit intermediate values to
596#     128-bits each.  This leaves VL/16 128-bit intermediate values.
597#   - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
598#
599# See _ghash_mul_step for the full explanation of the operations performed for
600# each individual finite field multiplication and reduction.
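#
# The reason a whole set of key powers lets multiple blocks be folded in at
# once is the usual unrolling of the GHASH recurrence Y_i = (Y_{i-1} + X_i)*H.
# For example, for 4 blocks:
#
#       Y_4 = ((((Y_0 + X_1)*H + X_2)*H + X_3)*H + X_4)*H
#           = (Y_0 + X_1)*H^4 + X_2*H^3 + X_3*H^2 + X_4*H^1
#
# which matches the H_POW4*(GHASHDATA0 + GHASH_ACC) + ... form given above,
# with each term vectorized.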
601sub _ghash_step_4x {
602    my ($i) = @_;
603    if ( $i == 0 ) {
604        return <<___;
605        vpshufb         $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
606        vpxord          $GHASH_ACC, $GHASHDATA0, $GHASHDATA0
607        vpshufb         $BSWAP_MASK, $GHASHDATA1, $GHASHDATA1
608        vpshufb         $BSWAP_MASK, $GHASHDATA2, $GHASHDATA2
609___
610    }
611    elsif ( $i == 1 ) {
612        return <<___;
613        vpshufb         $BSWAP_MASK, $GHASHDATA3, $GHASHDATA3
614        vpclmulqdq      \$0x00, $H_POW4, $GHASHDATA0, $GHASH_ACC    # LO_0
615        vpclmulqdq      \$0x00, $H_POW3, $GHASHDATA1, $GHASHTMP0    # LO_1
616        vpclmulqdq      \$0x00, $H_POW2, $GHASHDATA2, $GHASHTMP1    # LO_2
617___
618    }
619    elsif ( $i == 2 ) {
620        return <<___;
621        vpxord          $GHASHTMP0, $GHASH_ACC, $GHASH_ACC          # sum(LO_{1,0})
622        vpclmulqdq      \$0x00, $H_POW1, $GHASHDATA3, $GHASHTMP2    # LO_3
623        vpternlogd      \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASH_ACC  # LO = sum(LO_{3,2,1,0})
624        vpclmulqdq      \$0x01, $H_POW4, $GHASHDATA0, $GHASHTMP0    # MI_0
625___
626    }
627    elsif ( $i == 3 ) {
628        return <<___;
629        vpclmulqdq      \$0x01, $H_POW3, $GHASHDATA1, $GHASHTMP1    # MI_1
630        vpclmulqdq      \$0x01, $H_POW2, $GHASHDATA2, $GHASHTMP2    # MI_2
631        vpternlogd      \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0  # sum(MI_{2,1,0})
632        vpclmulqdq      \$0x01, $H_POW1, $GHASHDATA3, $GHASHTMP1    # MI_3
633___
634    }
635    elsif ( $i == 4 ) {
636        return <<___;
637        vpclmulqdq      \$0x10, $H_POW4, $GHASHDATA0, $GHASHTMP2    # MI_4
638        vpternlogd      \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0  # sum(MI_{4,3,2,1,0})
639        vpclmulqdq      \$0x10, $H_POW3, $GHASHDATA1, $GHASHTMP1    # MI_5
640        vpclmulqdq      \$0x10, $H_POW2, $GHASHDATA2, $GHASHTMP2    # MI_6
641___
642    }
643    elsif ( $i == 5 ) {
644        return <<___;
645        vpternlogd      \$0x96, $GHASHTMP2, $GHASHTMP1, $GHASHTMP0  # sum(MI_{6,5,4,3,2,1,0})
646        vpclmulqdq      \$0x01, $GHASH_ACC, $GFPOLY, $GHASHTMP2     # LO_L*(x^63 + x^62 + x^57)
647        vpclmulqdq      \$0x10, $H_POW1, $GHASHDATA3, $GHASHTMP1    # MI_7
648        vpxord          $GHASHTMP1, $GHASHTMP0, $GHASHTMP0          # MI = sum(MI_{7,6,5,4,3,2,1,0})
649___
650    }
651    elsif ( $i == 6 ) {
652        return <<___;
653        vpshufd         \$0x4e, $GHASH_ACC, $GHASH_ACC              # Swap halves of LO
654        vpclmulqdq      \$0x11, $H_POW4, $GHASHDATA0, $GHASHDATA0   # HI_0
655        vpclmulqdq      \$0x11, $H_POW3, $GHASHDATA1, $GHASHDATA1   # HI_1
656        vpclmulqdq      \$0x11, $H_POW2, $GHASHDATA2, $GHASHDATA2   # HI_2
657___
658    }
659    elsif ( $i == 7 ) {
660        return <<___;
661        vpternlogd      \$0x96, $GHASHTMP2, $GHASH_ACC, $GHASHTMP0  # Fold LO into MI
662        vpclmulqdq      \$0x11, $H_POW1, $GHASHDATA3, $GHASHDATA3   # HI_3
663        vpternlogd      \$0x96, $GHASHDATA2, $GHASHDATA1, $GHASHDATA0 # sum(HI_{2,1,0})
664        vpclmulqdq      \$0x01, $GHASHTMP0, $GFPOLY, $GHASHTMP1     # MI_L*(x^63 + x^62 + x^57)
665___
666    }
667    elsif ( $i == 8 ) {
668        return <<___;
669        vpxord          $GHASHDATA3, $GHASHDATA0, $GHASH_ACC        # HI = sum(HI_{3,2,1,0})
670        vpshufd         \$0x4e, $GHASHTMP0, $GHASHTMP0              # Swap halves of MI
671        vpternlogd      \$0x96, $GHASHTMP1, $GHASHTMP0, $GHASH_ACC  # Fold MI into HI
672___
673    }
674    elsif ( $i == 9 ) {
675        return _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
676          $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM;
677    }
678}
679
680# Update GHASH with the blocks given in GHASHDATA[0-3].
681# See _ghash_step_4x for full explanation.
682sub _ghash_4x {
683    my $code = "";
684    for my $i ( 0 .. 9 ) {
685        $code .= _ghash_step_4x $i;
686    }
687    return $code;
688}
689
690$g_ghash_macro_expansion_count = 0;
691
692# void gcm_ghash_##suffix(uint8_t Xi[16], const u128 Htable[16],
693#                         const uint8_t *in, size_t len);
694#
695# This macro generates the body of a GHASH update function with the above
696# prototype.  This macro supports both VL=32 and VL=64.  _set_veclen must have
697# been invoked with the desired length.
698#
699# The generated function processes the AAD (Additional Authenticated Data) in
700# GCM.  Using the key |Htable|, it updates the GHASH accumulator |Xi| with the
701# data given by |in| and |len|.  On the first call, |Xi| must be all zeroes.
702# |len| must be a multiple of 16.
703#
704# This function handles large amounts of AAD efficiently, while also keeping the
705# overhead low for small amounts of AAD, which is the common case.  TLS uses less
706# than one block of AAD, but (uncommonly) other use cases may use much more.
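#
# As an overview, the generated code below chooses its path by size:
#
#       AADLEN < VL             16 bytes at a time, xmm registers only
#       VL <= AADLEN < 4*VL     VL bytes at a time
#       AADLEN >= 4*VL          4*VL bytes at a time, then the smaller paths
#                               handle whatever remains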
707sub _ghash_update {
708    my $local_label_suffix = "__func" . ++$g_ghash_macro_expansion_count;
709    my $code               = "";
710
711    # Function arguments
712    my ( $GHASH_ACC_PTR, $H_POWERS, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];
713
714    # Additional local variables
715    ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V0, "%xmm0" );
716    ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V1, "%xmm1" );
717    ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V2, "%xmm2" );
718    ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V3, "%xmm3" );
719    ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( $V4, "%xmm4" );
720    ( $GHASH_ACC,  $GHASH_ACC_XMM )  = ( $V5, "%xmm5" );
721    ( $H_POW4, $H_POW3, $H_POW2 )          = ( $V6, $V7, $V8 );
722    ( $H_POW1, $H_POW1_XMM )               = ( $V9, "%xmm9" );
723    ( $GFPOLY, $GFPOLY_XMM )               = ( $V10, "%xmm10" );
724    ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V11, $V12, $V13 );
725
726    $code .= <<___;
727    @{[ _save_xmmregs (6 .. 13) ]}
728    .seh_endprologue
729
730    # Load the bswap_mask and gfpoly constants.  Since AADLEN is usually small,
731    # only 128-bit vectors will typically be used.  So as an optimization, don't
732    # broadcast these constants to all 128-bit lanes quite yet.
733    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK_XMM
734    vmovdqu         .Lgfpoly(%rip), $GFPOLY_XMM
735
736    # Load the GHASH accumulator.
737    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
738    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
739
740    # Optimize for AADLEN < VL by checking for AADLEN < VL before AADLEN < 4*VL.
741    cmp             \$$VL, $AADLEN
742    jb              .Laad_blockbyblock$local_label_suffix
743
744    # AADLEN >= VL, so we'll operate on full vectors.  Broadcast bswap_mask and
745    # gfpoly to all 128-bit lanes.
746    vshufi64x2      \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
747    vshufi64x2      \$0, $GFPOLY, $GFPOLY, $GFPOLY
748
749    # Load the lowest set of key powers.
750    vmovdqu8        $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1
751
752    cmp             \$4*$VL-1, $AADLEN
753    jbe             .Laad_loop_1x$local_label_suffix
754
755    # AADLEN >= 4*VL.  Load the higher key powers.
756    vmovdqu8        $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
757    vmovdqu8        $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
758    vmovdqu8        $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2
759
760    # Update GHASH with 4*VL bytes of AAD at a time.
761.Laad_loop_4x$local_label_suffix:
762    vmovdqu8        0*$VL($AAD), $GHASHDATA0
763    vmovdqu8        1*$VL($AAD), $GHASHDATA1
764    vmovdqu8        2*$VL($AAD), $GHASHDATA2
765    vmovdqu8        3*$VL($AAD), $GHASHDATA3
766    @{[ _ghash_4x ]}
767    sub             \$-4*$VL, $AAD  # shorter than 'add 4*VL' when VL=32
768    add             \$-4*$VL, $AADLEN
769    cmp             \$4*$VL-1, $AADLEN
770    ja              .Laad_loop_4x$local_label_suffix
771
772    # Update GHASH with VL bytes of AAD at a time.
773    cmp             \$$VL, $AADLEN
774    jb              .Laad_large_done$local_label_suffix
775.Laad_loop_1x$local_label_suffix:
776    vmovdqu8        ($AAD), $GHASHDATA0
777    vpshufb         $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
778    vpxord          $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
779    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
780                    $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
781    @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
782                        $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
783    add             \$$VL, $AAD
784    sub             \$$VL, $AADLEN
785    cmp             \$$VL, $AADLEN
786    jae             .Laad_loop_1x$local_label_suffix
787
788.Laad_large_done$local_label_suffix:
789    # Issue the vzeroupper that is needed after using ymm or zmm registers.
790    # Do it here instead of at the end, to minimize overhead for small AADLEN.
791    vzeroupper
792
793    # GHASH the remaining data 16 bytes at a time, using xmm registers only.
794.Laad_blockbyblock$local_label_suffix:
795    test            $AADLEN, $AADLEN
796    jz              .Laad_done$local_label_suffix
797    vmovdqu         $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1_XMM
798.Laad_loop_blockbyblock$local_label_suffix:
799    vmovdqu         ($AAD), $GHASHDATA0_XMM
800    vpshufb         $BSWAP_MASK_XMM, $GHASHDATA0_XMM, $GHASHDATA0_XMM
801    vpxor           $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
802    @{[ _ghash_mul  $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
803                    $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
804    add             \$16, $AAD
805    sub             \$16, $AADLEN
806    jnz             .Laad_loop_blockbyblock$local_label_suffix
807
808.Laad_done$local_label_suffix:
809    # Store the updated GHASH accumulator back to memory.
810    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
811    vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
812___
813    return $code;
814}
815
816# Do one non-last round of AES encryption on the counter blocks in V0-V3 using
817# the round key that has been broadcast to all 128-bit lanes of \round_key.
818sub _vaesenc_4x {
819    my ($round_key) = @_;
820    return <<___;
821    vaesenc         $round_key, $V0, $V0
822    vaesenc         $round_key, $V1, $V1
823    vaesenc         $round_key, $V2, $V2
824    vaesenc         $round_key, $V3, $V3
825___
826}
827
828# Start the AES encryption of four vectors of counter blocks.
829sub _ctr_begin_4x {
830    return <<___;
831    # Increment LE_CTR four times to generate four vectors of little-endian
832    # counter blocks, swap each to big-endian, and store them in V0-V3.
833    vpshufb         $BSWAP_MASK, $LE_CTR, $V0
834    vpaddd          $LE_CTR_INC, $LE_CTR, $LE_CTR
835    vpshufb         $BSWAP_MASK, $LE_CTR, $V1
836    vpaddd          $LE_CTR_INC, $LE_CTR, $LE_CTR
837    vpshufb         $BSWAP_MASK, $LE_CTR, $V2
838    vpaddd          $LE_CTR_INC, $LE_CTR, $LE_CTR
839    vpshufb         $BSWAP_MASK, $LE_CTR, $V3
840    vpaddd          $LE_CTR_INC, $LE_CTR, $LE_CTR
841
842    # AES "round zero": XOR in the zero-th round key.
843    vpxord          $RNDKEY0, $V0, $V0
844    vpxord          $RNDKEY0, $V1, $V1
845    vpxord          $RNDKEY0, $V2, $V2
846    vpxord          $RNDKEY0, $V3, $V3
847___
848}
849
850# Do the last AES round for four vectors of counter blocks V0-V3, XOR source
851# data with the resulting keystream, and write the result to DST and
852# GHASHDATA[0-3].  (Implementation differs slightly, but has the same effect.)
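# The key^data trick used here relies on the structure of the last AES round,
# which is SubBytes and ShiftRows followed by an XOR with the round key:
#     vaesenclast(key ^ b, a) = ShiftRows(SubBytes(a)) ^ key ^ b
#                             = vaesenclast(key, a) ^ b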
853sub _aesenclast_and_xor_4x {
854    return <<___;
855    # XOR the source data with the last round key, saving the result in
856    # GHASHDATA[0-3].  This reduces latency by taking advantage of the
857    # property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
858    vpxord          0*$VL($SRC), $RNDKEYLAST, $GHASHDATA0
859    vpxord          1*$VL($SRC), $RNDKEYLAST, $GHASHDATA1
860    vpxord          2*$VL($SRC), $RNDKEYLAST, $GHASHDATA2
861    vpxord          3*$VL($SRC), $RNDKEYLAST, $GHASHDATA3
862
863    # Do the last AES round.  This handles the XOR with the source data
864    # too, as per the optimization described above.
865    vaesenclast     $GHASHDATA0, $V0, $GHASHDATA0
866    vaesenclast     $GHASHDATA1, $V1, $GHASHDATA1
867    vaesenclast     $GHASHDATA2, $V2, $GHASHDATA2
868    vaesenclast     $GHASHDATA3, $V3, $GHASHDATA3
869
870    # Store the en/decrypted data to DST.
871    vmovdqu8        $GHASHDATA0, 0*$VL($DST)
872    vmovdqu8        $GHASHDATA1, 1*$VL($DST)
873    vmovdqu8        $GHASHDATA2, 2*$VL($DST)
874    vmovdqu8        $GHASHDATA3, 3*$VL($DST)
875___
876}
877
878$g_update_macro_expansion_count = 0;
879
880# void aes_gcm_{enc,dec}_update_##suffix(const uint8_t *in, uint8_t *out,
881#                                        size_t len, const AES_KEY *key,
882#                                        const uint8_t ivec[16],
883#                                        const u128 Htable[16],
884#                                        uint8_t Xi[16]);
885#
886# This macro generates a GCM encryption or decryption update function with the
887# above prototype (with \enc selecting which one).  This macro supports both
888# VL=32 and VL=64.  _set_veclen must have been invoked with the desired length.
889#
890# This function computes the next portion of the CTR keystream, XOR's it with
891# |len| bytes from |in|, and writes the resulting encrypted or decrypted data
892# to |out|.  It also updates the GHASH accumulator |Xi| using the next |len|
893# ciphertext bytes.
894#
895# |len| must be a multiple of 16, except on the last call where it can be any
896# length.  The caller must do any buffering needed to ensure this.  Both
897# in-place and out-of-place en/decryption are supported.
898#
899# |ivec| must give the current counter in big-endian format.  This function
900# loads the counter from |ivec| and increments the loaded counter as needed, but
901# it does *not* store the updated counter back to |ivec|.  The caller must
902# update |ivec| if any more data segments follow.  Internally, only the low
903# 32-bit word of the counter is incremented, following the GCM standard.
904sub _aes_gcm_update {
905    my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count;
906
907    my ($enc) = @_;
908
909    my $code = "";
910
911    # Function arguments
912    ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $H_POWERS, $GHASH_ACC_PTR ) =
913      $win64
914      ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" )
915      : ( @argregs[ 0 .. 5 ], "%r12" );
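
    # In the Windows ABI only the first four arguments arrive in registers;
    # |ivec|, |Htable|, and |Xi| are loaded from the stack in the prologue
    # below and then held in %rsi, %rdi, and %r12.  In the SysV ABI only |Xi|
    # (the seventh argument) needs to be loaded from the stack.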
916
917    # Additional local variables
918
919    # %rax, %k1, and %k2 are used as temporary registers.  BE_CTR_PTR is
920    # also available as a temporary register after the counter is loaded.
921
922    # AES key length in bytes
923    ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" );
924
925    # Pointer to the last AES round key for the chosen AES variant
926    $RNDKEYLAST_PTR = "%r11";
927
928    # In the main loop, V0-V3 are used as AES input and output.  Elsewhere
929    # they are used as temporary registers.
930
931    # GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
932    ( $GHASHDATA0, $GHASHDATA0_XMM ) = ( $V4, "%xmm4" );
933    ( $GHASHDATA1, $GHASHDATA1_XMM ) = ( $V5, "%xmm5" );
934    ( $GHASHDATA2, $GHASHDATA2_XMM ) = ( $V6, "%xmm6" );
935    ( $GHASHDATA3, $GHASHDATA3_XMM ) = ( $V7, "%xmm7" );
936
937    # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
938    # using vpshufb, copied to all 128-bit lanes.
939    ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( $V8, "%xmm8" );
940
941    # RNDKEY temporarily holds the next AES round key.
942    $RNDKEY = $V9;
943
944    # GHASH_ACC is the accumulator variable for GHASH.  When fully reduced,
945    # only the lowest 128-bit lane can be nonzero.  When not fully reduced,
946    # more than one lane may be used, and they need to be XOR'd together.
947    ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( $V10, "%xmm10" );
948
949    # LE_CTR_INC is the vector of 32-bit words that need to be added to a
950    # vector of little-endian counter blocks to advance it forwards.
951    $LE_CTR_INC = $V11;
952
953    # LE_CTR contains the next set of little-endian counter blocks.
954    $LE_CTR = $V12;
955
956    # RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys,
957    # copied to all 128-bit lanes.  RNDKEY0 is the zero-th round key,
958    # RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
959    (
960        $RNDKEY0,   $RNDKEYLAST, $RNDKEY_M9, $RNDKEY_M8,
961        $RNDKEY_M7, $RNDKEY_M6,  $RNDKEY_M5, $RNDKEY_M4,
962        $RNDKEY_M3, $RNDKEY_M2,  $RNDKEY_M1
963    ) = ( $V13, $V14, $V15, $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23 );
964
965    # GHASHTMP[0-2] are temporary variables used by _ghash_step_4x.  These
966    # cannot coincide with anything used for AES encryption, since for
967    # performance reasons GHASH and AES encryption are interleaved.
968    ( $GHASHTMP0, $GHASHTMP1, $GHASHTMP2 ) = ( $V24, $V25, $V26 );
969
970    # H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1.  The
971    # descending numbering reflects the order of the key powers.
972    ( $H_POW4, $H_POW3, $H_POW2, $H_POW1 ) = ( $V27, $V28, $V29, $V30 );
973
974    # GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
975    $GFPOLY = $V31;
976
977    if ($win64) {
978        $code .= <<___;
979        @{[ _save_gpregs $BE_CTR_PTR, $H_POWERS, $GHASH_ACC_PTR ]}
980        mov             64(%rsp), $BE_CTR_PTR     # arg5
981        mov             72(%rsp), $H_POWERS       # arg6
982        mov             80(%rsp), $GHASH_ACC_PTR  # arg7
983        @{[ _save_xmmregs (6 .. 15) ]}
984        .seh_endprologue
985___
986    }
987    else {
988        $code .= <<___;
989        @{[ _save_gpregs $GHASH_ACC_PTR ]}
990        mov             16(%rsp), $GHASH_ACC_PTR  # arg7
991___
992    }
993
994    if ($enc) {
995        $code .= <<___;
996#ifdef BORINGSSL_DISPATCH_TEST
997        .extern BORINGSSL_function_hit
998        movb \$1,BORINGSSL_function_hit+@{[ $VL < 64 ? 6 : 7 ]}(%rip)
999#endif
1000___
1001    }
1002    $code .= <<___;
1003    # Load some constants.
1004    vbroadcasti32x4 .Lbswap_mask(%rip), $BSWAP_MASK
1005    vbroadcasti32x4 .Lgfpoly(%rip), $GFPOLY
1006
1007    # Load the GHASH accumulator and the starting counter.
1008    # BoringSSL passes these values in big endian format.
1009    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
1010    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
1011    vbroadcasti32x4 ($BE_CTR_PTR), $LE_CTR
1012    vpshufb         $BSWAP_MASK, $LE_CTR, $LE_CTR
1013
1014    # Load the AES key length in bytes.  BoringSSL stores number of rounds
1015    # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20.
1016    movl            $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN
1017    lea             -20(,$AESKEYLEN,4), $AESKEYLEN
1018
1019    # Make RNDKEYLAST_PTR point to the last AES round key.  This is the
1020    # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
1021    # respectively.  Then load the zero-th and last round keys.
1022    lea             6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR
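    # For example, with AES-128 AESKEYLEN is 16, so this computes
    # AESKEY + 4*16 + 6*16 = AESKEY + 160, the offset of round key 10.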
1023    vbroadcasti32x4 ($AESKEY), $RNDKEY0
1024    vbroadcasti32x4 ($RNDKEYLAST_PTR), $RNDKEYLAST
1025
1026    # Finish initializing LE_CTR by adding [0, 1, ...] to its low words.
1027    vpaddd          .Lctr_pattern(%rip), $LE_CTR, $LE_CTR
1028
1029    # Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
1030    vbroadcasti32x4 .Linc_@{[ $VL / 16 ]}blocks(%rip), $LE_CTR_INC
1031
1032    # If there are at least 4*VL bytes of data, then continue into the loop
1033    # that processes 4*VL bytes of data at a time.  Otherwise skip it.
1034    cmp             \$4*$VL-1, $DATALEN
1035    jbe             .Lcrypt_loop_4x_done$local_label_suffix
1036
1037    # Load powers of the hash key.
1038    vmovdqu8        $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
1039    vmovdqu8        $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
1040    vmovdqu8        $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2
1041    vmovdqu8        $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1
1042___
1043
1044    # Main loop: en/decrypt and hash 4 vectors at a time.
1045    #
1046    # When possible, interleave the AES encryption of the counter blocks
1047    # with the GHASH update of the ciphertext blocks.  This improves
1048    # performance on many CPUs because the execution ports used by the VAES
1049    # instructions often differ from those used by vpclmulqdq and other
1050    # instructions used in GHASH.  For example, many Intel CPUs dispatch
1051    # vaesenc to ports 0 and 1 and vpclmulqdq to port 5.
1052    #
1053    # The interleaving is easiest to do during decryption, since during
1054    # decryption the ciphertext blocks are immediately available.  For
1055    # encryption, instead encrypt the first set of blocks, then hash those
1056    # blocks while encrypting the next set of blocks, repeat that as
1057    # needed, and finally hash the last set of blocks.
1058
1059    if ($enc) {
1060        $code .= <<___;
1061        # Encrypt the first 4 vectors of plaintext blocks.  Leave the resulting
1062        # ciphertext in GHASHDATA[0-3] for GHASH.
1063        @{[ _ctr_begin_4x ]}
1064        lea             16($AESKEY), %rax
1065.Lvaesenc_loop_first_4_vecs$local_label_suffix:
1066        vbroadcasti32x4 (%rax), $RNDKEY
1067        @{[ _vaesenc_4x $RNDKEY ]}
1068        add             \$16, %rax
1069        cmp             %rax, $RNDKEYLAST_PTR
1070        jne             .Lvaesenc_loop_first_4_vecs$local_label_suffix
1071        @{[ _aesenclast_and_xor_4x ]}
1072        sub             \$-4*$VL, $SRC  # shorter than 'add 4*VL' when VL=32
1073        sub             \$-4*$VL, $DST
1074        add             \$-4*$VL, $DATALEN
1075        cmp             \$4*$VL-1, $DATALEN
1076        jbe             .Lghash_last_ciphertext_4x$local_label_suffix
1077___
1078    }
1079
1080    # Cache as many additional AES round keys as possible.
1081    for my $i ( reverse 1 .. 9 ) {
1082        $code .= <<___;
1083        vbroadcasti32x4 -$i*16($RNDKEYLAST_PTR), ${"RNDKEY_M$i"}
1084___
1085    }
1086
1087    $code .= <<___;
1088.Lcrypt_loop_4x$local_label_suffix:
1089___
1090
1091    # If decrypting, load more ciphertext blocks into GHASHDATA[0-3].  If
1092    # encrypting, GHASHDATA[0-3] already contain the previous ciphertext.
1093    if ( !$enc ) {
1094        $code .= <<___;
1095        vmovdqu8        0*$VL($SRC), $GHASHDATA0
1096        vmovdqu8        1*$VL($SRC), $GHASHDATA1
1097        vmovdqu8        2*$VL($SRC), $GHASHDATA2
1098        vmovdqu8        3*$VL($SRC), $GHASHDATA3
1099___
1100    }
1101
1102    $code .= <<___;
1103    # Start the AES encryption of the counter blocks.
1104    @{[ _ctr_begin_4x ]}
1105    cmp             \$24, $AESKEYLEN
1106    jl              .Laes128$local_label_suffix
1107    je              .Laes192$local_label_suffix
1108    # AES-256
1109    vbroadcasti32x4 -13*16($RNDKEYLAST_PTR), $RNDKEY
1110    @{[ _vaesenc_4x $RNDKEY ]}
1111    vbroadcasti32x4 -12*16($RNDKEYLAST_PTR), $RNDKEY
1112    @{[ _vaesenc_4x $RNDKEY ]}
1113.Laes192$local_label_suffix:
1114    vbroadcasti32x4 -11*16($RNDKEYLAST_PTR), $RNDKEY
1115    @{[ _vaesenc_4x $RNDKEY ]}
1116    vbroadcasti32x4 -10*16($RNDKEYLAST_PTR), $RNDKEY
1117    @{[ _vaesenc_4x $RNDKEY ]}
1118.Laes128$local_label_suffix:
1119___
1120
1121    # Finish the AES encryption of the counter blocks in V0-V3, interleaved
1122    # with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
1123    for my $i ( reverse 1 .. 9 ) {
1124        $code .= <<___;
1125        @{[ _ghash_step_4x  (9 - $i) ]}
1126        @{[ _vaesenc_4x     ${"RNDKEY_M$i"} ]}
1127___
1128    }
1129    $code .= <<___;
1130    @{[ _ghash_step_4x  9 ]}
1131    @{[ _aesenclast_and_xor_4x ]}
1132    sub             \$-4*$VL, $SRC  # shorter than 'add 4*VL' when VL=32
1133    sub             \$-4*$VL, $DST
1134    add             \$-4*$VL, $DATALEN
1135    cmp             \$4*$VL-1, $DATALEN
1136    ja              .Lcrypt_loop_4x$local_label_suffix
1137___
1138
1139    if ($enc) {
1140
1141        # Update GHASH with the last set of ciphertext blocks.
1142        $code .= <<___;
1143.Lghash_last_ciphertext_4x$local_label_suffix:
1144        @{[ _ghash_4x ]}
1145___
1146    }
1147
1148    my $POWERS_PTR = $BE_CTR_PTR;    # BE_CTR_PTR is free to be reused.
1149
1150    $code .= <<___;
1151.Lcrypt_loop_4x_done$local_label_suffix:
1152    # Check whether any data remains.
1153    test            $DATALEN, $DATALEN
1154    jz              .Ldone$local_label_suffix
1155
1156    # The data length isn't a multiple of 4*VL.  Process the remaining data
1157    # of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time.
1158    # Going one vector at a time may seem inefficient compared to having
1159    # separate code paths for each possible number of vectors remaining.
1160    # However, using a loop keeps the code size down, and it performs
1161    # surprisingly well; modern CPUs will start executing the next iteration
1162    # before the previous one finishes and also predict the number of loop
1163    # iterations.  For a similar reason, we roll up the AES rounds.
1164    #
1165    # On the last iteration, the remaining length may be less than VL.
1166    # Handle this using masking.
1167    #
1168    # Since there are enough key powers available for all remaining data,
1169    # there is no need to do a GHASH reduction after each iteration.
1170    # Instead, multiply each remaining block by its own key power, and only
1171    # do a GHASH reduction at the very end.
1172
1173    # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
1174    # is the number of blocks that remain.
1175    mov             $DATALEN, %rax
1176    neg             %rax
1177    and             \$-16, %rax  # -round_up(DATALEN, 16)
1178    lea             $OFFSETOFEND_H_POWERS($H_POWERS,%rax), $POWERS_PTR
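    # For example, if DATALEN is 100 then 7 blocks remain, %rax is -112, and
    # POWERS_PTR points at H^7; the last 112 bytes of the key powers array
    # hold H^7 through H^1, highest power first.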
1179___
1180
1181    # Start collecting the unreduced GHASH intermediate value LO, MI, HI.
1182    my ( $LO, $LO_XMM ) = ( $GHASHDATA0, $GHASHDATA0_XMM );
1183    my ( $MI, $MI_XMM ) = ( $GHASHDATA1, $GHASHDATA1_XMM );
1184    my ( $HI, $HI_XMM ) = ( $GHASHDATA2, $GHASHDATA2_XMM );
1185    $code .= <<___;
1186    vpxor           $LO_XMM, $LO_XMM, $LO_XMM
1187    vpxor           $MI_XMM, $MI_XMM, $MI_XMM
1188    vpxor           $HI_XMM, $HI_XMM, $HI_XMM
1189
1190    cmp             \$$VL, $DATALEN
1191    jb              .Lpartial_vec$local_label_suffix
1192
1193.Lcrypt_loop_1x$local_label_suffix:
1194    # Process a full vector of length VL.
1195
1196    # Encrypt a vector of counter blocks.
1197    vpshufb         $BSWAP_MASK, $LE_CTR, $V0
1198    vpaddd          $LE_CTR_INC, $LE_CTR, $LE_CTR
1199    vpxord          $RNDKEY0, $V0, $V0
1200    lea             16($AESKEY), %rax
1201.Lvaesenc_loop_tail_full_vec$local_label_suffix:
1202    vbroadcasti32x4 (%rax), $RNDKEY
1203    vaesenc         $RNDKEY, $V0, $V0
1204    add             \$16, %rax
1205    cmp             %rax, $RNDKEYLAST_PTR
1206    jne             .Lvaesenc_loop_tail_full_vec$local_label_suffix
1207    vaesenclast     $RNDKEYLAST, $V0, $V0
1208
1209    # XOR the data with the vector of keystream blocks.
1210    vmovdqu8        ($SRC), $V1
1211    vpxord          $V1, $V0, $V0
1212    vmovdqu8        $V0, ($DST)
1213
1214    # Update GHASH with the ciphertext blocks, without reducing.
1215    vmovdqu8        ($POWERS_PTR), $H_POW1
1216    vpshufb         $BSWAP_MASK, @{[ $enc ? $V0 : $V1 ]}, $V0
1217    vpxord          $GHASH_ACC, $V0, $V0
1218    @{[ _ghash_mul_noreduce $H_POW1, $V0, $LO, $MI, $HI, $GHASHDATA3,
1219                            $V1, $V2, $V3 ]}
1220    vpxor           $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
1221
1222    add             \$$VL, $POWERS_PTR
1223    add             \$$VL, $SRC
1224    add             \$$VL, $DST
1225    sub             \$$VL, $DATALEN
1226    cmp             \$$VL, $DATALEN
1227    jae             .Lcrypt_loop_1x$local_label_suffix
1228
1229    test            $DATALEN, $DATALEN
1230    jz              .Lreduce$local_label_suffix
1231
1232.Lpartial_vec$local_label_suffix:
1233    # Process a partial vector of length 1 <= DATALEN < VL.
1234
1235    # Set the data mask %k1 to DATALEN 1's.
1236    # Set the key powers mask %k2 to round_up(DATALEN, 16) 1's.
1237    mov             \$-1, %rax
1238    bzhi            $DATALEN, %rax, %rax
1239    @{[ $VL < 64 ? "kmovd %eax, %k1" : "kmovq %rax, %k1" ]}
1240    add             \$15, $DATALEN
1241    and             \$-16, $DATALEN
1242    mov             \$-1, %rax
1243    bzhi            $DATALEN, %rax, %rax
1244    @{[ $VL < 64 ? "kmovd %eax, %k2" : "kmovq %rax, %k2" ]}
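    # For example, if DATALEN is 20, then %k1 covers bytes 0-19 of the vector
    # and %k2 covers bytes 0-31, i.e. the two 16-byte blocks that contain data.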
1245
1246    # Encrypt one last vector of counter blocks.  This does not need to be
1247    # masked.  The counter does not need to be incremented here.
1248    vpshufb         $BSWAP_MASK, $LE_CTR, $V0
1249    vpxord          $RNDKEY0, $V0, $V0
1250    lea             16($AESKEY), %rax
1251.Lvaesenc_loop_tail_partialvec$local_label_suffix:
1252    vbroadcasti32x4 (%rax), $RNDKEY
1253    vaesenc         $RNDKEY, $V0, $V0
1254    add             \$16, %rax
1255    cmp             %rax, $RNDKEYLAST_PTR
1256    jne             .Lvaesenc_loop_tail_partialvec$local_label_suffix
1257    vaesenclast     $RNDKEYLAST, $V0, $V0
1258
1259    # XOR the data with the appropriate number of keystream bytes.
1260    vmovdqu8        ($SRC), $V1\{%k1}{z}
1261    vpxord          $V1, $V0, $V0
1262    vmovdqu8        $V0, ($DST){%k1}
1263
1264    # Update GHASH with the ciphertext block(s), without reducing.
1265    #
1266    # In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
1267    # (If decrypting, it's done by the above masked load.  If encrypting,
1268    # it's done by the below masked register-to-register move.)  Note that
1269    # if DATALEN <= VL - 16, there will be additional padding beyond the
1270    # padding of the last block specified by GHASH itself; i.e., there may
1271    # be whole block(s) that get processed by the GHASH multiplication and
1272    # reduction instructions but should not actually be included in the
1273    # GHASH.  However, any such blocks are all-zeroes, and the values that
1274    # they're multiplied with are also all-zeroes.  Therefore they just add
1275    # 0 * 0 = 0 to the final GHASH result, which makes no difference.
1276    vmovdqu8        ($POWERS_PTR), $H_POW1\{%k2}{z}
1277    @{[ $enc ? "vmovdqu8 $V0, $V1\{%k1}{z}" : "" ]}
1278    vpshufb         $BSWAP_MASK, $V1, $V0
1279    vpxord          $GHASH_ACC, $V0, $V0
1280    @{[ _ghash_mul_noreduce $H_POW1, $V0, $LO, $MI, $HI, $GHASHDATA3,
1281                            $V1, $V2, $V3 ]}
1282
1283.Lreduce$local_label_suffix:
1284    # Finally, do the GHASH reduction.
1285    @{[ _ghash_reduce   $LO, $MI, $HI, $GFPOLY, $V0 ]}
1286    @{[ _horizontal_xor $HI, $HI_XMM, $GHASH_ACC_XMM,
1287                        "%xmm0", "%xmm1", "%xmm2" ]}
1288
1289.Ldone$local_label_suffix:
1290    # Store the updated GHASH accumulator back to memory.
1291    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
1292    vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
1293
1294    vzeroupper      # This is needed after using ymm or zmm registers.
1295___
1296    return $code;
1297}
1298
1299# void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
1300$code .= _begin_func "gcm_gmult_vpclmulqdq_avx10", 1;
1301{
1302    my ( $GHASH_ACC_PTR, $H_POWERS ) = @argregs[ 0 .. 1 ];
1303    my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
1304      map( "%xmm$_", ( 0 .. 6 ) );
1305
1306    $code .= <<___;
1307    @{[ _save_xmmregs (6) ]}
1308    .seh_endprologue
1309
1310    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC
1311    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK
1312    vmovdqu         $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1
1313    vmovdqu         .Lgfpoly(%rip), $GFPOLY
1314    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
1315
1316    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}
1317
1318    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
1319    vmovdqu         $GHASH_ACC, ($GHASH_ACC_PTR)
1320___
1321}
1322$code .= _end_func;
1323
1324# Disabled until significant deployment of AVX10/256 is seen.  The separate
1325# *_vaes_avx2 implementation provides the only 256-bit support for now.
1326#
1327# $code .= _begin_func "gcm_init_vpclmulqdq_avx10_256", 0;
1328# $code .= _aes_gcm_init;
1329# $code .= _end_func;
1330#
1331# $code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_256", 1;
1332# $code .= _ghash_update;
1333# $code .= _end_func;
1334#
1335# $code .= _begin_func "aes_gcm_enc_update_vaes_avx10_256", 1;
1336# $code .= _aes_gcm_update 1;
1337# $code .= _end_func;
1338#
1339# $code .= _begin_func "aes_gcm_dec_update_vaes_avx10_256", 1;
1340# $code .= _aes_gcm_update 0;
1341# $code .= _end_func;
1342
1343_set_veclen 64;
1344
1345$code .= _begin_func "gcm_init_vpclmulqdq_avx10_512", 0;
1346$code .= _aes_gcm_init;
1347$code .= _end_func;
1348
1349$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512", 1;
1350$code .= _ghash_update;
1351$code .= _end_func;
1352
1353$code .= _begin_func "aes_gcm_enc_update_vaes_avx10_512", 1;
1354$code .= _aes_gcm_update 1;
1355$code .= _end_func;
1356
1357$code .= _begin_func "aes_gcm_dec_update_vaes_avx10_512", 1;
1358$code .= _aes_gcm_update 0;
1359$code .= _end_func;
1360
1361print $code;
1362close STDOUT or die "error closing STDOUT: $!";
1363exit 0;
1364