#!/usr/bin/env perl
# Copyright 2024 The BoringSSL Authors
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
#------------------------------------------------------------------------------
#
# VAES and VPCLMULQDQ optimized AES-GCM for x86_64 (AVX2 version)
#
# This is similar to aes-gcm-avx10-x86_64.pl, but it uses AVX2 instead of AVX512
# / AVX10.  This means it can only use 16 vector registers instead of 32, the
# maximum vector length is 32 bytes, and some instructions such as vpternlogd
# and masked loads/stores are unavailable.  However, it is able to run on CPUs
# that have VAES without AVX512 / AVX10, namely AMD Zen 3 (including "Milan"
# server processors) and some Intel client CPUs such as Alder Lake.
#
# This implementation also uses Karatsuba multiplication instead of schoolbook
# multiplication for GHASH in its main loop.  This does not help much on Intel,
# but it improves performance by ~5% on AMD Zen 3 which is the main target for
# this implementation.  Other factors weighing slightly in favor of Karatsuba
# multiplication in this implementation are the lower maximum vector length
# (which means there is space left in the Htable array to cache the halves of
# the key powers XOR'd together) and the unavailability of the vpternlogd
# instruction (which helped schoolbook a bit more than Karatsuba).
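#
# As a reminder, Karatsuba multiplication of 128-bit values a = a_H:a_L and
# b = b_H:b_L over GF(2) computes the 256-bit product from three carryless
# multiplications instead of four:
#
#     LO = a_L * b_L
#     HI = a_H * b_H
#     MI = (a_L + a_H) * (b_L + b_H) + LO + HI      (where + is XOR)
#
# with the product being HI:LO plus MI shifted left by 64 bits.  The XOR'd
# 64-bit halves of the key powers cached in Htable are the precomputed
# (a_L + a_H) factors for the key operand.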

use strict;

my $flavour = shift;
my $output  = shift;
if ( $flavour =~ /\./ ) { $output = $flavour; undef $flavour; }

my $win64;
my @argregs;
if ( $flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/ ) {
    $win64   = 1;
    @argregs = ( "%rcx", "%rdx", "%r8", "%r9" );
}
else {
    $win64   = 0;
    @argregs = ( "%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9" );
}

$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate = "${dir}x86_64-xlate.pl" and -f $xlate )
  or ( $xlate = "${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate )
  or die "can't locate x86_64-xlate.pl";

open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;

my $g_cur_func_name;
my $g_cur_func_uses_seh;
my @g_cur_func_saved_gpregs;
my @g_cur_func_saved_xmmregs;

sub _begin_func {
    my ( $funcname, $uses_seh ) = @_;
    $g_cur_func_name          = $funcname;
    $g_cur_func_uses_seh      = $uses_seh;
    @g_cur_func_saved_gpregs  = ();
    @g_cur_func_saved_xmmregs = ();
    return <<___;
.globl $funcname
.type $funcname,\@abi-omnipotent
.align 32
$funcname:
    .cfi_startproc
    @{[ $uses_seh ? ".seh_startproc" : "" ]}
    _CET_ENDBR
___
}

# Push a list of general purpose registers onto the stack.
sub _save_gpregs {
    my @gpregs = @_;
    my $code   = "";
    die "_save_gpregs requires uses_seh" unless $g_cur_func_uses_seh;
    die "_save_gpregs can only be called once per function"
      if @g_cur_func_saved_gpregs;
    die "Order must be _save_gpregs, then _save_xmmregs"
      if @g_cur_func_saved_xmmregs;
    @g_cur_func_saved_gpregs = @gpregs;
    for my $reg (@gpregs) {
        $code .= "push $reg\n";
        if ($win64) {
            $code .= ".seh_pushreg $reg\n";
        }
        else {
            $code .= ".cfi_push $reg\n";
        }
    }
    return $code;
}

# Push a list of xmm registers onto the stack if the target is Windows.
sub _save_xmmregs {
    my @xmmregs     = @_;
    my $num_xmmregs = scalar @xmmregs;
    my $code        = "";
    die "_save_xmmregs requires uses_seh" unless $g_cur_func_uses_seh;
    die "_save_xmmregs can only be called once per function"
      if @g_cur_func_saved_xmmregs;
    if ( $win64 and $num_xmmregs > 0 ) {
        @g_cur_func_saved_xmmregs = @xmmregs;
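        # Note: on function entry %rsp is 8 mod 16 (the call pushed the return
        # address).  After an even number of 8-byte GPR pushes it is still
        # 8 mod 16, so allocate an extra 8 bytes to make the movdqa saves
        # below 16-byte aligned.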
        my $is_misaligned = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
        my $alloc_size    = 16 * $num_xmmregs + ( $is_misaligned ? 8 : 0 );
        $code .= "sub \$$alloc_size, %rsp\n";
        $code .= ".seh_stackalloc $alloc_size\n";
        for my $i ( 0 .. $num_xmmregs - 1 ) {
            my $reg_num = $xmmregs[$i];
            my $pos     = 16 * $i;
            $code .= "movdqa %xmm$reg_num, $pos(%rsp)\n";
            $code .= ".seh_savexmm %xmm$reg_num, $pos\n";
        }
    }
    return $code;
}

sub _end_func {
    my $code = "";

    # Restore any xmm registers that were saved earlier.
    my $num_xmmregs = scalar @g_cur_func_saved_xmmregs;
    if ( $win64 and $num_xmmregs > 0 ) {
        my $need_alignment = ( scalar @g_cur_func_saved_gpregs ) % 2 == 0;
        my $alloc_size     = 16 * $num_xmmregs + ( $need_alignment ? 8 : 0 );
        for my $i ( 0 .. $num_xmmregs - 1 ) {
            my $reg_num = $g_cur_func_saved_xmmregs[$i];
            my $pos     = 16 * $i;
            $code .= "movdqa $pos(%rsp), %xmm$reg_num\n";
        }
        $code .= "add \$$alloc_size, %rsp\n";
    }

    # Restore any general purpose registers that were saved earlier.
    for my $reg ( reverse @g_cur_func_saved_gpregs ) {
        $code .= "pop $reg\n";
        if ( !$win64 ) {
            $code .= ".cfi_pop $reg\n";
        }
    }

    $code .= <<___;
    ret
    @{[ $g_cur_func_uses_seh ? ".seh_endproc" : "" ]}
    .cfi_endproc
    .size   $g_cur_func_name, . - $g_cur_func_name
___
    return $code;
}

my $code = <<___;
.section .rodata
.align 16

    # A shuffle mask that reflects the bytes of 16-byte blocks
.Lbswap_mask:
    .quad   0x08090a0b0c0d0e0f, 0x0001020304050607

    # This is the GHASH reducing polynomial without its constant term, i.e.
    # x^128 + x^7 + x^2 + x, represented using the backwards mapping
    # between bits and polynomial coefficients.
    #
    # Alternatively, it can be interpreted as the naturally-ordered
    # representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
    # "reversed" GHASH reducing polynomial without its x^128 term.
.Lgfpoly:
    .quad   1, 0xc200000000000000

    # Same as above, but with the (1 << 64) bit set.
.Lgfpoly_and_internal_carrybit:
    .quad   1, 0xc200000000000001

.align 32
    # The below constants are used for incrementing the counter blocks.
.Lctr_pattern:
    .quad   0, 0
    .quad   1, 0
.Linc_2blocks:
    .quad   2, 0
    .quad   2, 0

.text
___

# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the
# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication)
# in the order 8,6,7,5,4,2,3,1.  We do not use Htable[12..15].
my $NUM_H_POWERS            = 8;
my $OFFSETOFEND_H_POWERS    = $NUM_H_POWERS * 16;
my $OFFSETOF_H_POWERS_XORED = $OFFSETOFEND_H_POWERS;
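# In byte offsets, the resulting Htable layout is:
#   [0, 128):   H^8,H^7 | H^6,H^5 | H^4,H^3 | H^2,H^1 (one pair per 32 bytes)
#   [128, 192): the XOR'd 64-bit halves of the key powers, order 8,6,7,5,4,2,3,1
#   [192, 256): unused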

# Offset to 'rounds' in AES_KEY struct
my $OFFSETOF_AES_ROUNDS = 240;
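# (In BoringSSL's AES_KEY, rd_key[] holds 4 * (14 + 1) = 60 32-bit words, i.e.
# 240 bytes, so the 'rounds' field follows at byte offset 240.)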

# GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
# the reduced products in \dst.  Uses schoolbook multiplication.
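#
# Reduction sketch: the unreduced product is accumulated as LO (bits 0..127),
# MI (bits 64..191), and HI (bits 128..255).  LO is folded into MI and then MI
# into HI; each fold XORs in the half-swapped lower value together with its low
# half carryless-multiplied by x^63 + x^62 + x^57 (the .Lgfpoly constant),
# reducing modulo the GHASH polynomial 64 bits at a time.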
sub _ghash_mul {
    my ( $a, $b, $dst, $gfpoly, $t0, $t1, $t2 ) = @_;
    return <<___;
    vpclmulqdq      \$0x00, $a, $b, $t0        # LO = a_L * b_L
    vpclmulqdq      \$0x01, $a, $b, $t1        # MI_0 = a_L * b_H
    vpclmulqdq      \$0x10, $a, $b, $t2        # MI_1 = a_H * b_L
    vpxor           $t2, $t1, $t1              # MI = MI_0 + MI_1
    vpclmulqdq      \$0x01, $t0, $gfpoly, $t2  # LO_L*(x^63 + x^62 + x^57)
    vpshufd         \$0x4e, $t0, $t0           # Swap halves of LO
    vpxor           $t0, $t1, $t1              # Fold LO into MI (part 1)
    vpxor           $t2, $t1, $t1              # Fold LO into MI (part 2)
    vpclmulqdq      \$0x11, $a, $b, $dst       # HI = a_H * b_H
    vpclmulqdq      \$0x01, $t1, $gfpoly, $t0  # MI_L*(x^63 + x^62 + x^57)
    vpshufd         \$0x4e, $t1, $t1           # Swap halves of MI
    vpxor           $t1, $dst, $dst            # Fold MI into HI (part 1)
    vpxor           $t0, $dst, $dst            # Fold MI into HI (part 2)
___
}

# void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]);
#
# Initialize |Htable| with powers of the GHASH subkey |H|.
#
# We use Htable[0..7] to store H^8 through H^1, and Htable[8..11] to store the
# 64-bit halves of the key powers XOR'd together (for Karatsuba multiplication)
# in the order 8,6,7,5,4,2,3,1.  We do not use Htable[12..15].
$code .= _begin_func "gcm_init_vpclmulqdq_avx2", 1;
{
    my ( $HTABLE, $H_PTR ) = @argregs[ 0 .. 1 ];
    my ( $TMP0,   $TMP0_XMM )   = ( "%ymm0", "%xmm0" );
    my ( $TMP1,   $TMP1_XMM )   = ( "%ymm1", "%xmm1" );
    my ( $TMP2,   $TMP2_XMM )   = ( "%ymm2", "%xmm2" );
    my ( $H_CUR,  $H_CUR_XMM )  = ( "%ymm3", "%xmm3" );
    my ( $H_CUR2, $H_CUR2_XMM ) = ( "%ymm4", "%xmm4" );
    my ( $H_INC,  $H_INC_XMM )  = ( "%ymm5", "%xmm5" );
    my ( $GFPOLY, $GFPOLY_XMM ) = ( "%ymm6", "%xmm6" );

    $code .= <<___;
    @{[ _save_xmmregs (6) ]}
    .seh_endprologue

    # Load the byte-reflected hash subkey.  BoringSSL provides it in
    # byte-reflected form except the two halves are in the wrong order.
    vpshufd         \$0x4e, ($H_PTR), $H_CUR_XMM

    # Finish preprocessing the byte-reflected hash subkey by multiplying it by
    # x^-1 ("standard" interpretation of polynomial coefficients) or
    # equivalently x^1 (natural interpretation).  This gets the key into a
    # format that avoids having to bit-reflect the data blocks later.
    vpshufd         \$0xd3, $H_CUR_XMM, $TMP0_XMM
    vpsrad          \$31, $TMP0_XMM, $TMP0_XMM
    vpaddq          $H_CUR_XMM, $H_CUR_XMM, $H_CUR_XMM
    vpand           .Lgfpoly_and_internal_carrybit(%rip), $TMP0_XMM, $TMP0_XMM
    vpxor           $TMP0_XMM, $H_CUR_XMM, $H_CUR_XMM

    vbroadcasti128  .Lgfpoly(%rip), $GFPOLY

    # Square H^1 to get H^2.
    @{[ _ghash_mul  $H_CUR_XMM, $H_CUR_XMM, $H_INC_XMM, $GFPOLY_XMM,
                    $TMP0_XMM, $TMP1_XMM, $TMP2_XMM ]}

    # Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2].
    vinserti128     \$1, $H_CUR_XMM, $H_INC, $H_CUR
    vinserti128     \$1, $H_INC_XMM, $H_INC, $H_INC

    # Compute H_CUR2 = [H^4, H^3].
    @{[ _ghash_mul  $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}

    # Store [H^2, H^1] and [H^4, H^3].
    vmovdqu         $H_CUR, 3*32($HTABLE)
    vmovdqu         $H_CUR2, 2*32($HTABLE)

    # For Karatsuba multiplication: compute and store the two 64-bit halves of
    # each key power XOR'd together.  Order is 4,2,3,1.
    vpunpcklqdq     $H_CUR, $H_CUR2, $TMP0
    vpunpckhqdq     $H_CUR, $H_CUR2, $TMP1
    vpxor           $TMP1, $TMP0, $TMP0
    vmovdqu         $TMP0, $OFFSETOF_H_POWERS_XORED+32($HTABLE)

    # Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7].
    @{[ _ghash_mul  $H_INC, $H_CUR2, $H_CUR, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}
    @{[ _ghash_mul  $H_INC, $H_CUR, $H_CUR2, $GFPOLY, $TMP0, $TMP1, $TMP2 ]}
    vmovdqu         $H_CUR, 1*32($HTABLE)
    vmovdqu         $H_CUR2, 0*32($HTABLE)

    # Again, compute and store the two 64-bit halves of each key power XOR'd
    # together.  Order is 8,6,7,5.
    vpunpcklqdq     $H_CUR, $H_CUR2, $TMP0
    vpunpckhqdq     $H_CUR, $H_CUR2, $TMP1
    vpxor           $TMP1, $TMP0, $TMP0
    vmovdqu         $TMP0, $OFFSETOF_H_POWERS_XORED($HTABLE)

    vzeroupper
___
}
$code .= _end_func;

# Do one step of the GHASH update of four vectors of data blocks.
#   $i: the step to do, 0 through 9
#   $ghashdata_ptr: pointer to the data blocks (ciphertext or AAD)
#   $htable: pointer to the Htable for the key
#   $bswap_mask: mask for reflecting the bytes of blocks
#   $h_pow[2-1]_xored: XOR'd key powers cached from Htable
#   $tmp[0-2]: temporary registers.  $tmp[1-2] must be preserved across steps.
#   $lo, $mi: working state for this macro that must be preserved across steps
#   $ghash_acc: the GHASH accumulator (input/output)
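# The update is split into 10 steps so that callers can interleave it with
# other work: _ghash_4x below emits all 10 steps back to back, while the main
# en/decryption loop emits one step per AES round.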
sub _ghash_step_4x {
    my (
        $i,            $ghashdata_ptr, $htable, $bswap_mask,
        $h_pow2_xored, $h_pow1_xored,  $tmp0,   $tmp0_xmm,
        $tmp1,         $tmp2,          $lo,     $mi,
        $ghash_acc,    $ghash_acc_xmm
    ) = @_;
    my ( $hi, $hi_xmm ) = ( $ghash_acc, $ghash_acc_xmm );    # alias
    if ( $i == 0 ) {
        return <<___;
        # First vector
        vmovdqu         0*32($ghashdata_ptr), $tmp1
        vpshufb         $bswap_mask, $tmp1, $tmp1
        vmovdqu         0*32($htable), $tmp2
        vpxor           $ghash_acc, $tmp1, $tmp1
        vpclmulqdq      \$0x00, $tmp2, $tmp1, $lo
        vpclmulqdq      \$0x11, $tmp2, $tmp1, $hi
        vpunpckhqdq     $tmp1, $tmp1, $tmp0
        vpxor           $tmp1, $tmp0, $tmp0
        vpclmulqdq      \$0x00, $h_pow2_xored, $tmp0, $mi
___
    }
    elsif ( $i == 1 ) {
        return <<___;
___
    }
    elsif ( $i == 2 ) {
        return <<___;
        # Second vector
        vmovdqu         1*32($ghashdata_ptr), $tmp1
        vpshufb         $bswap_mask, $tmp1, $tmp1
        vmovdqu         1*32($htable), $tmp2
        vpclmulqdq      \$0x00, $tmp2, $tmp1, $tmp0
        vpxor           $tmp0, $lo, $lo
        vpclmulqdq      \$0x11, $tmp2, $tmp1, $tmp0
        vpxor           $tmp0, $hi, $hi
        vpunpckhqdq     $tmp1, $tmp1, $tmp0
        vpxor           $tmp1, $tmp0, $tmp0
        vpclmulqdq      \$0x10, $h_pow2_xored, $tmp0, $tmp0
        vpxor           $tmp0, $mi, $mi
___
    }
    elsif ( $i == 3 ) {
        return <<___;
        # Third vector
        vmovdqu         2*32($ghashdata_ptr), $tmp1
        vpshufb         $bswap_mask, $tmp1, $tmp1
        vmovdqu         2*32($htable), $tmp2
___
    }
    elsif ( $i == 4 ) {
        return <<___;
        vpclmulqdq      \$0x00, $tmp2, $tmp1, $tmp0
        vpxor           $tmp0, $lo, $lo
        vpclmulqdq      \$0x11, $tmp2, $tmp1, $tmp0
        vpxor           $tmp0, $hi, $hi
___
    }
    elsif ( $i == 5 ) {
        return <<___;
        vpunpckhqdq     $tmp1, $tmp1, $tmp0
        vpxor           $tmp1, $tmp0, $tmp0
        vpclmulqdq      \$0x00, $h_pow1_xored, $tmp0, $tmp0
        vpxor           $tmp0, $mi, $mi

        # Fourth vector
        vmovdqu         3*32($ghashdata_ptr), $tmp1
        vpshufb         $bswap_mask, $tmp1, $tmp1
___
    }
    elsif ( $i == 6 ) {
        return <<___;
        vmovdqu         3*32($htable), $tmp2
        vpclmulqdq      \$0x00, $tmp2, $tmp1, $tmp0
        vpxor           $tmp0, $lo, $lo
        vpclmulqdq      \$0x11, $tmp2, $tmp1, $tmp0
        vpxor           $tmp0, $hi, $hi
        vpunpckhqdq     $tmp1, $tmp1, $tmp0
        vpxor           $tmp1, $tmp0, $tmp0
        vpclmulqdq      \$0x10, $h_pow1_xored, $tmp0, $tmp0
        vpxor           $tmp0, $mi, $mi
___
    }
    elsif ( $i == 7 ) {
        return <<___;
        # Finalize 'mi' following Karatsuba multiplication.
        vpxor           $lo, $mi, $mi
        vpxor           $hi, $mi, $mi

        # Fold lo into mi.
        vbroadcasti128  .Lgfpoly(%rip), $tmp2
        vpclmulqdq      \$0x01, $lo, $tmp2, $tmp0
        vpshufd         \$0x4e, $lo, $lo
        vpxor           $lo, $mi, $mi
        vpxor           $tmp0, $mi, $mi
___
    }
    elsif ( $i == 8 ) {
        return <<___;
        # Fold mi into hi.
        vpclmulqdq      \$0x01, $mi, $tmp2, $tmp0
        vpshufd         \$0x4e, $mi, $mi
        vpxor           $mi, $hi, $hi
        vpxor           $tmp0, $hi, $hi
___
    }
    elsif ( $i == 9 ) {
        return <<___;
        vextracti128    \$1, $hi, $tmp0_xmm
        vpxor           $tmp0_xmm, $hi_xmm, $ghash_acc_xmm
___
    }
}

sub _ghash_4x {
    my $code = "";
    for my $i ( 0 .. 9 ) {
        $code .= _ghash_step_4x $i, @_;
    }
    return $code;
}

# void gcm_gmult_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16]);
$code .= _begin_func "gcm_gmult_vpclmulqdq_avx2", 1;
{
    my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
    my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
      map( "%xmm$_", ( 0 .. 6 ) );

    $code .= <<___;
    @{[ _save_xmmregs (6) ]}
    .seh_endprologue

    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC
    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK
    vmovdqu         $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
    vmovdqu         .Lgfpoly(%rip), $GFPOLY
    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC

    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}

    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
    vmovdqu         $GHASH_ACC, ($GHASH_ACC_PTR)
___
}
$code .= _end_func;

# void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16],
#                                const uint8_t *in, size_t len);
#
# Using the key |Htable|, update the GHASH accumulator |Xi| with the data given
# by |in| and |len|.  |len| must be a multiple of 16.
#
# This function handles large amounts of AAD efficiently, while also keeping the
# overhead low for small amounts of AAD which is the common case.  TLS uses less
# than one block of AAD, but (uncommonly) other use cases may use much more.
$code .= _begin_func "gcm_ghash_vpclmulqdq_avx2", 1;
{
    # Function arguments
    my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AADLEN ) = @argregs[ 0 .. 3 ];

    # Additional local variables
    my ( $TMP0,       $TMP0_XMM )       = ( "%ymm0", "%xmm0" );
    my ( $TMP1,       $TMP1_XMM )       = ( "%ymm1", "%xmm1" );
    my ( $TMP2,       $TMP2_XMM )       = ( "%ymm2", "%xmm2" );
    my ( $LO,         $LO_XMM )         = ( "%ymm3", "%xmm3" );
    my ( $MI,         $MI_XMM )         = ( "%ymm4", "%xmm4" );
    my ( $GHASH_ACC,  $GHASH_ACC_XMM )  = ( "%ymm5", "%xmm5" );
    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm6", "%xmm6" );
    my ( $GFPOLY,     $GFPOLY_XMM )     = ( "%ymm7", "%xmm7" );
    my $H_POW2_XORED = "%ymm8";
    my $H_POW1_XORED = "%ymm9";

    $code .= <<___;
    @{[ _save_xmmregs (6 .. 9) ]}
    .seh_endprologue

    vbroadcasti128  .Lbswap_mask(%rip), $BSWAP_MASK
    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    vbroadcasti128  .Lgfpoly(%rip), $GFPOLY

    # Optimize for AADLEN < 32 by checking for AADLEN < 32 before AADLEN < 128.
    cmp             \$32, $AADLEN
    jb              .Lghash_lastblock

    cmp             \$127, $AADLEN
    jbe             .Lghash_loop_1x

    # Update GHASH with 128 bytes of AAD at a time.
    vmovdqu         $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED
    vmovdqu         $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED
.Lghash_loop_4x:
    @{[ _ghash_4x   $AAD, $HTABLE, $BSWAP_MASK, $H_POW2_XORED, $H_POW1_XORED,
                    $TMP0, $TMP0_XMM, $TMP1, $TMP2, $LO, $MI, $GHASH_ACC,
                    $GHASH_ACC_XMM ]}
    sub             \$-128, $AAD  # 128 is 4 bytes, -128 is 1 byte
    add             \$-128, $AADLEN
    cmp             \$127, $AADLEN
    ja              .Lghash_loop_4x

    # Update GHASH with 32 bytes of AAD at a time.
    cmp             \$32, $AADLEN
    jb              .Lghash_loop_1x_done
.Lghash_loop_1x:
    vmovdqu         ($AAD), $TMP0
    vpshufb         $BSWAP_MASK, $TMP0, $TMP0
    vpxor           $TMP0, $GHASH_ACC, $GHASH_ACC
    vmovdqu         $OFFSETOFEND_H_POWERS-32($HTABLE), $TMP0
    @{[ _ghash_mul  $TMP0, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $TMP1, $TMP2, $LO ]}
    vextracti128    \$1, $GHASH_ACC, $TMP0_XMM
    vpxor           $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    add             \$32, $AAD
    sub             \$32, $AADLEN
    cmp             \$32, $AADLEN
    jae             .Lghash_loop_1x
.Lghash_loop_1x_done:
    # Issue the vzeroupper that is needed after using ymm registers.  Do it here
    # instead of at the end, to minimize overhead for small AADLEN.
    vzeroupper

    # Update GHASH with the remaining 16-byte block if any.
.Lghash_lastblock:
    test            $AADLEN, $AADLEN
    jz              .Lghash_done
    vmovdqu         ($AAD), $TMP0_XMM
    vpshufb         $BSWAP_MASK_XMM, $TMP0_XMM, $TMP0_XMM
    vpxor           $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    vmovdqu         $OFFSETOFEND_H_POWERS-16($HTABLE), $TMP0_XMM
    @{[ _ghash_mul  $TMP0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
                    $TMP1_XMM, $TMP2_XMM, $LO_XMM ]}

.Lghash_done:
    # Store the updated GHASH accumulator back to memory.
    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)
___
}
$code .= _end_func;

sub _vaesenc_4x {
    my ( $round_key, $aesdata0, $aesdata1, $aesdata2, $aesdata3 ) = @_;
    return <<___;
    vaesenc         $round_key, $aesdata0, $aesdata0
    vaesenc         $round_key, $aesdata1, $aesdata1
    vaesenc         $round_key, $aesdata2, $aesdata2
    vaesenc         $round_key, $aesdata3, $aesdata3
___
}

sub _ctr_begin_4x {
    my (
        $le_ctr,   $bswap_mask, $rndkey0,  $aesdata0,
        $aesdata1, $aesdata2,   $aesdata3, $tmp
    ) = @_;
    return <<___;
    # Increment le_ctr four times to generate four vectors of little-endian
    # counter blocks, swap each to big-endian, and store them in aesdata[0-3].
    vmovdqu         .Linc_2blocks(%rip), $tmp
    vpshufb         $bswap_mask, $le_ctr, $aesdata0
    vpaddd          $tmp, $le_ctr, $le_ctr
    vpshufb         $bswap_mask, $le_ctr, $aesdata1
    vpaddd          $tmp, $le_ctr, $le_ctr
    vpshufb         $bswap_mask, $le_ctr, $aesdata2
    vpaddd          $tmp, $le_ctr, $le_ctr
    vpshufb         $bswap_mask, $le_ctr, $aesdata3
    vpaddd          $tmp, $le_ctr, $le_ctr

    # AES "round zero": XOR in the zero-th round key.
    vpxor           $rndkey0, $aesdata0, $aesdata0
    vpxor           $rndkey0, $aesdata1, $aesdata1
    vpxor           $rndkey0, $aesdata2, $aesdata2
    vpxor           $rndkey0, $aesdata3, $aesdata3
___
}

# Do the last AES round for four vectors of counter blocks, XOR four vectors of
# source data with the resulting keystream blocks, and write the result to the
# destination buffer.  The implementation differs slightly from that
# description, as it takes advantage of the property
# vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) to reduce latency, but it
# has the same effect.
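# (The property holds because the last AES round ends with an XOR of the round
# key into the state, so XOR'ing the source data into the round key instead
# gives the same result while letting the loads and XORs run ahead of the
# vaesenclast instructions.)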
sub _aesenclast_and_xor_4x {
    my (
        $src,      $dst,      $rndkeylast, $aesdata0,
        $aesdata1, $aesdata2, $aesdata3,   $t0,
        $t1,       $t2,       $t3
    ) = @_;
    return <<___;
    vpxor           0*32($src), $rndkeylast, $t0
    vpxor           1*32($src), $rndkeylast, $t1
    vpxor           2*32($src), $rndkeylast, $t2
    vpxor           3*32($src), $rndkeylast, $t3
    vaesenclast     $t0, $aesdata0, $aesdata0
    vaesenclast     $t1, $aesdata1, $aesdata1
    vaesenclast     $t2, $aesdata2, $aesdata2
    vaesenclast     $t3, $aesdata3, $aesdata3
    vmovdqu         $aesdata0, 0*32($dst)
    vmovdqu         $aesdata1, 1*32($dst)
    vmovdqu         $aesdata2, 2*32($dst)
    vmovdqu         $aesdata3, 3*32($dst)
___
}

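# Number of times _aes_gcm_update has been expanded so far; used to give each
# expansion (encryption and decryption) its own set of local label names.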
my $g_update_macro_expansion_count = 0;

# void aes_gcm_{enc,dec}_update_vaes_avx2(const uint8_t *in, uint8_t *out,
#                                         size_t len, const AES_KEY *key,
#                                         const uint8_t ivec[16],
#                                         const u128 Htable[16],
#                                         uint8_t Xi[16]);
#
# This macro generates a GCM encryption or decryption update function with the
# above prototype (with \enc selecting which one).  The function computes the
# next portion of the CTR keystream, XOR's it with |len| bytes from |in|, and
# writes the resulting encrypted or decrypted data to |out|.  It also updates
# the GHASH accumulator |Xi| using the next |len| ciphertext bytes.
#
# |len| must be a multiple of 16.  The caller must do any buffering needed to
# ensure this.  Both in-place and out-of-place en/decryption are supported.
#
# |ivec| must give the current counter in big-endian format.  This function
# loads the counter from |ivec| and increments the loaded counter as needed, but
# it does *not* store the updated counter back to |ivec|.  The caller must
# update |ivec| if any more data segments follow.  Internally, only the low
# 32-bit word of the counter is incremented, following the GCM standard.
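#
# The generated function consists of a main loop that en/decrypts and GHASHes
# 128 bytes (four ymm vectors) per iteration, with each GHASH update
# interleaved into a pass over the AES rounds, followed by tail code that
# handles any remaining 16 to 112 bytes and performs the final GHASH reduction.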
sub _aes_gcm_update {
    my $local_label_suffix = "__func" . ++$g_update_macro_expansion_count;
    my ($enc)              = @_;
    my $code               = "";

    # Function arguments
    my ( $SRC, $DST, $DATALEN, $AESKEY, $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR )
      = $win64
      ? ( @argregs[ 0 .. 3 ], "%rsi", "%rdi", "%r12" )
      : ( @argregs[ 0 .. 5 ], "%r12" );

    # Additional local variables.
    # %rax is used as a temporary register.  BE_CTR_PTR is also available as a
    # temporary register after the counter is loaded.

    # AES key length in bytes
    my ( $AESKEYLEN, $AESKEYLEN64 ) = ( "%r10d", "%r10" );

    # Pointer to the last AES round key for the chosen AES variant
    my $RNDKEYLAST_PTR = "%r11";

    # BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
    # using vpshufb, copied to all 128-bit lanes.
    my ( $BSWAP_MASK, $BSWAP_MASK_XMM ) = ( "%ymm0", "%xmm0" );

    # GHASH_ACC is the accumulator variable for GHASH.  When fully reduced,
    # only the lowest 128-bit lane can be nonzero.  When not fully reduced,
    # more than one lane may be used, and they need to be XOR'd together.
    my ( $GHASH_ACC, $GHASH_ACC_XMM ) = ( "%ymm1", "%xmm1" );

    # TMP[0-2] are temporary registers.
    my ( $TMP0, $TMP0_XMM ) = ( "%ymm2", "%xmm2" );
    my ( $TMP1, $TMP1_XMM ) = ( "%ymm3", "%xmm3" );
    my ( $TMP2, $TMP2_XMM ) = ( "%ymm4", "%xmm4" );

    # LO and MI are used to accumulate unreduced GHASH products.
    my ( $LO, $LO_XMM ) = ( "%ymm5", "%xmm5" );
    my ( $MI, $MI_XMM ) = ( "%ymm6", "%xmm6" );

    # Cached key powers from Htable
    my ( $H_POW2_XORED, $H_POW2_XORED_XMM ) = ( "%ymm7", "%xmm7" );
    my ( $H_POW1_XORED, $H_POW1_XORED_XMM ) = ( "%ymm8", "%xmm8" );

    # RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one.
    my $RNDKEY0    = "%ymm9";
    my $RNDKEYLAST = "%ymm10";

    # LE_CTR contains the next set of little-endian counter blocks.
    my $LE_CTR = "%ymm11";

    # AESDATA[0-3] hold the counter blocks that are being encrypted by AES.
    my ( $AESDATA0, $AESDATA0_XMM ) = ( "%ymm12", "%xmm12" );
    my ( $AESDATA1, $AESDATA1_XMM ) = ( "%ymm13", "%xmm13" );
    my ( $AESDATA2, $AESDATA2_XMM ) = ( "%ymm14", "%xmm14" );
    my ( $AESDATA3, $AESDATA3_XMM ) = ( "%ymm15", "%xmm15" );
    my @AESDATA = ( $AESDATA0, $AESDATA1, $AESDATA2, $AESDATA3 );

    my @ghash_4x_args = (
        $enc ? $DST : $SRC, $HTABLE, $BSWAP_MASK, $H_POW2_XORED,
        $H_POW1_XORED,      $TMP0,   $TMP0_XMM,   $TMP1,
        $TMP2,              $LO,     $MI,         $GHASH_ACC,
        $GHASH_ACC_XMM
    );

    if ($win64) {
        $code .= <<___;
        @{[ _save_gpregs $BE_CTR_PTR, $HTABLE, $GHASH_ACC_PTR ]}
        mov             64(%rsp), $BE_CTR_PTR     # arg5
        mov             72(%rsp), $HTABLE         # arg6
        mov             80(%rsp), $GHASH_ACC_PTR  # arg7
        @{[ _save_xmmregs (6 .. 15) ]}
        .seh_endprologue
___
    }
    else {
        $code .= <<___;
        @{[ _save_gpregs $GHASH_ACC_PTR ]}
        mov             16(%rsp), $GHASH_ACC_PTR  # arg7
___
    }

    if ($enc) {
        $code .= <<___;
#ifdef BORINGSSL_DISPATCH_TEST
        .extern BORINGSSL_function_hit
        movb \$1,BORINGSSL_function_hit+8(%rip)
#endif
___
    }
    $code .= <<___;
    vbroadcasti128  .Lbswap_mask(%rip), $BSWAP_MASK

    # Load the GHASH accumulator and the starting counter.
    # BoringSSL passes these values in big endian format.
    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    vbroadcasti128  ($BE_CTR_PTR), $LE_CTR
    vpshufb         $BSWAP_MASK, $LE_CTR, $LE_CTR

    # Load the AES key length in bytes.  BoringSSL stores number of rounds
    # minus 1, so convert using: AESKEYLEN = 4 * aeskey->rounds - 20.
    movl            $OFFSETOF_AES_ROUNDS($AESKEY), $AESKEYLEN
    lea             -20(,$AESKEYLEN,4), $AESKEYLEN
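    # (For example, with the convention above, AES-256 is stored as rounds=13,
    # giving AESKEYLEN = 4*13 - 20 = 32 and, below, RNDKEYLAST_PTR =
    # AESKEY + 6*16 + 4*32 = AESKEY + 14*16, i.e. the 14th round key.)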

    # Make RNDKEYLAST_PTR point to the last AES round key.  This is the
    # round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
    # respectively.  Then load the zero-th and last round keys.
    lea             6*16($AESKEY,$AESKEYLEN64,4), $RNDKEYLAST_PTR
    vbroadcasti128  ($AESKEY), $RNDKEY0
    vbroadcasti128  ($RNDKEYLAST_PTR), $RNDKEYLAST

    # Finish initializing LE_CTR by adding 1 to the second block.
    vpaddd          .Lctr_pattern(%rip), $LE_CTR, $LE_CTR

    # If there are at least 128 bytes of data, then continue into the loop that
    # processes 128 bytes of data at a time.  Otherwise skip it.
    cmp             \$127, $DATALEN
    jbe             .Lcrypt_loop_4x_done$local_label_suffix

    vmovdqu         $OFFSETOF_H_POWERS_XORED($HTABLE), $H_POW2_XORED
    vmovdqu         $OFFSETOF_H_POWERS_XORED+32($HTABLE), $H_POW1_XORED
___

    # Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time.

    if ($enc) {
        $code .= <<___;
        # Encrypt the first 4 vectors of plaintext blocks.
        @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]}
        lea             16($AESKEY), %rax
.Lvaesenc_loop_first_4_vecs$local_label_suffix:
        vbroadcasti128  (%rax), $TMP0
        @{[ _vaesenc_4x $TMP0, @AESDATA ]}
        add             \$16, %rax
        cmp             %rax, $RNDKEYLAST_PTR
        jne             .Lvaesenc_loop_first_4_vecs$local_label_suffix
        @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA,
                                   $TMP0, $TMP1, $LO, $MI ]}
        sub             \$-128, $SRC  # 128 is 4 bytes, -128 is 1 byte
        add             \$-128, $DATALEN
        cmp             \$127, $DATALEN
        jbe             .Lghash_last_ciphertext_4x$local_label_suffix
___
    }

    $code .= <<___;
.align 16
.Lcrypt_loop_4x$local_label_suffix:

    # Start the AES encryption of the counter blocks.
    @{[ _ctr_begin_4x $LE_CTR, $BSWAP_MASK, $RNDKEY0, @AESDATA, $TMP0 ]}
    cmp             \$24, $AESKEYLEN
    jl              .Laes128$local_label_suffix
    je              .Laes192$local_label_suffix
    # AES-256
    vbroadcasti128 -13*16($RNDKEYLAST_PTR), $TMP0
    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
    vbroadcasti128 -12*16($RNDKEYLAST_PTR), $TMP0
    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
.Laes192$local_label_suffix:
    vbroadcasti128 -11*16($RNDKEYLAST_PTR), $TMP0
    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
    vbroadcasti128 -10*16($RNDKEYLAST_PTR), $TMP0
    @{[ _vaesenc_4x $TMP0, @AESDATA ]}
.Laes128$local_label_suffix:
___

    # Finish the AES encryption of the counter blocks in AESDATA[0-3],
    # interleaved with the GHASH update of the ciphertext blocks.
    for my $i ( reverse 1 .. 9 ) {
        $code .= <<___;
        @{[ _ghash_step_4x 9-$i, @ghash_4x_args ]}
        vbroadcasti128  -$i*16($RNDKEYLAST_PTR), $TMP0
        @{[ _vaesenc_4x $TMP0, @AESDATA ]}
___
    }
    $code .= <<___;
    @{[ _ghash_step_4x 9, @ghash_4x_args ]}

    @{[ $enc ? "sub \$-128, $DST" : "" ]}  # 128 is 4 bytes, -128 is 1 byte
    @{[ _aesenclast_and_xor_4x $SRC, $DST, $RNDKEYLAST, @AESDATA,
                               $TMP0, $TMP1, $LO, $MI ]}
    sub             \$-128, $SRC
    @{[ !$enc ? "sub \$-128, $DST" : "" ]}
    add             \$-128, $DATALEN
    cmp             \$127, $DATALEN
    ja              .Lcrypt_loop_4x$local_label_suffix
___

    if ($enc) {

        # Update GHASH with the last set of ciphertext blocks.
        $code .= <<___;
.Lghash_last_ciphertext_4x$local_label_suffix:
        @{[ _ghash_4x @ghash_4x_args ]}
        sub             \$-128, $DST
___
    }

    my $POWERS_PTR = $BE_CTR_PTR;    # BE_CTR_PTR is free to be reused.
    my ( $HI, $HI_XMM ) = ( $H_POW2_XORED, $H_POW2_XORED_XMM );    # reuse

    $code .= <<___;
.Lcrypt_loop_4x_done$local_label_suffix:
    # Check whether any data remains.
    test            $DATALEN, $DATALEN
    jz              .Ldone$local_label_suffix

    # DATALEN is in [16, 32, 48, 64, 80, 96, 112].

    # Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
    # is the number of blocks that remain.
    lea             $OFFSETOFEND_H_POWERS($HTABLE), $POWERS_PTR
    sub             $DATALEN, $POWERS_PTR

    # Start collecting the unreduced GHASH intermediate value LO, MI, HI.
    vpxor           $LO_XMM, $LO_XMM, $LO_XMM
    vpxor           $MI_XMM, $MI_XMM, $MI_XMM
    vpxor           $HI_XMM, $HI_XMM, $HI_XMM

    cmp             \$64, $DATALEN
    jb              .Llessthan64bytes$local_label_suffix

    # DATALEN is in [64, 80, 96, 112].  Encrypt two vectors of counter blocks.
    vpshufb         $BSWAP_MASK, $LE_CTR, $AESDATA0
    vpaddd          .Linc_2blocks(%rip), $LE_CTR, $LE_CTR
    vpshufb         $BSWAP_MASK, $LE_CTR, $AESDATA1
    vpaddd          .Linc_2blocks(%rip), $LE_CTR, $LE_CTR
    vpxor           $RNDKEY0, $AESDATA0, $AESDATA0
    vpxor           $RNDKEY0, $AESDATA1, $AESDATA1
    lea             16($AESKEY), %rax
.Lvaesenc_loop_tail_1$local_label_suffix:
    vbroadcasti128  (%rax), $TMP0
    vaesenc         $TMP0, $AESDATA0, $AESDATA0
    vaesenc         $TMP0, $AESDATA1, $AESDATA1
    add             \$16, %rax
    cmp             %rax, $RNDKEYLAST_PTR
    jne             .Lvaesenc_loop_tail_1$local_label_suffix
    vaesenclast     $RNDKEYLAST, $AESDATA0, $AESDATA0
    vaesenclast     $RNDKEYLAST, $AESDATA1, $AESDATA1

    # XOR the data with the two vectors of keystream blocks.
    vmovdqu         0($SRC), $TMP0
    vmovdqu         32($SRC), $TMP1
    vpxor           $TMP0, $AESDATA0, $AESDATA0
    vpxor           $TMP1, $AESDATA1, $AESDATA1
    vmovdqu         $AESDATA0, 0($DST)
    vmovdqu         $AESDATA1, 32($DST)

    # Update GHASH with two vectors of ciphertext blocks, without reducing.
    vpshufb         $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0
    vpshufb         $BSWAP_MASK, @{[ $enc ? $AESDATA1 : $TMP1 ]}, $AESDATA1
    vpxor           $GHASH_ACC, $AESDATA0, $AESDATA0
    vmovdqu         ($POWERS_PTR), $TMP0
    vmovdqu         32($POWERS_PTR), $TMP1
    vpclmulqdq      \$0x00, $TMP0, $AESDATA0, $LO
    vpclmulqdq      \$0x01, $TMP0, $AESDATA0, $MI
    vpclmulqdq      \$0x10, $TMP0, $AESDATA0, $TMP2
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x11, $TMP0, $AESDATA0, $HI
    vpclmulqdq      \$0x00, $TMP1, $AESDATA1, $TMP2
    vpxor           $TMP2, $LO, $LO
    vpclmulqdq      \$0x01, $TMP1, $AESDATA1, $TMP2
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x10, $TMP1, $AESDATA1, $TMP2
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x11, $TMP1, $AESDATA1, $TMP2
    vpxor           $TMP2, $HI, $HI

    add             \$64, $POWERS_PTR
    add             \$64, $SRC
    add             \$64, $DST
    sub             \$64, $DATALEN
    jz              .Lreduce$local_label_suffix

    vpxor           $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM

    # DATALEN is in [16, 32, 48].  Encrypt two last vectors of counter blocks.
.Llessthan64bytes$local_label_suffix:
    vpshufb         $BSWAP_MASK, $LE_CTR, $AESDATA0
    vpaddd          .Linc_2blocks(%rip), $LE_CTR, $LE_CTR
    vpshufb         $BSWAP_MASK, $LE_CTR, $AESDATA1
    vpxor           $RNDKEY0, $AESDATA0, $AESDATA0
    vpxor           $RNDKEY0, $AESDATA1, $AESDATA1
    lea             16($AESKEY), %rax
.Lvaesenc_loop_tail_2$local_label_suffix:
    vbroadcasti128  (%rax), $TMP0
    vaesenc         $TMP0, $AESDATA0, $AESDATA0
    vaesenc         $TMP0, $AESDATA1, $AESDATA1
    add             \$16, %rax
    cmp             %rax, $RNDKEYLAST_PTR
    jne             .Lvaesenc_loop_tail_2$local_label_suffix
    vaesenclast     $RNDKEYLAST, $AESDATA0, $AESDATA0
    vaesenclast     $RNDKEYLAST, $AESDATA1, $AESDATA1

    # XOR the remaining data with the keystream blocks, and update GHASH with
    # the remaining ciphertext blocks without reducing.

    cmp             \$32, $DATALEN
    jb              .Lxor_one_block$local_label_suffix
    je              .Lxor_two_blocks$local_label_suffix

.Lxor_three_blocks$local_label_suffix:
    vmovdqu         0($SRC), $TMP0
    vmovdqu         32($SRC), $TMP1_XMM
    vpxor           $TMP0, $AESDATA0, $AESDATA0
    vpxor           $TMP1_XMM, $AESDATA1_XMM, $AESDATA1_XMM
    vmovdqu         $AESDATA0, 0($DST)
    vmovdqu         $AESDATA1_XMM, 32($DST)

    vpshufb         $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0
    vpshufb         $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA1_XMM : $TMP1_XMM ]}, $AESDATA1_XMM
    vpxor           $GHASH_ACC, $AESDATA0, $AESDATA0
    vmovdqu         ($POWERS_PTR), $TMP0
    vmovdqu         32($POWERS_PTR), $TMP1_XMM
    vpclmulqdq      \$0x00, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
    vpxor           $TMP2, $LO, $LO
    vpclmulqdq      \$0x01, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x10, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x11, $TMP1_XMM, $AESDATA1_XMM, $TMP2_XMM
    vpxor           $TMP2, $HI, $HI
    jmp             .Lghash_mul_one_vec_unreduced$local_label_suffix

.Lxor_two_blocks$local_label_suffix:
    vmovdqu         ($SRC), $TMP0
    vpxor           $TMP0, $AESDATA0, $AESDATA0
    vmovdqu         $AESDATA0, ($DST)
    vpshufb         $BSWAP_MASK, @{[ $enc ? $AESDATA0 : $TMP0 ]}, $AESDATA0
    vpxor           $GHASH_ACC, $AESDATA0, $AESDATA0
    vmovdqu         ($POWERS_PTR), $TMP0
    jmp             .Lghash_mul_one_vec_unreduced$local_label_suffix

.Lxor_one_block$local_label_suffix:
    vmovdqu         ($SRC), $TMP0_XMM
    vpxor           $TMP0_XMM, $AESDATA0_XMM, $AESDATA0_XMM
    vmovdqu         $AESDATA0_XMM, ($DST)
    vpshufb         $BSWAP_MASK_XMM, @{[ $enc ? $AESDATA0_XMM : $TMP0_XMM ]}, $AESDATA0_XMM
    vpxor           $GHASH_ACC_XMM, $AESDATA0_XMM, $AESDATA0_XMM
    vmovdqu         ($POWERS_PTR), $TMP0_XMM

.Lghash_mul_one_vec_unreduced$local_label_suffix:
    vpclmulqdq      \$0x00, $TMP0, $AESDATA0, $TMP2
    vpxor           $TMP2, $LO, $LO
    vpclmulqdq      \$0x01, $TMP0, $AESDATA0, $TMP2
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x10, $TMP0, $AESDATA0, $TMP2
    vpxor           $TMP2, $MI, $MI
    vpclmulqdq      \$0x11, $TMP0, $AESDATA0, $TMP2
    vpxor           $TMP2, $HI, $HI

.Lreduce$local_label_suffix:
    # Finally, do the GHASH reduction.
    vbroadcasti128  .Lgfpoly(%rip), $TMP0
    vpclmulqdq      \$0x01, $LO, $TMP0, $TMP1
    vpshufd         \$0x4e, $LO, $LO
    vpxor           $LO, $MI, $MI
    vpxor           $TMP1, $MI, $MI
    vpclmulqdq      \$0x01, $MI, $TMP0, $TMP1
    vpshufd         \$0x4e, $MI, $MI
    vpxor           $MI, $HI, $HI
    vpxor           $TMP1, $HI, $HI
    vextracti128    \$1, $HI, $GHASH_ACC_XMM
    vpxor           $HI_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM

.Ldone$local_label_suffix:
    # Store the updated GHASH accumulator back to memory.
    vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
    vmovdqu         $GHASH_ACC_XMM, ($GHASH_ACC_PTR)

    vzeroupper
___
    return $code;
}

$code .= _begin_func "aes_gcm_enc_update_vaes_avx2", 1;
$code .= _aes_gcm_update 1;
$code .= _end_func;

$code .= _begin_func "aes_gcm_dec_update_vaes_avx2", 1;
$code .= _aes_gcm_update 0;
$code .= _end_func;

print $code;
close STDOUT or die "error closing STDOUT: $!";
exit 0;