#!/usr/bin/env perl # Copyright (c) 2017, Shay Gueron. # Copyright (c) 2017, Google Inc. # # Permission to use, copy, modify, and/or distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ use warnings FATAL => 'all'; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $code.=<<___; .data .align 16 one: .quad 1,0 two: .quad 2,0 three: .quad 3,0 four: .quad 4,0 five: .quad 5,0 six: .quad 6,0 seven: .quad 7,0 eight: .quad 8,0 OR_MASK: .long 0x00000000,0x00000000,0x00000000,0x80000000 poly: .quad 0x1, 0xc200000000000000 mask: .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d con1: .long 1,1,1,1 con2: .long 0x1b,0x1b,0x1b,0x1b con3: .byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7 and_mask: .long 0,0xffffffff, 0xffffffff, 0xffffffff ___ $code.=<<___; .text ___ sub gfmul { ######################### # a = T # b = TMP0 - remains unchanged # res = T # uses also TMP1,TMP2,TMP3,TMP4 # __m128i GFMUL(__m128i A, __m128i B); my $T = "%xmm0"; my $TMP0 = "%xmm1"; my $TMP1 = "%xmm2"; my $TMP2 = "%xmm3"; my $TMP3 = "%xmm4"; my $TMP4 = "%xmm5"; $code.=<<___; .type GFMUL,\@abi-omnipotent .align 16 GFMUL: .cfi_startproc vpclmulqdq \$0x00, $TMP0, $T, $TMP1 vpclmulqdq \$0x11, $TMP0, $T, $TMP4 vpclmulqdq \$0x10, $TMP0, $T, $TMP2 vpclmulqdq \$0x01, $TMP0, $T, $TMP3 vpxor $TMP3, $TMP2, $TMP2 vpslldq \$8, $TMP2, $TMP3 vpsrldq \$8, $TMP2, $TMP2 vpxor $TMP3, $TMP1, $TMP1 vpxor $TMP2, $TMP4, $TMP4 vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2 vpshufd \$78, $TMP1, $TMP3 vpxor $TMP3, $TMP2, $TMP1 vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2 vpshufd \$78, $TMP1, $TMP3 vpxor $TMP3, $TMP2, $TMP1 vpxor $TMP4, $TMP1, $T ret .cfi_endproc .size GFMUL, .-GFMUL ___ } gfmul(); sub aesgcmsiv_htable_init { # aesgcmsiv_htable_init writes an eight-entry table of powers of |H| to # |out_htable|. # void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H); my $Htbl = "%rdi"; my $H = "%rsi"; my $T = "%xmm0"; my $TMP0 = "%xmm1"; $code.=<<___; .globl aesgcmsiv_htable_init .type aesgcmsiv_htable_init,\@function,2 .align 16 aesgcmsiv_htable_init: .cfi_startproc vmovdqa ($H), $T vmovdqa $T, $TMP0 vmovdqa $T, ($Htbl) # H call GFMUL vmovdqa $T, 16($Htbl) # H^2 call GFMUL vmovdqa $T, 32($Htbl) # H^3 call GFMUL vmovdqa $T, 48($Htbl) # H^4 call GFMUL vmovdqa $T, 64($Htbl) # H^5 call GFMUL vmovdqa $T, 80($Htbl) # H^6 call GFMUL vmovdqa $T, 96($Htbl) # H^7 call GFMUL vmovdqa $T, 112($Htbl) # H^8 ret .cfi_endproc .size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init ___ } aesgcmsiv_htable_init(); sub aesgcmsiv_htable6_init { # aesgcmsiv_htable6_init writes a six-entry table of powers of |H| to # |out_htable|. # void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H); # my $Htbl = "%rdi"; my $H = "%rsi"; my $T = "%xmm0"; my $TMP0 = "%xmm1"; $code.=<<___; .globl aesgcmsiv_htable6_init .type aesgcmsiv_htable6_init,\@function,2 .align 16 aesgcmsiv_htable6_init: .cfi_startproc vmovdqa ($H), $T vmovdqa $T, $TMP0 vmovdqa $T, ($Htbl) # H call GFMUL vmovdqa $T, 16($Htbl) # H^2 call GFMUL vmovdqa $T, 32($Htbl) # H^3 call GFMUL vmovdqa $T, 48($Htbl) # H^4 call GFMUL vmovdqa $T, 64($Htbl) # H^5 call GFMUL vmovdqa $T, 80($Htbl) # H^6 ret .cfi_endproc .size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init ___ } aesgcmsiv_htable6_init(); sub aesgcmsiv_htable_polyval { # void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T); # parameter 1: %rdi Htable - pointer to Htable # parameter 2: %rsi INp - pointer to input # parameter 3: %rdx LEN - length of BUFFER in bytes # parameter 4: %rcx T - pointer to POLYVAL output my $DATA = "%xmm0"; my $hlp0 = "%r11"; my $Htbl = "%rdi"; my $inp = "%rsi"; my $len = "%rdx"; my $TMP0 = "%xmm3"; my $TMP1 = "%xmm4"; my $TMP2 = "%xmm5"; my $TMP3 = "%xmm6"; my $TMP4 = "%xmm7"; my $Tp = "%rcx"; my $T = "%xmm1"; my $Xhi = "%xmm9"; my $SCHOOLBOOK_AAD = sub { my ($i)=@_; return <<___; vpclmulqdq \$0x01, ${\eval(16*$i)}($Htbl), $DATA, $TMP3 vpxor $TMP3, $TMP2, $TMP2 vpclmulqdq \$0x00, ${\eval(16*$i)}($Htbl), $DATA, $TMP3 vpxor $TMP3, $TMP0, $TMP0 vpclmulqdq \$0x11, ${\eval(16*$i)}($Htbl), $DATA, $TMP3 vpxor $TMP3, $TMP1, $TMP1 vpclmulqdq \$0x10, ${\eval(16*$i)}($Htbl), $DATA, $TMP3 vpxor $TMP3, $TMP2, $TMP2 ___ }; $code.=<<___; .globl aesgcmsiv_htable_polyval .type aesgcmsiv_htable_polyval,\@function,4 .align 16 aesgcmsiv_htable_polyval: .cfi_startproc test $len, $len jnz .Lhtable_polyval_start ret .Lhtable_polyval_start: vzeroall # We hash 8 blocks each iteration. If the total number of blocks is not a # multiple of 8, we first hash the leading n%8 blocks. movq $len, $hlp0 andq \$127, $hlp0 jz .Lhtable_polyval_no_prefix vpxor $Xhi, $Xhi, $Xhi vmovdqa ($Tp), $T sub $hlp0, $len sub \$16, $hlp0 # hash first prefix block vmovdqu ($inp), $DATA vpxor $T, $DATA, $DATA vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2 vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0 vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1 vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3 vpxor $TMP3, $TMP2, $TMP2 lea 16($inp), $inp test $hlp0, $hlp0 jnz .Lhtable_polyval_prefix_loop jmp .Lhtable_polyval_prefix_complete # hash remaining prefix bocks (up to 7 total prefix blocks) .align 64 .Lhtable_polyval_prefix_loop: sub \$16, $hlp0 vmovdqu ($inp), $DATA # next data block vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP3 vpxor $TMP3, $TMP0, $TMP0 vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP3 vpxor $TMP3, $TMP1, $TMP1 vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP3 vpxor $TMP3, $TMP2, $TMP2 vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3 vpxor $TMP3, $TMP2, $TMP2 test $hlp0, $hlp0 lea 16($inp), $inp jnz .Lhtable_polyval_prefix_loop .Lhtable_polyval_prefix_complete: vpsrldq \$8, $TMP2, $TMP3 vpslldq \$8, $TMP2, $TMP2 vpxor $TMP3, $TMP1, $Xhi vpxor $TMP2, $TMP0, $T jmp .Lhtable_polyval_main_loop .Lhtable_polyval_no_prefix: # At this point we know the number of blocks is a multiple of 8. However, # the reduction in the main loop includes a multiplication by x^(-128). In # order to counter this, the existing tag needs to be multipled by x^128. # In practice, this just means that it is loaded into $Xhi, not $T. vpxor $T, $T, $T vmovdqa ($Tp), $Xhi .align 64 .Lhtable_polyval_main_loop: sub \$0x80, $len jb .Lhtable_polyval_out vmovdqu 16*7($inp), $DATA # Ii vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2 vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0 vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1 vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3 vpxor $TMP3, $TMP2, $TMP2 ######################################################### vmovdqu 16*6($inp), $DATA ${\$SCHOOLBOOK_AAD->(1)} ######################################################### vmovdqu 16*5($inp), $DATA vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 1a vpalignr \$8, $T, $T, $T ${\$SCHOOLBOOK_AAD->(2)} vpxor $TMP4, $T, $T # reduction stage 1b ######################################################### vmovdqu 16*4($inp), $DATA ${\$SCHOOLBOOK_AAD->(3)} ######################################################### vmovdqu 16*3($inp), $DATA vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 2a vpalignr \$8, $T, $T, $T ${\$SCHOOLBOOK_AAD->(4)} vpxor $TMP4, $T, $T # reduction stage 2b ######################################################### vmovdqu 16*2($inp), $DATA ${\$SCHOOLBOOK_AAD->(5)} vpxor $Xhi, $T, $T # reduction finalize ######################################################### vmovdqu 16*1($inp), $DATA ${\$SCHOOLBOOK_AAD->(6)} ######################################################### vmovdqu 16*0($inp), $DATA vpxor $T, $DATA, $DATA ${\$SCHOOLBOOK_AAD->(7)} ######################################################### vpsrldq \$8, $TMP2, $TMP3 vpslldq \$8, $TMP2, $TMP2 vpxor $TMP3, $TMP1, $Xhi vpxor $TMP2, $TMP0, $T lea 16*8($inp), $inp jmp .Lhtable_polyval_main_loop ######################################################### .Lhtable_polyval_out: vpclmulqdq \$0x10, poly(%rip), $T, $TMP3 vpalignr \$8, $T, $T, $T vpxor $TMP3, $T, $T vpclmulqdq \$0x10, poly(%rip), $T, $TMP3 vpalignr \$8, $T, $T, $T vpxor $TMP3, $T, $T vpxor $Xhi, $T, $T vmovdqu $T, ($Tp) vzeroupper ret .cfi_endproc .size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval ___ } aesgcmsiv_htable_polyval(); sub aesgcmsiv_polyval_horner { #void aesgcmsiv_polyval_horner(unsigned char T[16], // output # const unsigned char* H, // H # unsigned char* BUF, // Buffer # unsigned int blocks); // Len2 # # parameter 1: %rdi T - pointers to POLYVAL output # parameter 2: %rsi Hp - pointer to H (user key) # parameter 3: %rdx INp - pointer to input # parameter 4: %rcx L - total number of blocks in input BUFFER # my $T = "%rdi"; my $Hp = "%rsi"; my $INp = "%rdx"; my $L = "%rcx"; my $LOC = "%r10"; my $LEN = "%eax"; my $H = "%xmm1"; my $RES = "%xmm0"; $code.=<<___; .globl aesgcmsiv_polyval_horner .type aesgcmsiv_polyval_horner,\@function,4 .align 16 aesgcmsiv_polyval_horner: .cfi_startproc test $L, $L jnz .Lpolyval_horner_start ret .Lpolyval_horner_start: # We will start with L GFMULS for POLYVAL(BIG_BUFFER) # RES = GFMUL(RES, H) xorq $LOC, $LOC shlq \$4, $L # L contains number of bytes to process vmovdqa ($Hp), $H vmovdqa ($T), $RES .Lpolyval_horner_loop: vpxor ($INp,$LOC), $RES, $RES # RES = RES + Xi call GFMUL # RES = RES * H add \$16, $LOC cmp $LOC, $L jne .Lpolyval_horner_loop # calculation of T is complete. RES=T vmovdqa $RES, ($T) ret .cfi_endproc .size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner ___ } aesgcmsiv_polyval_horner(); # void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key); # parameter 1: %rdi # parameter 2: %rsi $code.=<<___; .globl aes128gcmsiv_aes_ks .type aes128gcmsiv_aes_ks,\@function,2 .align 16 aes128gcmsiv_aes_ks: .cfi_startproc vmovdqu (%rdi), %xmm1 # xmm1 = user key vmovdqa %xmm1, (%rsi) # rsi points to output vmovdqa con1(%rip), %xmm0 vmovdqa mask(%rip), %xmm15 movq \$8, %rax .Lks128_loop: addq \$16, %rsi # rsi points for next key subq \$1, %rax vpshufb %xmm15, %xmm1, %xmm2 # xmm2 = shuffled user key vaesenclast %xmm0, %xmm2, %xmm2 vpslld \$1, %xmm0, %xmm0 vpslldq \$4, %xmm1, %xmm3 vpxor %xmm3, %xmm1, %xmm1 vpslldq \$4, %xmm3, %xmm3 vpxor %xmm3, %xmm1, %xmm1 vpslldq \$4, %xmm3, %xmm3 vpxor %xmm3, %xmm1, %xmm1 vpxor %xmm2, %xmm1, %xmm1 vmovdqa %xmm1, (%rsi) jne .Lks128_loop vmovdqa con2(%rip), %xmm0 vpshufb %xmm15, %xmm1, %xmm2 vaesenclast %xmm0, %xmm2, %xmm2 vpslld \$1, %xmm0, %xmm0 vpslldq \$4, %xmm1, %xmm3 vpxor %xmm3, %xmm1, %xmm1 vpslldq \$4, %xmm3, %xmm3 vpxor %xmm3, %xmm1, %xmm1 vpslldq \$4, %xmm3, %xmm3 vpxor %xmm3, %xmm1, %xmm1 vpxor %xmm2, %xmm1, %xmm1 vmovdqa %xmm1, 16(%rsi) vpshufb %xmm15, %xmm1, %xmm2 vaesenclast %xmm0, %xmm2, %xmm2 vpslldq \$4, %xmm1, %xmm3 vpxor %xmm3, %xmm1, %xmm1 vpslldq \$4, %xmm3, %xmm3 vpxor %xmm3, %xmm1, %xmm1 vpslldq \$4, %xmm3, %xmm3 vpxor %xmm3, %xmm1, %xmm1 vpxor %xmm2, %xmm1, %xmm1 vmovdqa %xmm1, 32(%rsi) ret .cfi_endproc .size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks ___ # void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key); # parameter 1: %rdi # parameter 2: %rsi $code.=<<___; .globl aes256gcmsiv_aes_ks .type aes256gcmsiv_aes_ks,\@function,2 .align 16 aes256gcmsiv_aes_ks: .cfi_startproc vmovdqu (%rdi), %xmm1 vmovdqu 16(%rdi), %xmm3 vmovdqa %xmm1, (%rsi) vmovdqa %xmm3, 16(%rsi) vmovdqa con1(%rip), %xmm0 vmovdqa mask(%rip), %xmm15 vpxor %xmm14, %xmm14, %xmm14 mov \$6, %rax .Lks256_loop: add \$32, %rsi subq \$1, %rax vpshufb %xmm15, %xmm3, %xmm2 vaesenclast %xmm0, %xmm2, %xmm2 vpslld \$1, %xmm0, %xmm0 vpsllq \$32, %xmm1, %xmm4 vpxor %xmm4, %xmm1, %xmm1 vpshufb con3(%rip), %xmm1, %xmm4 vpxor %xmm4, %xmm1, %xmm1 vpxor %xmm2, %xmm1, %xmm1 vmovdqa %xmm1, (%rsi) vpshufd \$0xff, %xmm1, %xmm2 vaesenclast %xmm14, %xmm2, %xmm2 vpsllq \$32, %xmm3, %xmm4 vpxor %xmm4, %xmm3, %xmm3 vpshufb con3(%rip), %xmm3, %xmm4 vpxor %xmm4, %xmm3, %xmm3 vpxor %xmm2, %xmm3, %xmm3 vmovdqa %xmm3, 16(%rsi) jne .Lks256_loop vpshufb %xmm15, %xmm3, %xmm2 vaesenclast %xmm0, %xmm2, %xmm2 vpsllq \$32, %xmm1, %xmm4 vpxor %xmm4, %xmm1, %xmm1 vpshufb con3(%rip), %xmm1, %xmm4 vpxor %xmm4, %xmm1, %xmm1 vpxor %xmm2, %xmm1, %xmm1 vmovdqa %xmm1, 32(%rsi) ret .cfi_endproc ___ sub aes128gcmsiv_aes_ks_enc_x1 { my $KS1_REGA = "%xmm1"; my $KS1_REGB = "%xmm2"; my $BLOCK1 = "%xmm4"; my $AUXREG = "%xmm3"; my $KS_BLOCK = sub { my ($reg, $reg2, $auxReg) = @_; return <<___; vpsllq \$32, $reg, $auxReg #!!saving mov instruction to xmm3 vpxor $auxReg, $reg, $reg vpshufb con3(%rip), $reg, $auxReg vpxor $auxReg, $reg, $reg vpxor $reg2, $reg, $reg ___ }; my $round = sub { my ($i, $j) = @_; return <<___; vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2 vaesenclast %xmm0, %xmm2, %xmm2 vpslld \$1, %xmm0, %xmm0 ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)} vaesenc %xmm1, $BLOCK1, $BLOCK1 vmovdqa %xmm1, ${\eval(16*$i)}($j) ___ }; my $roundlast = sub { my ($i, $j) = @_; return <<___; vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2 vaesenclast %xmm0, %xmm2, %xmm2 ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)} vaesenclast %xmm1, $BLOCK1, $BLOCK1 vmovdqa %xmm1, ${\eval(16*$i)}($j) ___ }; # parameter 1: %rdi Pointer to PT # parameter 2: %rsi Pointer to CT # parameter 4: %rdx Pointer to keys # parameter 5: %rcx Pointer to initial key $code.=<<___; .globl aes128gcmsiv_aes_ks_enc_x1 .type aes128gcmsiv_aes_ks_enc_x1,\@function,4 .align 16 aes128gcmsiv_aes_ks_enc_x1: .cfi_startproc vmovdqa (%rcx), %xmm1 # xmm1 = first 16 bytes of random key vmovdqa 0*16(%rdi), $BLOCK1 vmovdqa %xmm1, (%rdx) # KEY[0] = first 16 bytes of random key vpxor %xmm1, $BLOCK1, $BLOCK1 vmovdqa con1(%rip), %xmm0 # xmm0 = 1,1,1,1 vmovdqa mask(%rip), %xmm15 # xmm15 = mask ${\$round->(1, "%rdx")} ${\$round->(2, "%rdx")} ${\$round->(3, "%rdx")} ${\$round->(4, "%rdx")} ${\$round->(5, "%rdx")} ${\$round->(6, "%rdx")} ${\$round->(7, "%rdx")} ${\$round->(8, "%rdx")} vmovdqa con2(%rip), %xmm0 ${\$round->(9, "%rdx")} ${\$roundlast->(10, "%rdx")} vmovdqa $BLOCK1, 0*16(%rsi) ret .cfi_endproc .size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1 ___ } aes128gcmsiv_aes_ks_enc_x1(); sub aes128gcmsiv_kdf { my $BLOCK1 = "%xmm9"; my $BLOCK2 = "%xmm10"; my $BLOCK3 = "%xmm11"; my $BLOCK4 = "%xmm12"; my $BLOCK5 = "%xmm13"; my $BLOCK6 = "%xmm14"; my $ONE = "%xmm13"; my $KSp = "%rdx"; my $STATE_1 = "%xmm1"; my $enc_roundx4 = sub { my ($i, $j) = @_; return <<___; vmovdqa ${\eval($i*16)}(%rdx), $j vaesenc $j, $BLOCK1, $BLOCK1 vaesenc $j, $BLOCK2, $BLOCK2 vaesenc $j, $BLOCK3, $BLOCK3 vaesenc $j, $BLOCK4, $BLOCK4 ___ }; my $enc_roundlastx4 = sub { my ($i, $j) = @_; return <<___; vmovdqa ${\eval($i*16)}(%rdx), $j vaesenclast $j, $BLOCK1, $BLOCK1 vaesenclast $j, $BLOCK2, $BLOCK2 vaesenclast $j, $BLOCK3, $BLOCK3 vaesenclast $j, $BLOCK4, $BLOCK4 ___ }; # void aes128gcmsiv_kdf(const uint8_t nonce[16], # uint8_t *out_key_material, # const uint8_t *key_schedule); $code.=<<___; .globl aes128gcmsiv_kdf .type aes128gcmsiv_kdf,\@function,3 .align 16 aes128gcmsiv_kdf: .cfi_startproc # parameter 1: %rdi Pointer to NONCE # parameter 2: %rsi Pointer to CT # parameter 4: %rdx Pointer to keys vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key vmovdqa 0*16(%rdi), $BLOCK1 vmovdqa and_mask(%rip), $BLOCK4 vmovdqa one(%rip), $ONE vpshufd \$0x90, $BLOCK1, $BLOCK1 vpand $BLOCK4, $BLOCK1, $BLOCK1 vpaddd $ONE, $BLOCK1, $BLOCK2 vpaddd $ONE, $BLOCK2, $BLOCK3 vpaddd $ONE, $BLOCK3, $BLOCK4 vpxor %xmm1, $BLOCK1, $BLOCK1 vpxor %xmm1, $BLOCK2, $BLOCK2 vpxor %xmm1, $BLOCK3, $BLOCK3 vpxor %xmm1, $BLOCK4, $BLOCK4 ${\$enc_roundx4->(1, "%xmm1")} ${\$enc_roundx4->(2, "%xmm2")} ${\$enc_roundx4->(3, "%xmm1")} ${\$enc_roundx4->(4, "%xmm2")} ${\$enc_roundx4->(5, "%xmm1")} ${\$enc_roundx4->(6, "%xmm2")} ${\$enc_roundx4->(7, "%xmm1")} ${\$enc_roundx4->(8, "%xmm2")} ${\$enc_roundx4->(9, "%xmm1")} ${\$enc_roundlastx4->(10, "%xmm2")} vmovdqa $BLOCK1, 0*16(%rsi) vmovdqa $BLOCK2, 1*16(%rsi) vmovdqa $BLOCK3, 2*16(%rsi) vmovdqa $BLOCK4, 3*16(%rsi) ret .cfi_endproc .size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf ___ } aes128gcmsiv_kdf(); sub aes128gcmsiv_enc_msg_x4 { my $CTR1 = "%xmm0"; my $CTR2 = "%xmm1"; my $CTR3 = "%xmm2"; my $CTR4 = "%xmm3"; my $ADDER = "%xmm4"; my $STATE1 = "%xmm5"; my $STATE2 = "%xmm6"; my $STATE3 = "%xmm7"; my $STATE4 = "%xmm8"; my $TMP = "%xmm12"; my $TMP2 = "%xmm13"; my $TMP3 = "%xmm14"; my $IV = "%xmm15"; my $PT = "%rdi"; my $CT = "%rsi"; my $TAG = "%rdx"; my $KS = "%rcx"; my $LEN = "%r8"; my $aes_round = sub { my ($i) = @_; return <<___; vmovdqu ${\eval($i*16)}($KS), $TMP vaesenc $TMP, $STATE1, $STATE1 vaesenc $TMP, $STATE2, $STATE2 vaesenc $TMP, $STATE3, $STATE3 vaesenc $TMP, $STATE4, $STATE4 ___ }; my $aes_lastround = sub { my ($i) = @_; return <<___; vmovdqu ${\eval($i*16)}($KS), $TMP vaesenclast $TMP, $STATE1, $STATE1 vaesenclast $TMP, $STATE2, $STATE2 vaesenclast $TMP, $STATE3, $STATE3 vaesenclast $TMP, $STATE4, $STATE4 ___ }; # void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT, # unsigned char* TAG, unsigned char* KS, # size_t byte_len); # parameter 1: %rdi #PT # parameter 2: %rsi #CT # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32] # parameter 4: %rcx #KS # parameter 5: %r8 #LEN MSG_length in bytes $code.=<<___; .globl aes128gcmsiv_enc_msg_x4 .type aes128gcmsiv_enc_msg_x4,\@function,5 .align 16 aes128gcmsiv_enc_msg_x4: .cfi_startproc test $LEN, $LEN jnz .L128_enc_msg_x4_start ret .L128_enc_msg_x4_start: pushq %r12 .cfi_push %r12 pushq %r13 .cfi_push %r13 shrq \$4, $LEN # LEN = num of blocks movq $LEN, %r10 shlq \$62, %r10 shrq \$62, %r10 # make IV from TAG vmovdqa ($TAG), $IV vpor OR_MASK(%rip), $IV, $IV #IV = [1]TAG[126...32][00..00] vmovdqu four(%rip), $ADDER # Register to increment counters vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00] vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01] vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02] vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03] shrq \$2, $LEN je .L128_enc_msg_x4_check_remainder subq \$64, $CT subq \$64, $PT .L128_enc_msg_x4_loop1: addq \$64, $CT addq \$64, $PT vmovdqa $CTR1, $STATE1 vmovdqa $CTR2, $STATE2 vmovdqa $CTR3, $STATE3 vmovdqa $CTR4, $STATE4 vpxor ($KS), $STATE1, $STATE1 vpxor ($KS), $STATE2, $STATE2 vpxor ($KS), $STATE3, $STATE3 vpxor ($KS), $STATE4, $STATE4 ${\$aes_round->(1)} vpaddd $ADDER, $CTR1, $CTR1 ${\$aes_round->(2)} vpaddd $ADDER, $CTR2, $CTR2 ${\$aes_round->(3)} vpaddd $ADDER, $CTR3, $CTR3 ${\$aes_round->(4)} vpaddd $ADDER, $CTR4, $CTR4 ${\$aes_round->(5)} ${\$aes_round->(6)} ${\$aes_round->(7)} ${\$aes_round->(8)} ${\$aes_round->(9)} ${\$aes_lastround->(10)} # XOR with Plaintext vpxor 0*16($PT), $STATE1, $STATE1 vpxor 1*16($PT), $STATE2, $STATE2 vpxor 2*16($PT), $STATE3, $STATE3 vpxor 3*16($PT), $STATE4, $STATE4 subq \$1, $LEN vmovdqu $STATE1, 0*16($CT) vmovdqu $STATE2, 1*16($CT) vmovdqu $STATE3, 2*16($CT) vmovdqu $STATE4, 3*16($CT) jne .L128_enc_msg_x4_loop1 addq \$64,$CT addq \$64,$PT .L128_enc_msg_x4_check_remainder: cmpq \$0, %r10 je .L128_enc_msg_x4_out .L128_enc_msg_x4_loop2: # enc each block separately # CTR1 is the highest counter (even if no LOOP done) vmovdqa $CTR1, $STATE1 vpaddd one(%rip), $CTR1, $CTR1 # inc counter vpxor ($KS), $STATE1, $STATE1 vaesenc 16($KS), $STATE1, $STATE1 vaesenc 32($KS), $STATE1, $STATE1 vaesenc 48($KS), $STATE1, $STATE1 vaesenc 64($KS), $STATE1, $STATE1 vaesenc 80($KS), $STATE1, $STATE1 vaesenc 96($KS), $STATE1, $STATE1 vaesenc 112($KS), $STATE1, $STATE1 vaesenc 128($KS), $STATE1, $STATE1 vaesenc 144($KS), $STATE1, $STATE1 vaesenclast 160($KS), $STATE1, $STATE1 # XOR with plaintext vpxor ($PT), $STATE1, $STATE1 vmovdqu $STATE1, ($CT) addq \$16, $PT addq \$16, $CT subq \$1, %r10 jne .L128_enc_msg_x4_loop2 .L128_enc_msg_x4_out: popq %r13 .cfi_pop %r13 popq %r12 .cfi_pop %r12 ret .cfi_endproc .size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4 ___ } aes128gcmsiv_enc_msg_x4(); sub aes128gcmsiv_enc_msg_x8 { my $STATE1 = "%xmm1"; my $STATE2 = "%xmm2"; my $STATE3 = "%xmm3"; my $STATE4 = "%xmm4"; my $STATE5 = "%xmm5"; my $STATE6 = "%xmm6"; my $STATE7 = "%xmm7"; my $STATE8 = "%xmm8"; my $CTR1 = "%xmm0"; my $CTR2 = "%xmm9"; my $CTR3 = "%xmm10"; my $CTR4 = "%xmm11"; my $CTR5 = "%xmm12"; my $CTR6 = "%xmm13"; my $CTR7 = "%xmm14"; my $SCHED = "%xmm15"; my $TMP1 = "%xmm1"; my $TMP2 = "%xmm2"; my $PT = "%rdi"; my $CT = "%rsi"; my $TAG = "%rdx"; my $KS = "%rcx"; my $LEN = "%r8"; my $aes_round8 = sub { my ($i) = @_; return <<___; vmovdqu ${\eval($i*16)}($KS), $SCHED vaesenc $SCHED, $STATE1, $STATE1 vaesenc $SCHED, $STATE2, $STATE2 vaesenc $SCHED, $STATE3, $STATE3 vaesenc $SCHED, $STATE4, $STATE4 vaesenc $SCHED, $STATE5, $STATE5 vaesenc $SCHED, $STATE6, $STATE6 vaesenc $SCHED, $STATE7, $STATE7 vaesenc $SCHED, $STATE8, $STATE8 ___ }; my $aes_lastround8 = sub { my ($i) = @_; return <<___; vmovdqu ${\eval($i*16)}($KS), $SCHED vaesenclast $SCHED, $STATE1, $STATE1 vaesenclast $SCHED, $STATE2, $STATE2 vaesenclast $SCHED, $STATE3, $STATE3 vaesenclast $SCHED, $STATE4, $STATE4 vaesenclast $SCHED, $STATE5, $STATE5 vaesenclast $SCHED, $STATE6, $STATE6 vaesenclast $SCHED, $STATE7, $STATE7 vaesenclast $SCHED, $STATE8, $STATE8 ___ }; # void ENC_MSG_x8(unsigned char* PT, # unsigned char* CT, # unsigned char* TAG, # unsigned char* KS, # size_t byte_len); # parameter 1: %rdi #PT # parameter 2: %rsi #CT # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32] # parameter 4: %rcx #KS # parameter 5: %r8 #LEN MSG_length in bytes $code.=<<___; .globl aes128gcmsiv_enc_msg_x8 .type aes128gcmsiv_enc_msg_x8,\@function,5 .align 16 aes128gcmsiv_enc_msg_x8: .cfi_startproc test $LEN, $LEN jnz .L128_enc_msg_x8_start ret .L128_enc_msg_x8_start: pushq %r12 .cfi_push %r12 pushq %r13 .cfi_push %r13 pushq %rbp .cfi_push %rbp movq %rsp, %rbp .cfi_def_cfa_register rbp # Place in stack subq \$128, %rsp andq \$-64, %rsp shrq \$4, $LEN # LEN = num of blocks movq $LEN, %r10 shlq \$61, %r10 shrq \$61, %r10 # make IV from TAG vmovdqu ($TAG), $TMP1 vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00] # store counter8 in the stack vpaddd seven(%rip), $TMP1, $CTR1 vmovdqu $CTR1, (%rsp) # CTR8 = TAG[127...32][00..07] vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01] vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02] vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03] vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04] vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05] vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06] vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00] shrq \$3, $LEN je .L128_enc_msg_x8_check_remainder subq \$128, $CT subq \$128, $PT .L128_enc_msg_x8_loop1: addq \$128, $CT addq \$128, $PT vmovdqa $CTR1, $STATE1 vmovdqa $CTR2, $STATE2 vmovdqa $CTR3, $STATE3 vmovdqa $CTR4, $STATE4 vmovdqa $CTR5, $STATE5 vmovdqa $CTR6, $STATE6 vmovdqa $CTR7, $STATE7 # move from stack vmovdqu (%rsp), $STATE8 vpxor ($KS), $STATE1, $STATE1 vpxor ($KS), $STATE2, $STATE2 vpxor ($KS), $STATE3, $STATE3 vpxor ($KS), $STATE4, $STATE4 vpxor ($KS), $STATE5, $STATE5 vpxor ($KS), $STATE6, $STATE6 vpxor ($KS), $STATE7, $STATE7 vpxor ($KS), $STATE8, $STATE8 ${\$aes_round8->(1)} vmovdqu (%rsp), $CTR7 # deal with CTR8 vpaddd eight(%rip), $CTR7, $CTR7 vmovdqu $CTR7, (%rsp) ${\$aes_round8->(2)} vpsubd one(%rip), $CTR7, $CTR7 ${\$aes_round8->(3)} vpaddd eight(%rip), $CTR1, $CTR1 ${\$aes_round8->(4)} vpaddd eight(%rip), $CTR2, $CTR2 ${\$aes_round8->(5)} vpaddd eight(%rip), $CTR3, $CTR3 ${\$aes_round8->(6)} vpaddd eight(%rip), $CTR4, $CTR4 ${\$aes_round8->(7)} vpaddd eight(%rip), $CTR5, $CTR5 ${\$aes_round8->(8)} vpaddd eight(%rip), $CTR6, $CTR6 ${\$aes_round8->(9)} ${\$aes_lastround8->(10)} # XOR with Plaintext vpxor 0*16($PT), $STATE1, $STATE1 vpxor 1*16($PT), $STATE2, $STATE2 vpxor 2*16($PT), $STATE3, $STATE3 vpxor 3*16($PT), $STATE4, $STATE4 vpxor 4*16($PT), $STATE5, $STATE5 vpxor 5*16($PT), $STATE6, $STATE6 vpxor 6*16($PT), $STATE7, $STATE7 vpxor 7*16($PT), $STATE8, $STATE8 dec $LEN vmovdqu $STATE1, 0*16($CT) vmovdqu $STATE2, 1*16($CT) vmovdqu $STATE3, 2*16($CT) vmovdqu $STATE4, 3*16($CT) vmovdqu $STATE5, 4*16($CT) vmovdqu $STATE6, 5*16($CT) vmovdqu $STATE7, 6*16($CT) vmovdqu $STATE8, 7*16($CT) jne .L128_enc_msg_x8_loop1 addq \$128, $CT addq \$128, $PT .L128_enc_msg_x8_check_remainder: cmpq \$0, %r10 je .L128_enc_msg_x8_out .L128_enc_msg_x8_loop2: # enc each block separately # CTR1 is the highest counter (even if no LOOP done) vmovdqa $CTR1, $STATE1 vpaddd one(%rip), $CTR1, $CTR1 # inc counter vpxor ($KS), $STATE1, $STATE1 vaesenc 16($KS), $STATE1, $STATE1 vaesenc 32($KS), $STATE1, $STATE1 vaesenc 48($KS), $STATE1, $STATE1 vaesenc 64($KS), $STATE1, $STATE1 vaesenc 80($KS), $STATE1, $STATE1 vaesenc 96($KS), $STATE1, $STATE1 vaesenc 112($KS), $STATE1, $STATE1 vaesenc 128($KS), $STATE1, $STATE1 vaesenc 144($KS), $STATE1, $STATE1 vaesenclast 160($KS), $STATE1, $STATE1 # XOR with Plaintext vpxor ($PT), $STATE1, $STATE1 vmovdqu $STATE1, ($CT) addq \$16, $PT addq \$16, $CT decq %r10 jne .L128_enc_msg_x8_loop2 .L128_enc_msg_x8_out: movq %rbp, %rsp .cfi_def_cfa_register %rsp popq %rbp .cfi_pop %rbp popq %r13 .cfi_pop %r13 popq %r12 .cfi_pop %r12 ret .cfi_endproc .size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8 ___ } aes128gcmsiv_enc_msg_x8(); sub aesgcmsiv_dec { my ($aes256) = @_; my $T = "%xmm0"; my $TMP0 = "%xmm1"; my $TMP1 = "%xmm2"; my $TMP2 = "%xmm3"; my $TMP3 = "%xmm4"; my $TMP4 = "%xmm5"; my $TMP5 = "%xmm6"; my $CTR1 = "%xmm7"; my $CTR2 = "%xmm8"; my $CTR3 = "%xmm9"; my $CTR4 = "%xmm10"; my $CTR5 = "%xmm11"; my $CTR6 = "%xmm12"; my $CTR = "%xmm15"; my $CT = "%rdi"; my $PT = "%rsi"; my $POL = "%rdx"; my $Htbl = "%rcx"; my $KS = "%r8"; my $LEN = "%r9"; my $secureBuffer = "%rax"; my $HTABLE_ROUNDS = "%xmm13"; my $labelPrefix = "128"; if ($aes256) { $labelPrefix = "256"; } my $aes_round_dec = sub { my ($i) = @_; return <<___; vmovdqu ${\eval($i*16)}($KS), $TMP3 vaesenc $TMP3, $CTR1, $CTR1 vaesenc $TMP3, $CTR2, $CTR2 vaesenc $TMP3, $CTR3, $CTR3 vaesenc $TMP3, $CTR4, $CTR4 vaesenc $TMP3, $CTR5, $CTR5 vaesenc $TMP3, $CTR6, $CTR6 ___ }; my $aes_lastround_dec = sub { my ($i) = @_; return <<___; vmovdqu ${\eval($i*16)}($KS), $TMP3 vaesenclast $TMP3, $CTR1, $CTR1 vaesenclast $TMP3, $CTR2, $CTR2 vaesenclast $TMP3, $CTR3, $CTR3 vaesenclast $TMP3, $CTR4, $CTR4 vaesenclast $TMP3, $CTR5, $CTR5 vaesenclast $TMP3, $CTR6, $CTR6 ___ }; my $schoolbook = sub { my ($i) = @_; return <<___; vmovdqu ${\eval($i*16-32)}($secureBuffer), $TMP5 vmovdqu ${\eval($i*16-32)}($Htbl), $HTABLE_ROUNDS vpclmulqdq \$0x10, $HTABLE_ROUNDS, $TMP5, $TMP3 vpxor $TMP3, $TMP0, $TMP0 vpclmulqdq \$0x11, $HTABLE_ROUNDS, $TMP5, $TMP3 vpxor $TMP3, $TMP1, $TMP1 vpclmulqdq \$0x00, $HTABLE_ROUNDS, $TMP5, $TMP3 vpxor $TMP3, $TMP2, $TMP2 vpclmulqdq \$0x01, $HTABLE_ROUNDS, $TMP5, $TMP3 vpxor $TMP3, $TMP0, $TMP0 ___ }; if ($aes256) { $code.=<<___; .globl aes256gcmsiv_dec .type aes256gcmsiv_dec,\@function,6 .align 16 aes256gcmsiv_dec: ___ } else { $code.=<<___; .globl aes128gcmsiv_dec .type aes128gcmsiv_dec,\@function,6 .align 16 aes128gcmsiv_dec: ___ } $code.=<<___; .cfi_startproc test \$~15, $LEN jnz .L${labelPrefix}_dec_start ret .L${labelPrefix}_dec_start: vzeroupper vmovdqa ($POL), $T movq $POL, $secureBuffer leaq 32($secureBuffer), $secureBuffer leaq 32($Htbl), $Htbl # make CTRBLKs from given tag. vmovdqu ($CT,$LEN), $CTR vpor OR_MASK(%rip), $CTR, $CTR # CTR = [1]TAG[126...32][00..00] andq \$~15, $LEN # If less then 6 blocks, make singles cmp \$96, $LEN jb .L${labelPrefix}_dec_loop2 # Decrypt the first six blocks sub \$96, $LEN vmovdqa $CTR, $CTR1 vpaddd one(%rip), $CTR1, $CTR2 vpaddd two(%rip), $CTR1, $CTR3 vpaddd one(%rip), $CTR3, $CTR4 vpaddd two(%rip), $CTR3, $CTR5 vpaddd one(%rip), $CTR5, $CTR6 vpaddd two(%rip), $CTR5, $CTR vpxor ($KS), $CTR1, $CTR1 vpxor ($KS), $CTR2, $CTR2 vpxor ($KS), $CTR3, $CTR3 vpxor ($KS), $CTR4, $CTR4 vpxor ($KS), $CTR5, $CTR5 vpxor ($KS), $CTR6, $CTR6 ${\$aes_round_dec->(1)} ${\$aes_round_dec->(2)} ${\$aes_round_dec->(3)} ${\$aes_round_dec->(4)} ${\$aes_round_dec->(5)} ${\$aes_round_dec->(6)} ${\$aes_round_dec->(7)} ${\$aes_round_dec->(8)} ${\$aes_round_dec->(9)} ___ if ($aes256) { $code.=<<___; ${\$aes_round_dec->(10)} ${\$aes_round_dec->(11)} ${\$aes_round_dec->(12)} ${\$aes_round_dec->(13)} ${\$aes_lastround_dec->(14)} ___ } else { $code.=<<___; ${\$aes_lastround_dec->(10)} ___ } $code.=<<___; # XOR with CT vpxor 0*16($CT), $CTR1, $CTR1 vpxor 1*16($CT), $CTR2, $CTR2 vpxor 2*16($CT), $CTR3, $CTR3 vpxor 3*16($CT), $CTR4, $CTR4 vpxor 4*16($CT), $CTR5, $CTR5 vpxor 5*16($CT), $CTR6, $CTR6 vmovdqu $CTR1, 0*16($PT) vmovdqu $CTR2, 1*16($PT) vmovdqu $CTR3, 2*16($PT) vmovdqu $CTR4, 3*16($PT) vmovdqu $CTR5, 4*16($PT) vmovdqu $CTR6, 5*16($PT) addq \$96, $CT addq \$96, $PT jmp .L${labelPrefix}_dec_loop1 # Decrypt 6 blocks each time while hashing previous 6 blocks .align 64 .L${labelPrefix}_dec_loop1: cmp \$96, $LEN jb .L${labelPrefix}_dec_finish_96 sub \$96, $LEN vmovdqa $CTR6, $TMP5 vmovdqa $CTR5, 1*16-32($secureBuffer) vmovdqa $CTR4, 2*16-32($secureBuffer) vmovdqa $CTR3, 3*16-32($secureBuffer) vmovdqa $CTR2, 4*16-32($secureBuffer) vmovdqa $CTR1, 5*16-32($secureBuffer) vmovdqa $CTR, $CTR1 vpaddd one(%rip), $CTR1, $CTR2 vpaddd two(%rip), $CTR1, $CTR3 vpaddd one(%rip), $CTR3, $CTR4 vpaddd two(%rip), $CTR3, $CTR5 vpaddd one(%rip), $CTR5, $CTR6 vpaddd two(%rip), $CTR5, $CTR vmovdqa ($KS), $TMP3 vpxor $TMP3, $CTR1, $CTR1 vpxor $TMP3, $CTR2, $CTR2 vpxor $TMP3, $CTR3, $CTR3 vpxor $TMP3, $CTR4, $CTR4 vpxor $TMP3, $CTR5, $CTR5 vpxor $TMP3, $CTR6, $CTR6 vmovdqu 0*16-32($Htbl), $TMP3 vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1 vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2 vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0 vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3 vpxor $TMP3, $TMP0, $TMP0 ${\$aes_round_dec->(1)} ${\$schoolbook->(1)} ${\$aes_round_dec->(2)} ${\$schoolbook->(2)} ${\$aes_round_dec->(3)} ${\$schoolbook->(3)} ${\$aes_round_dec->(4)} ${\$schoolbook->(4)} ${\$aes_round_dec->(5)} ${\$aes_round_dec->(6)} ${\$aes_round_dec->(7)} vmovdqa 5*16-32($secureBuffer), $TMP5 vpxor $T, $TMP5, $TMP5 vmovdqu 5*16-32($Htbl), $TMP4 vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3 vpxor $TMP3, $TMP0, $TMP0 vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3 vpxor $TMP3, $TMP1, $TMP1 vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3 vpxor $TMP3, $TMP2, $TMP2 vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3 vpxor $TMP3, $TMP0, $TMP0 ${\$aes_round_dec->(8)} vpsrldq \$8, $TMP0, $TMP3 vpxor $TMP3, $TMP1, $TMP4 vpslldq \$8, $TMP0, $TMP3 vpxor $TMP3, $TMP2, $T vmovdqa poly(%rip), $TMP2 ${\$aes_round_dec->(9)} ___ if ($aes256) { $code.=<<___; ${\$aes_round_dec->(10)} ${\$aes_round_dec->(11)} ${\$aes_round_dec->(12)} ${\$aes_round_dec->(13)} vmovdqu 14*16($KS), $TMP5 ___ } else { $code.=<<___; vmovdqu 10*16($KS), $TMP5 ___ } $code.=<<___; vpalignr \$8, $T, $T, $TMP1 vpclmulqdq \$0x10, $TMP2, $T, $T vpxor $T, $TMP1, $T vpxor 0*16($CT), $TMP5, $TMP3 vaesenclast $TMP3, $CTR1, $CTR1 vpxor 1*16($CT), $TMP5, $TMP3 vaesenclast $TMP3, $CTR2, $CTR2 vpxor 2*16($CT), $TMP5, $TMP3 vaesenclast $TMP3, $CTR3, $CTR3 vpxor 3*16($CT), $TMP5, $TMP3 vaesenclast $TMP3, $CTR4, $CTR4 vpxor 4*16($CT), $TMP5, $TMP3 vaesenclast $TMP3, $CTR5, $CTR5 vpxor 5*16($CT), $TMP5, $TMP3 vaesenclast $TMP3, $CTR6, $CTR6 vpalignr \$8, $T, $T, $TMP1 vpclmulqdq \$0x10, $TMP2, $T, $T vpxor $T, $TMP1, $T vmovdqu $CTR1, 0*16($PT) vmovdqu $CTR2, 1*16($PT) vmovdqu $CTR3, 2*16($PT) vmovdqu $CTR4, 3*16($PT) vmovdqu $CTR5, 4*16($PT) vmovdqu $CTR6, 5*16($PT) vpxor $TMP4, $T, $T lea 96($CT), $CT lea 96($PT), $PT jmp .L${labelPrefix}_dec_loop1 .L${labelPrefix}_dec_finish_96: vmovdqa $CTR6, $TMP5 vmovdqa $CTR5, 1*16-32($secureBuffer) vmovdqa $CTR4, 2*16-32($secureBuffer) vmovdqa $CTR3, 3*16-32($secureBuffer) vmovdqa $CTR2, 4*16-32($secureBuffer) vmovdqa $CTR1, 5*16-32($secureBuffer) vmovdqu 0*16-32($Htbl), $TMP3 vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0 vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1 vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2 vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3 vpxor $TMP3, $TMP0, $TMP0 ${\$schoolbook->(1)} ${\$schoolbook->(2)} ${\$schoolbook->(3)} ${\$schoolbook->(4)} vmovdqu 5*16-32($secureBuffer), $TMP5 vpxor $T, $TMP5, $TMP5 vmovdqu 5*16-32($Htbl), $TMP4 vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3 vpxor $TMP3, $TMP1, $TMP1 vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3 vpxor $TMP3, $TMP2, $TMP2 vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3 vpxor $TMP3, $TMP0, $TMP0 vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3 vpxor $TMP3, $TMP0, $TMP0 vpsrldq \$8, $TMP0, $TMP3 vpxor $TMP3, $TMP1, $TMP4 vpslldq \$8, $TMP0, $TMP3 vpxor $TMP3, $TMP2, $T vmovdqa poly(%rip), $TMP2 vpalignr \$8, $T, $T, $TMP1 vpclmulqdq \$0x10, $TMP2, $T, $T vpxor $T, $TMP1, $T vpalignr \$8, $T, $T, $TMP1 vpclmulqdq \$0x10, $TMP2, $T, $T vpxor $T, $TMP1, $T vpxor $TMP4, $T, $T .L${labelPrefix}_dec_loop2: # Here we encrypt any remaining whole block # if there are no whole blocks cmp \$16, $LEN jb .L${labelPrefix}_dec_out sub \$16, $LEN vmovdqa $CTR, $TMP1 vpaddd one(%rip), $CTR, $CTR vpxor 0*16($KS), $TMP1, $TMP1 vaesenc 1*16($KS), $TMP1, $TMP1 vaesenc 2*16($KS), $TMP1, $TMP1 vaesenc 3*16($KS), $TMP1, $TMP1 vaesenc 4*16($KS), $TMP1, $TMP1 vaesenc 5*16($KS), $TMP1, $TMP1 vaesenc 6*16($KS), $TMP1, $TMP1 vaesenc 7*16($KS), $TMP1, $TMP1 vaesenc 8*16($KS), $TMP1, $TMP1 vaesenc 9*16($KS), $TMP1, $TMP1 ___ if ($aes256) { $code.=<<___; vaesenc 10*16($KS), $TMP1, $TMP1 vaesenc 11*16($KS), $TMP1, $TMP1 vaesenc 12*16($KS), $TMP1, $TMP1 vaesenc 13*16($KS), $TMP1, $TMP1 vaesenclast 14*16($KS), $TMP1, $TMP1 ___ } else { $code.=<<___; vaesenclast 10*16($KS), $TMP1, $TMP1 ___ } $code.=<<___; vpxor ($CT), $TMP1, $TMP1 vmovdqu $TMP1, ($PT) addq \$16, $CT addq \$16, $PT vpxor $TMP1, $T, $T vmovdqa -32($Htbl), $TMP0 call GFMUL jmp .L${labelPrefix}_dec_loop2 .L${labelPrefix}_dec_out: vmovdqu $T, ($POL) ret .cfi_endproc ___ if ($aes256) { $code.=<<___; .size aes256gcmsiv_dec, .-aes256gcmsiv_dec ___ } else { $code.=<<___; .size aes128gcmsiv_dec, .-aes128gcmsiv_dec ___ } } aesgcmsiv_dec(0); # emit 128-bit version sub aes128gcmsiv_ecb_enc_block { my $STATE_1 = "%xmm1"; my $KSp = "%rdx"; # parameter 1: PT %rdi (pointer to 128 bit) # parameter 2: CT %rsi (pointer to 128 bit) # parameter 3: ks %rdx (pointer to ks) $code.=<<___; .globl aes128gcmsiv_ecb_enc_block .type aes128gcmsiv_ecb_enc_block,\@function,3 .align 16 aes128gcmsiv_ecb_enc_block: .cfi_startproc vmovdqa (%rdi), $STATE_1 vpxor ($KSp), $STATE_1, $STATE_1 vaesenc 1*16($KSp), $STATE_1, $STATE_1 vaesenc 2*16($KSp), $STATE_1, $STATE_1 vaesenc 3*16($KSp), $STATE_1, $STATE_1 vaesenc 4*16($KSp), $STATE_1, $STATE_1 vaesenc 5*16($KSp), $STATE_1, $STATE_1 vaesenc 6*16($KSp), $STATE_1, $STATE_1 vaesenc 7*16($KSp), $STATE_1, $STATE_1 vaesenc 8*16($KSp), $STATE_1, $STATE_1 vaesenc 9*16($KSp), $STATE_1, $STATE_1 vaesenclast 10*16($KSp), $STATE_1, $STATE_1 # STATE_1 == IV vmovdqa $STATE_1, (%rsi) ret .cfi_endproc .size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block ___ } aes128gcmsiv_ecb_enc_block(); sub aes256gcmsiv_aes_ks_enc_x1 { my $KS = "%rdx"; my $KEYp = "%rcx"; my $CON_MASK = "%xmm0"; my $MASK_256 = "%xmm15"; my $KEY_1 = "%xmm1"; my $KEY_2 = "%xmm3"; my $BLOCK1 = "%xmm8"; my $AUX_REG = "%xmm14"; my $PT = "%rdi"; my $CT = "%rsi"; my $round_double = sub { my ($i, $j) = @_; return <<___; vpshufb %xmm15, %xmm3, %xmm2 vaesenclast %xmm0, %xmm2, %xmm2 vpslld \$1, %xmm0, %xmm0 vpslldq \$4, %xmm1, %xmm4 vpxor %xmm4, %xmm1, %xmm1 vpslldq \$4, %xmm4, %xmm4 vpxor %xmm4, %xmm1, %xmm1 vpslldq \$4, %xmm4, %xmm4 vpxor %xmm4, %xmm1, %xmm1 vpxor %xmm2, %xmm1, %xmm1 vaesenc %xmm1, $BLOCK1, $BLOCK1 vmovdqu %xmm1, ${\eval(16*$i)}($KS) vpshufd \$0xff, %xmm1, %xmm2 vaesenclast %xmm14, %xmm2, %xmm2 vpslldq \$4, %xmm3, %xmm4 vpxor %xmm4, %xmm3, %xmm3 vpslldq \$4, %xmm4, %xmm4 vpxor %xmm4, %xmm3, %xmm3 vpslldq \$4, %xmm4, %xmm4 vpxor %xmm4, %xmm3, %xmm3 vpxor %xmm2, %xmm3, %xmm3 vaesenc %xmm3, $BLOCK1, $BLOCK1 vmovdqu %xmm3, ${\eval(16*$j)}($KS) ___ }; my $round_last = sub { my ($i) = @_; return <<___; vpshufb %xmm15, %xmm3, %xmm2 vaesenclast %xmm0, %xmm2, %xmm2 vpslldq \$4, %xmm1, %xmm4 vpxor %xmm4, %xmm1, %xmm1 vpslldq \$4, %xmm4, %xmm4 vpxor %xmm4, %xmm1, %xmm1 vpslldq \$4, %xmm4, %xmm4 vpxor %xmm4, %xmm1, %xmm1 vpxor %xmm2, %xmm1, %xmm1 vaesenclast %xmm1, $BLOCK1, $BLOCK1 vmovdqu %xmm1, ${\eval(16*$i)}($KS) ___ }; # parameter 1: %rdi Pointer to PT1 # parameter 2: %rsi Pointer to CT1 # parameter 3: %rdx Pointer to KS # parameter 4: %rcx Pointer to initial key $code.=<<___; .globl aes256gcmsiv_aes_ks_enc_x1 .type aes256gcmsiv_aes_ks_enc_x1,\@function,4 .align 16 aes256gcmsiv_aes_ks_enc_x1: .cfi_startproc vmovdqa con1(%rip), $CON_MASK # CON_MASK = 1,1,1,1 vmovdqa mask(%rip), $MASK_256 # MASK_256 vmovdqa ($PT), $BLOCK1 vmovdqa ($KEYp), $KEY_1 # KEY_1 || KEY_2 [0..7] = user key vmovdqa 16($KEYp), $KEY_2 vpxor $KEY_1, $BLOCK1, $BLOCK1 vaesenc $KEY_2, $BLOCK1, $BLOCK1 vmovdqu $KEY_1, ($KS) # First round key vmovdqu $KEY_2, 16($KS) vpxor $AUX_REG, $AUX_REG, $AUX_REG ${\$round_double->(2, 3)} ${\$round_double->(4, 5)} ${\$round_double->(6, 7)} ${\$round_double->(8, 9)} ${\$round_double->(10, 11)} ${\$round_double->(12, 13)} ${\$round_last->(14)} vmovdqa $BLOCK1, ($CT) ret .cfi_endproc .size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1 ___ } aes256gcmsiv_aes_ks_enc_x1(); sub aes256gcmsiv_ecb_enc_block { my $STATE_1 = "%xmm1"; my $PT = "%rdi"; my $CT = "%rsi"; my $KSp = "%rdx"; # parameter 1: PT %rdi (pointer to 128 bit) # parameter 2: CT %rsi (pointer to 128 bit) # parameter 3: ks %rdx (pointer to ks) $code.=<<___; .globl aes256gcmsiv_ecb_enc_block .type aes256gcmsiv_ecb_enc_block,\@function,3 .align 16 aes256gcmsiv_ecb_enc_block: .cfi_startproc vmovdqa (%rdi), $STATE_1 vpxor ($KSp), $STATE_1, $STATE_1 vaesenc 1*16($KSp), $STATE_1, $STATE_1 vaesenc 2*16($KSp), $STATE_1, $STATE_1 vaesenc 3*16($KSp), $STATE_1, $STATE_1 vaesenc 4*16($KSp), $STATE_1, $STATE_1 vaesenc 5*16($KSp), $STATE_1, $STATE_1 vaesenc 6*16($KSp), $STATE_1, $STATE_1 vaesenc 7*16($KSp), $STATE_1, $STATE_1 vaesenc 8*16($KSp), $STATE_1, $STATE_1 vaesenc 9*16($KSp), $STATE_1, $STATE_1 vaesenc 10*16($KSp), $STATE_1, $STATE_1 vaesenc 11*16($KSp), $STATE_1, $STATE_1 vaesenc 12*16($KSp), $STATE_1, $STATE_1 vaesenc 13*16($KSp), $STATE_1, $STATE_1 vaesenclast 14*16($KSp), $STATE_1, $STATE_1 # $STATE_1 == IV vmovdqa $STATE_1, (%rsi) ret .cfi_endproc .size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block ___ } aes256gcmsiv_ecb_enc_block(); sub aes256gcmsiv_enc_msg_x4 { my $CTR1 = "%xmm0"; my $CTR2 = "%xmm1"; my $CTR3 = "%xmm2"; my $CTR4 = "%xmm3"; my $ADDER = "%xmm4"; my $STATE1 = "%xmm5"; my $STATE2 = "%xmm6"; my $STATE3 = "%xmm7"; my $STATE4 = "%xmm8"; my $TMP = "%xmm12"; my $TMP2 = "%xmm13"; my $TMP3 = "%xmm14"; my $IV = "%xmm15"; my $PT = "%rdi"; my $CT = "%rsi"; my $TAG = "%rdx"; my $KS = "%rcx"; my $LEN = "%r8"; my $aes_round = sub { my ($i) = @_; return <<___; vmovdqu ${\eval($i*16)}($KS), $TMP vaesenc $TMP, $STATE1, $STATE1 vaesenc $TMP, $STATE2, $STATE2 vaesenc $TMP, $STATE3, $STATE3 vaesenc $TMP, $STATE4, $STATE4 ___ }; my $aes_lastround = sub { my ($i) = @_; return <<___; vmovdqu ${\eval($i*16)}($KS), $TMP vaesenclast $TMP, $STATE1, $STATE1 vaesenclast $TMP, $STATE2, $STATE2 vaesenclast $TMP, $STATE3, $STATE3 vaesenclast $TMP, $STATE4, $STATE4 ___ }; # void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT, # unsigned char* TAG, unsigned char* KS, # size_t byte_len); # parameter 1: %rdi #PT # parameter 2: %rsi #CT # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32] # parameter 4: %rcx #KS # parameter 5: %r8 #LEN MSG_length in bytes $code.=<<___; .globl aes256gcmsiv_enc_msg_x4 .type aes256gcmsiv_enc_msg_x4,\@function,5 .align 16 aes256gcmsiv_enc_msg_x4: .cfi_startproc test $LEN, $LEN jnz .L256_enc_msg_x4_start ret .L256_enc_msg_x4_start: movq $LEN, %r10 shrq \$4, $LEN # LEN = num of blocks shlq \$60, %r10 jz .L256_enc_msg_x4_start2 addq \$1, $LEN .L256_enc_msg_x4_start2: movq $LEN, %r10 shlq \$62, %r10 shrq \$62, %r10 # make IV from TAG vmovdqa ($TAG), $IV vpor OR_MASK(%rip), $IV, $IV # IV = [1]TAG[126...32][00..00] vmovdqa four(%rip), $ADDER # Register to increment counters vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00] vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01] vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02] vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03] shrq \$2, $LEN je .L256_enc_msg_x4_check_remainder subq \$64, $CT subq \$64, $PT .L256_enc_msg_x4_loop1: addq \$64, $CT addq \$64, $PT vmovdqa $CTR1, $STATE1 vmovdqa $CTR2, $STATE2 vmovdqa $CTR3, $STATE3 vmovdqa $CTR4, $STATE4 vpxor ($KS), $STATE1, $STATE1 vpxor ($KS), $STATE2, $STATE2 vpxor ($KS), $STATE3, $STATE3 vpxor ($KS), $STATE4, $STATE4 ${\$aes_round->(1)} vpaddd $ADDER, $CTR1, $CTR1 ${\$aes_round->(2)} vpaddd $ADDER, $CTR2, $CTR2 ${\$aes_round->(3)} vpaddd $ADDER, $CTR3, $CTR3 ${\$aes_round->(4)} vpaddd $ADDER, $CTR4, $CTR4 ${\$aes_round->(5)} ${\$aes_round->(6)} ${\$aes_round->(7)} ${\$aes_round->(8)} ${\$aes_round->(9)} ${\$aes_round->(10)} ${\$aes_round->(11)} ${\$aes_round->(12)} ${\$aes_round->(13)} ${\$aes_lastround->(14)} # XOR with Plaintext vpxor 0*16($PT), $STATE1, $STATE1 vpxor 1*16($PT), $STATE2, $STATE2 vpxor 2*16($PT), $STATE3, $STATE3 vpxor 3*16($PT), $STATE4, $STATE4 subq \$1, $LEN vmovdqu $STATE1, 0*16($CT) vmovdqu $STATE2, 1*16($CT) vmovdqu $STATE3, 2*16($CT) vmovdqu $STATE4, 3*16($CT) jne .L256_enc_msg_x4_loop1 addq \$64, $CT addq \$64, $PT .L256_enc_msg_x4_check_remainder: cmpq \$0, %r10 je .L256_enc_msg_x4_out .L256_enc_msg_x4_loop2: # encrypt each block separately # CTR1 is the highest counter (even if no LOOP done) vmovdqa $CTR1, $STATE1 vpaddd one(%rip), $CTR1, $CTR1 # inc counter vpxor ($KS), $STATE1, $STATE1 vaesenc 16($KS), $STATE1, $STATE1 vaesenc 32($KS), $STATE1, $STATE1 vaesenc 48($KS), $STATE1, $STATE1 vaesenc 64($KS), $STATE1, $STATE1 vaesenc 80($KS), $STATE1, $STATE1 vaesenc 96($KS), $STATE1, $STATE1 vaesenc 112($KS), $STATE1, $STATE1 vaesenc 128($KS), $STATE1, $STATE1 vaesenc 144($KS), $STATE1, $STATE1 vaesenc 160($KS), $STATE1, $STATE1 vaesenc 176($KS), $STATE1, $STATE1 vaesenc 192($KS), $STATE1, $STATE1 vaesenc 208($KS), $STATE1, $STATE1 vaesenclast 224($KS), $STATE1, $STATE1 # XOR with Plaintext vpxor ($PT), $STATE1, $STATE1 vmovdqu $STATE1, ($CT) addq \$16, $PT addq \$16, $CT subq \$1, %r10 jne .L256_enc_msg_x4_loop2 .L256_enc_msg_x4_out: ret .cfi_endproc .size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4 ___ } aes256gcmsiv_enc_msg_x4(); sub aes256gcmsiv_enc_msg_x8() { my $STATE1 = "%xmm1"; my $STATE2 = "%xmm2"; my $STATE3 = "%xmm3"; my $STATE4 = "%xmm4"; my $STATE5 = "%xmm5"; my $STATE6 = "%xmm6"; my $STATE7 = "%xmm7"; my $STATE8 = "%xmm8"; my $CTR1 = "%xmm0"; my $CTR2 = "%xmm9"; my $CTR3 = "%xmm10"; my $CTR4 = "%xmm11"; my $CTR5 = "%xmm12"; my $CTR6 = "%xmm13"; my $CTR7 = "%xmm14"; my $TMP1 = "%xmm1"; my $TMP2 = "%xmm2"; my $KS = "%rcx"; my $LEN = "%r8"; my $PT = "%rdi"; my $CT = "%rsi"; my $TAG = "%rdx"; my $SCHED = "%xmm15"; my $aes_round8 = sub { my ($i) = @_; return <<___; vmovdqu ${\eval($i*16)}($KS), $SCHED vaesenc $SCHED, $STATE1, $STATE1 vaesenc $SCHED, $STATE2, $STATE2 vaesenc $SCHED, $STATE3, $STATE3 vaesenc $SCHED, $STATE4, $STATE4 vaesenc $SCHED, $STATE5, $STATE5 vaesenc $SCHED, $STATE6, $STATE6 vaesenc $SCHED, $STATE7, $STATE7 vaesenc $SCHED, $STATE8, $STATE8 ___ }; my $aes_lastround8 = sub { my ($i) = @_; return <<___; vmovdqu ${\eval($i*16)}($KS), $SCHED vaesenclast $SCHED, $STATE1, $STATE1 vaesenclast $SCHED, $STATE2, $STATE2 vaesenclast $SCHED, $STATE3, $STATE3 vaesenclast $SCHED, $STATE4, $STATE4 vaesenclast $SCHED, $STATE5, $STATE5 vaesenclast $SCHED, $STATE6, $STATE6 vaesenclast $SCHED, $STATE7, $STATE7 vaesenclast $SCHED, $STATE8, $STATE8 ___ }; # void ENC_MSG_x8(unsigned char* PT, # unsigned char* CT, # unsigned char* TAG, # unsigned char* KS, # size_t byte_len); # parameter 1: %rdi #PT # parameter 2: %rsi #CT # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32] # parameter 4: %rcx #KS # parameter 5: %r8 #LEN MSG_length in bytes $code.=<<___; .globl aes256gcmsiv_enc_msg_x8 .type aes256gcmsiv_enc_msg_x8,\@function,5 .align 16 aes256gcmsiv_enc_msg_x8: .cfi_startproc test $LEN, $LEN jnz .L256_enc_msg_x8_start ret .L256_enc_msg_x8_start: # Place in stack movq %rsp, %r11 subq \$16, %r11 andq \$-64, %r11 movq $LEN, %r10 shrq \$4, $LEN # LEN = num of blocks shlq \$60, %r10 jz .L256_enc_msg_x8_start2 addq \$1, $LEN .L256_enc_msg_x8_start2: movq $LEN, %r10 shlq \$61, %r10 shrq \$61, %r10 # Make IV from TAG vmovdqa ($TAG), $TMP1 vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00] # store counter8 on the stack vpaddd seven(%rip), $TMP1, $CTR1 vmovdqa $CTR1, (%r11) # CTR8 = TAG[127...32][00..07] vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01] vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02] vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03] vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04] vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05] vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06] vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00] shrq \$3, $LEN jz .L256_enc_msg_x8_check_remainder subq \$128, $CT subq \$128, $PT .L256_enc_msg_x8_loop1: addq \$128, $CT addq \$128, $PT vmovdqa $CTR1, $STATE1 vmovdqa $CTR2, $STATE2 vmovdqa $CTR3, $STATE3 vmovdqa $CTR4, $STATE4 vmovdqa $CTR5, $STATE5 vmovdqa $CTR6, $STATE6 vmovdqa $CTR7, $STATE7 # move from stack vmovdqa (%r11), $STATE8 vpxor ($KS), $STATE1, $STATE1 vpxor ($KS), $STATE2, $STATE2 vpxor ($KS), $STATE3, $STATE3 vpxor ($KS), $STATE4, $STATE4 vpxor ($KS), $STATE5, $STATE5 vpxor ($KS), $STATE6, $STATE6 vpxor ($KS), $STATE7, $STATE7 vpxor ($KS), $STATE8, $STATE8 ${\$aes_round8->(1)} vmovdqa (%r11), $CTR7 # deal with CTR8 vpaddd eight(%rip), $CTR7, $CTR7 vmovdqa $CTR7, (%r11) ${\$aes_round8->(2)} vpsubd one(%rip), $CTR7, $CTR7 ${\$aes_round8->(3)} vpaddd eight(%rip), $CTR1, $CTR1 ${\$aes_round8->(4)} vpaddd eight(%rip), $CTR2, $CTR2 ${\$aes_round8->(5)} vpaddd eight(%rip), $CTR3, $CTR3 ${\$aes_round8->(6)} vpaddd eight(%rip), $CTR4, $CTR4 ${\$aes_round8->(7)} vpaddd eight(%rip), $CTR5, $CTR5 ${\$aes_round8->(8)} vpaddd eight(%rip), $CTR6, $CTR6 ${\$aes_round8->(9)} ${\$aes_round8->(10)} ${\$aes_round8->(11)} ${\$aes_round8->(12)} ${\$aes_round8->(13)} ${\$aes_lastround8->(14)} # XOR with Plaintext vpxor 0*16($PT), $STATE1, $STATE1 vpxor 1*16($PT), $STATE2, $STATE2 vpxor 2*16($PT), $STATE3, $STATE3 vpxor 3*16($PT), $STATE4, $STATE4 vpxor 4*16($PT), $STATE5, $STATE5 vpxor 5*16($PT), $STATE6, $STATE6 vpxor 6*16($PT), $STATE7, $STATE7 vpxor 7*16($PT), $STATE8, $STATE8 subq \$1, $LEN vmovdqu $STATE1, 0*16($CT) vmovdqu $STATE2, 1*16($CT) vmovdqu $STATE3, 2*16($CT) vmovdqu $STATE4, 3*16($CT) vmovdqu $STATE5, 4*16($CT) vmovdqu $STATE6, 5*16($CT) vmovdqu $STATE7, 6*16($CT) vmovdqu $STATE8, 7*16($CT) jne .L256_enc_msg_x8_loop1 addq \$128, $CT addq \$128, $PT .L256_enc_msg_x8_check_remainder: cmpq \$0, %r10 je .L256_enc_msg_x8_out .L256_enc_msg_x8_loop2: # encrypt each block separately # CTR1 is the highest counter (even if no LOOP done) vmovdqa $CTR1, $STATE1 vpaddd one(%rip), $CTR1, $CTR1 vpxor ($KS), $STATE1, $STATE1 vaesenc 16($KS), $STATE1, $STATE1 vaesenc 32($KS), $STATE1, $STATE1 vaesenc 48($KS), $STATE1, $STATE1 vaesenc 64($KS), $STATE1, $STATE1 vaesenc 80($KS), $STATE1, $STATE1 vaesenc 96($KS), $STATE1, $STATE1 vaesenc 112($KS), $STATE1, $STATE1 vaesenc 128($KS), $STATE1, $STATE1 vaesenc 144($KS), $STATE1, $STATE1 vaesenc 160($KS), $STATE1, $STATE1 vaesenc 176($KS), $STATE1, $STATE1 vaesenc 192($KS), $STATE1, $STATE1 vaesenc 208($KS), $STATE1, $STATE1 vaesenclast 224($KS), $STATE1, $STATE1 # XOR with Plaintext vpxor ($PT), $STATE1, $STATE1 vmovdqu $STATE1, ($CT) addq \$16, $PT addq \$16, $CT subq \$1, %r10 jnz .L256_enc_msg_x8_loop2 .L256_enc_msg_x8_out: ret .cfi_endproc .size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8 ___ } aes256gcmsiv_enc_msg_x8(); aesgcmsiv_dec(1); sub aes256gcmsiv_kdf { my $ONE = "%xmm8"; my $BLOCK1 = "%xmm4"; my $BLOCK2 = "%xmm6"; my $BLOCK3 = "%xmm7"; my $BLOCK4 = "%xmm11"; my $BLOCK5 = "%xmm12"; my $BLOCK6 = "%xmm13"; my $enc_roundx6 = sub { my ($i, $j) = @_; return <<___; vmovdqa ${\eval($i*16)}(%rdx), $j vaesenc $j, $BLOCK1, $BLOCK1 vaesenc $j, $BLOCK2, $BLOCK2 vaesenc $j, $BLOCK3, $BLOCK3 vaesenc $j, $BLOCK4, $BLOCK4 vaesenc $j, $BLOCK5, $BLOCK5 vaesenc $j, $BLOCK6, $BLOCK6 ___ }; my $enc_roundlastx6 = sub { my ($i, $j) = @_; return <<___; vmovdqa ${\eval($i*16)}(%rdx), $j vaesenclast $j, $BLOCK1, $BLOCK1 vaesenclast $j, $BLOCK2, $BLOCK2 vaesenclast $j, $BLOCK3, $BLOCK3 vaesenclast $j, $BLOCK4, $BLOCK4 vaesenclast $j, $BLOCK5, $BLOCK5 vaesenclast $j, $BLOCK6, $BLOCK6 ___ }; # void aes256gcmsiv_kdf(const uint8_t nonce[16], # uint8_t *out_key_material, # const uint8_t *key_schedule); $code.=<<___; .globl aes256gcmsiv_kdf .type aes256gcmsiv_kdf,\@function,3 .align 16 aes256gcmsiv_kdf: .cfi_startproc # parameter 1: %rdi Pointer to NONCE # parameter 2: %rsi Pointer to CT # parameter 4: %rdx Pointer to keys vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key vmovdqa 0*16(%rdi), $BLOCK1 vmovdqa and_mask(%rip), $BLOCK4 vmovdqa one(%rip), $ONE vpshufd \$0x90, $BLOCK1, $BLOCK1 vpand $BLOCK4, $BLOCK1, $BLOCK1 vpaddd $ONE, $BLOCK1, $BLOCK2 vpaddd $ONE, $BLOCK2, $BLOCK3 vpaddd $ONE, $BLOCK3, $BLOCK4 vpaddd $ONE, $BLOCK4, $BLOCK5 vpaddd $ONE, $BLOCK5, $BLOCK6 vpxor %xmm1, $BLOCK1, $BLOCK1 vpxor %xmm1, $BLOCK2, $BLOCK2 vpxor %xmm1, $BLOCK3, $BLOCK3 vpxor %xmm1, $BLOCK4, $BLOCK4 vpxor %xmm1, $BLOCK5, $BLOCK5 vpxor %xmm1, $BLOCK6, $BLOCK6 ${\$enc_roundx6->(1, "%xmm1")} ${\$enc_roundx6->(2, "%xmm2")} ${\$enc_roundx6->(3, "%xmm1")} ${\$enc_roundx6->(4, "%xmm2")} ${\$enc_roundx6->(5, "%xmm1")} ${\$enc_roundx6->(6, "%xmm2")} ${\$enc_roundx6->(7, "%xmm1")} ${\$enc_roundx6->(8, "%xmm2")} ${\$enc_roundx6->(9, "%xmm1")} ${\$enc_roundx6->(10, "%xmm2")} ${\$enc_roundx6->(11, "%xmm1")} ${\$enc_roundx6->(12, "%xmm2")} ${\$enc_roundx6->(13, "%xmm1")} ${\$enc_roundlastx6->(14, "%xmm2")} vmovdqa $BLOCK1, 0*16(%rsi) vmovdqa $BLOCK2, 1*16(%rsi) vmovdqa $BLOCK3, 2*16(%rsi) vmovdqa $BLOCK4, 3*16(%rsi) vmovdqa $BLOCK5, 4*16(%rsi) vmovdqa $BLOCK6, 5*16(%rsi) ret .cfi_endproc .size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf ___ } aes256gcmsiv_kdf(); print $code; close STDOUT or die "error closing STDOUT";