1#!/usr/bin/env perl
2
3# Copyright (c) 2017, Shay Gueron.
4# Copyright 2017 The BoringSSL Authors
5#
6# Permission to use, copy, modify, and/or distribute this software for any
7# purpose with or without fee is hereby granted, provided that the above
8# copyright notice and this permission notice appear in all copies.
9#
10# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
13# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
15# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
16# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17
18use warnings FATAL => 'all';
19
20$flavour = shift;
21$output  = shift;
22if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
23
24$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
25
26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
28( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
29die "can't locate x86_64-xlate.pl";
30
31open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
32*STDOUT=*OUT;
33
34$code.=<<___;
35.section .rodata
36
37.align 16
38one:
39.quad 1,0
40two:
41.quad 2,0
42three:
43.quad 3,0
44four:
45.quad 4,0
46five:
47.quad 5,0
48six:
49.quad 6,0
50seven:
51.quad 7,0
52eight:
53.quad 8,0
54
55OR_MASK:
56.long 0x00000000,0x00000000,0x00000000,0x80000000
57poly:
58.quad 0x1, 0xc200000000000000
59mask:
60.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
61con1:
62.long 1,1,1,1
63con2:
64.long 0x1b,0x1b,0x1b,0x1b
65con3:
66.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
67and_mask:
68.long 0,0xffffffff, 0xffffffff, 0xffffffff
69___
70
71$code.=<<___;
72.text
73___
74
75sub gfmul {
76  #########################
77  # a = T
78  # b = TMP0 - remains unchanged
79  # res = T
80  # uses also TMP1,TMP2,TMP3,TMP4
81  # __m128i GFMUL(__m128i A, __m128i B);
82
83  my $T = "%xmm0";
84  my $TMP0 = "%xmm1";
85  my $TMP1 = "%xmm2";
86  my $TMP2 = "%xmm3";
87  my $TMP3 = "%xmm4";
88  my $TMP4 = "%xmm5";
89
90  $code.=<<___;
91.type GFMUL,\@abi-omnipotent
92.align 16
93GFMUL:
94.cfi_startproc
95    vpclmulqdq  \$0x00, $TMP0, $T, $TMP1
96    vpclmulqdq  \$0x11, $TMP0, $T, $TMP4
97    vpclmulqdq  \$0x10, $TMP0, $T, $TMP2
98    vpclmulqdq  \$0x01, $TMP0, $T, $TMP3
99    vpxor       $TMP3, $TMP2, $TMP2
100    vpslldq     \$8, $TMP2, $TMP3
101    vpsrldq     \$8, $TMP2, $TMP2
102    vpxor       $TMP3, $TMP1, $TMP1
103    vpxor       $TMP2, $TMP4, $TMP4
104
105    vpclmulqdq  \$0x10, poly(%rip), $TMP1, $TMP2
106    vpshufd     \$78, $TMP1, $TMP3
107    vpxor       $TMP3, $TMP2, $TMP1
108
109    vpclmulqdq  \$0x10, poly(%rip), $TMP1, $TMP2
110    vpshufd     \$78, $TMP1, $TMP3
111    vpxor       $TMP3, $TMP2, $TMP1
112
113    vpxor       $TMP4, $TMP1, $T
114    ret
115.cfi_endproc
116.size GFMUL, .-GFMUL
117___
118}
119gfmul();
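# The routine above is the POLYVAL field multiplication used throughout this
# file: a 128x128-bit carry-less multiply followed by two folding steps
# against the "poly" constant from .rodata.  For reference only, a C
# intrinsics sketch of the same computation (not part of the generated code;
# it assumes PCLMUL/SSE intrinsics from <immintrin.h>, and the name
# gfmul_ref is ours):
#
#   #include <immintrin.h>
#
#   static __m128i gfmul_ref(__m128i a, __m128i b) {
#     const __m128i poly = _mm_set_epi64x((long long)0xc200000000000000ULL, 1);
#     __m128i lo  = _mm_clmulepi64_si128(a, b, 0x00);
#     __m128i hi  = _mm_clmulepi64_si128(a, b, 0x11);
#     __m128i mid = _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x10),
#                                 _mm_clmulepi64_si128(a, b, 0x01));
#     lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
#     hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));
#     for (int i = 0; i < 2; i++) {           // two folding steps, as in GFMUL
#       __m128i t = _mm_clmulepi64_si128(lo, poly, 0x10);
#       lo = _mm_xor_si128(_mm_shuffle_epi32(lo, 78), t);
#     }
#     return _mm_xor_si128(hi, lo);
#   }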
120
121sub aesgcmsiv_htable_init {
122  # aesgcmsiv_htable_init writes an eight-entry table of powers of |H| to
123  # |out_htable|.
124  # void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H);
125
126  my $Htbl = "%rdi";
127  my $H = "%rsi";
128  my $T = "%xmm0";
129  my $TMP0 = "%xmm1";
130
131$code.=<<___;
132.globl aesgcmsiv_htable_init
133.type aesgcmsiv_htable_init,\@function,2
134.align 16
135aesgcmsiv_htable_init:
136.cfi_startproc
137    _CET_ENDBR
138    vmovdqa ($H), $T
139    vmovdqa $T, $TMP0
140    vmovdqa $T, ($Htbl)      # H
141    call GFMUL
142    vmovdqa $T, 16($Htbl)    # H^2
143    call GFMUL
144    vmovdqa $T, 32($Htbl)    # H^3
145    call GFMUL
146    vmovdqa $T, 48($Htbl)    # H^4
147    call GFMUL
148    vmovdqa $T, 64($Htbl)    # H^5
149    call GFMUL
150    vmovdqa $T, 80($Htbl)    # H^6
151    call GFMUL
152    vmovdqa $T, 96($Htbl)    # H^7
153    call GFMUL
154    vmovdqa $T, 112($Htbl)   # H^8
155    ret
156.cfi_endproc
157.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
158___
159}
160aesgcmsiv_htable_init();
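# For reference: the table written above is simply the first eight powers of
# H under the GFMUL product.  A sketch using the illustrative gfmul_ref()
# from the note above:
#
#   static void htable_init_ref(__m128i out[8], __m128i h) {
#     __m128i t = h;
#     out[0] = t;                    // H
#     for (int i = 1; i < 8; i++) {
#       t = gfmul_ref(t, h);         // H^(i+1)
#       out[i] = t;
#     }
#   }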
161
162sub aesgcmsiv_htable6_init {
163  # aesgcmsiv_htable6_init writes a six-entry table of powers of |H| to
164  # |out_htable|.
165  # void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H);
166  #
167  my $Htbl = "%rdi";
168  my $H = "%rsi";
169  my $T = "%xmm0";
170  my $TMP0 = "%xmm1";
171
172  $code.=<<___;
173.globl aesgcmsiv_htable6_init
174.type aesgcmsiv_htable6_init,\@function,2
175.align 16
176aesgcmsiv_htable6_init:
177.cfi_startproc
178    _CET_ENDBR
179    vmovdqa ($H), $T
180    vmovdqa $T, $TMP0
181    vmovdqa $T, ($Htbl)      # H
182    call GFMUL
183    vmovdqa $T, 16($Htbl)    # H^2
184    call GFMUL
185    vmovdqa $T, 32($Htbl)    # H^3
186    call GFMUL
187    vmovdqa $T, 48($Htbl)    # H^4
188    call GFMUL
189    vmovdqa $T, 64($Htbl)    # H^5
190    call GFMUL
191    vmovdqa $T, 80($Htbl)    # H^6
192    ret
193.cfi_endproc
194.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
195___
196}
197aesgcmsiv_htable6_init();
198
199sub aesgcmsiv_htable_polyval {
200  # void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T);
201  # parameter 1: %rdi     Htable  - pointer to Htable
202  # parameter 2: %rsi     INp     - pointer to input
203  # parameter 3: %rdx     LEN     - length of BUFFER in bytes
204  # parameter 4: %rcx     T       - pointer to POLYVAL output
205
206  my $DATA = "%xmm0";
207  my $hlp0 = "%r11";
208  my $Htbl = "%rdi";
209  my $inp = "%rsi";
210  my $len = "%rdx";
211  my $TMP0 = "%xmm3";
212  my $TMP1 = "%xmm4";
213  my $TMP2 = "%xmm5";
214  my $TMP3 = "%xmm6";
215  my $TMP4 = "%xmm7";
216  my $Tp = "%rcx";
217  my $T = "%xmm1";
218  my $Xhi = "%xmm9";
219
220  my $SCHOOLBOOK_AAD = sub {
221    my ($i)=@_;
222    return <<___;
223    vpclmulqdq \$0x01, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
224    vpxor $TMP3, $TMP2, $TMP2
225    vpclmulqdq \$0x00, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
226    vpxor $TMP3, $TMP0, $TMP0
227    vpclmulqdq \$0x11, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
228    vpxor $TMP3, $TMP1, $TMP1
229    vpclmulqdq \$0x10, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
230    vpxor $TMP3, $TMP2, $TMP2
231___
232  };
233
234  $code.=<<___;
235.globl aesgcmsiv_htable_polyval
236.type aesgcmsiv_htable_polyval,\@function,4
237.align 16
238aesgcmsiv_htable_polyval:
239.cfi_startproc
240    _CET_ENDBR
241    test  $len, $len
242    jnz   .Lhtable_polyval_start
243    ret
244
245.Lhtable_polyval_start:
246    vzeroall
247
248    # We hash 8 blocks each iteration. If the total number of blocks is not a
249    # multiple of 8, we first hash the leading n%8 blocks.
250    movq $len, $hlp0
251    andq \$127, $hlp0
252
253    jz .Lhtable_polyval_no_prefix
254
255    vpxor $Xhi, $Xhi, $Xhi
256    vmovdqa ($Tp), $T
257    sub $hlp0, $len
258
259    sub \$16, $hlp0
260
261    # hash first prefix block
262    vmovdqu ($inp), $DATA
263    vpxor $T, $DATA, $DATA
264
265    vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2
266    vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0
267    vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1
268    vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
269    vpxor $TMP3, $TMP2, $TMP2
270
271    lea 16($inp), $inp
272    test $hlp0, $hlp0
273    jnz .Lhtable_polyval_prefix_loop
274    jmp .Lhtable_polyval_prefix_complete
275
276    # hash remaining prefix blocks (up to 7 total prefix blocks)
277.align 64
278.Lhtable_polyval_prefix_loop:
279    sub \$16, $hlp0
280
281    vmovdqu ($inp), $DATA           # next data block
282
283    vpclmulqdq  \$0x00, ($Htbl,$hlp0), $DATA, $TMP3
284    vpxor       $TMP3, $TMP0, $TMP0
285    vpclmulqdq  \$0x11, ($Htbl,$hlp0), $DATA, $TMP3
286    vpxor       $TMP3, $TMP1, $TMP1
287    vpclmulqdq  \$0x01, ($Htbl,$hlp0), $DATA, $TMP3
288    vpxor       $TMP3, $TMP2, $TMP2
289    vpclmulqdq  \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
290    vpxor       $TMP3, $TMP2, $TMP2
291
292    test $hlp0, $hlp0
293
294    lea 16($inp), $inp
295
296    jnz .Lhtable_polyval_prefix_loop
297
298.Lhtable_polyval_prefix_complete:
299    vpsrldq \$8, $TMP2, $TMP3
300    vpslldq \$8, $TMP2, $TMP2
301
302    vpxor $TMP3, $TMP1, $Xhi
303    vpxor $TMP2, $TMP0, $T
304
305    jmp .Lhtable_polyval_main_loop
306
307.Lhtable_polyval_no_prefix:
308    # At this point we know the number of blocks is a multiple of 8. However,
309    # the reduction in the main loop includes a multiplication by x^(-128). In
310    # order to counter this, the existing tag needs to be multiplied by x^128.
311    # In practice, this just means that it is loaded into $Xhi, not $T.
312    vpxor $T, $T, $T
313    vmovdqa ($Tp), $Xhi
314
315.align 64
316.Lhtable_polyval_main_loop:
317    sub \$0x80, $len
318    jb .Lhtable_polyval_out
319
320    vmovdqu 16*7($inp), $DATA      # Ii
321
322    vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2
323    vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0
324    vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1
325    vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3
326    vpxor $TMP3, $TMP2, $TMP2
327
328    #########################################################
329    vmovdqu 16*6($inp), $DATA
330    ${\$SCHOOLBOOK_AAD->(1)}
331
332    #########################################################
333    vmovdqu 16*5($inp), $DATA
334
335    vpclmulqdq \$0x10, poly(%rip), $T, $TMP4         # reduction stage 1a
336    vpalignr \$8, $T, $T, $T
337
338    ${\$SCHOOLBOOK_AAD->(2)}
339
340    vpxor $TMP4, $T, $T                              # reduction stage 1b
341    #########################################################
342    vmovdqu     16*4($inp), $DATA
343
344    ${\$SCHOOLBOOK_AAD->(3)}
345    #########################################################
346    vmovdqu     16*3($inp), $DATA
347
348    vpclmulqdq \$0x10, poly(%rip), $T, $TMP4         # reduction stage 2a
349    vpalignr \$8, $T, $T, $T
350
351    ${\$SCHOOLBOOK_AAD->(4)}
352
353    vpxor $TMP4, $T, $T                              # reduction stage 2b
354    #########################################################
355    vmovdqu 16*2($inp), $DATA
356
357    ${\$SCHOOLBOOK_AAD->(5)}
358
359    vpxor $Xhi, $T, $T                               # reduction finalize
360    #########################################################
361    vmovdqu 16*1($inp), $DATA
362
363    ${\$SCHOOLBOOK_AAD->(6)}
364    #########################################################
365    vmovdqu 16*0($inp), $DATA
366    vpxor $T, $DATA, $DATA
367
368    ${\$SCHOOLBOOK_AAD->(7)}
369    #########################################################
370    vpsrldq \$8, $TMP2, $TMP3
371    vpslldq \$8, $TMP2, $TMP2
372
373    vpxor $TMP3, $TMP1, $Xhi
374    vpxor $TMP2, $TMP0, $T
375
376    lea 16*8($inp), $inp
377    jmp .Lhtable_polyval_main_loop
378
379    #########################################################
380
381.Lhtable_polyval_out:
382    vpclmulqdq  \$0x10, poly(%rip), $T, $TMP3
383    vpalignr    \$8, $T, $T, $T
384    vpxor       $TMP3, $T, $T
385
386    vpclmulqdq  \$0x10, poly(%rip), $T, $TMP3
387    vpalignr    \$8, $T, $T, $T
388    vpxor       $TMP3, $T, $T
389    vpxor       $Xhi, $T, $T
390
391    vmovdqu $T, ($Tp)
392    vzeroupper
393    ret
394.cfi_endproc
395.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
396___
397}
398aesgcmsiv_htable_polyval();
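# The function above batches eight blocks per iteration against the table of
# powers of H and defers part of the reduction, but the value it produces is
# the same as this plain Horner-style accumulation (a sketch; gfmul_ref() is
# the illustrative helper from the GFMUL note, and <stdint.h>/<stddef.h> are
# assumed for uint8_t/size_t):
#
#   static void htable_polyval_ref(const __m128i htbl[8], const uint8_t *msg,
#                                  size_t n_blocks, __m128i *tag) {
#     __m128i h = htbl[0];                              // H^1
#     __m128i acc = *tag;
#     for (size_t i = 0; i < n_blocks; i++) {
#       __m128i m = _mm_loadu_si128((const __m128i *)(msg + 16 * i));
#       acc = gfmul_ref(_mm_xor_si128(acc, m), h);
#     }
#     *tag = acc;
#   }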
399
400sub aesgcmsiv_polyval_horner {
401  #void aesgcmsiv_polyval_horner(unsigned char T[16],  // output
402  #      const unsigned char* H, // H
403  #      unsigned char* BUF,  // Buffer
404  #      unsigned int blocks);  // Len2
405  #
406# parameter 1: %rdi T - pointer to POLYVAL output
407  # parameter 2: %rsi Hp - pointer to H (user key)
408  # parameter 3: %rdx INp - pointer to input
409  # parameter 4: %rcx L - total number of blocks in input BUFFER
410  #
411  my $T = "%rdi";
412  my $Hp = "%rsi";
413  my $INp = "%rdx";
414  my $L = "%rcx";
415  my $LOC = "%r10";
416  my $LEN = "%eax";
417  my $H = "%xmm1";
418  my $RES = "%xmm0";
419
420  $code.=<<___;
421.globl aesgcmsiv_polyval_horner
422.type aesgcmsiv_polyval_horner,\@function,4
423.align 16
424aesgcmsiv_polyval_horner:
425.cfi_startproc
426    _CET_ENDBR
427    test $L, $L
428    jnz .Lpolyval_horner_start
429    ret
430
431.Lpolyval_horner_start:
432    # Compute POLYVAL over the buffer with one GFMUL per 16-byte block:
433    # RES = GFMUL(RES, H)
434
435    xorq $LOC, $LOC
436    shlq \$4, $L    # L contains number of bytes to process
437
438    vmovdqa ($Hp), $H
439    vmovdqa ($T), $RES
440
441.Lpolyval_horner_loop:
442    vpxor ($INp,$LOC), $RES, $RES  # RES = RES + Xi
443    call GFMUL  # RES = RES * H
444
445    add \$16, $LOC
446    cmp $LOC, $L
447    jne .Lpolyval_horner_loop
448
449    # The calculation of T is complete; RES holds the final value.
450    vmovdqa $RES, ($T)
451    ret
452.cfi_endproc
453.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
454___
455}
456aesgcmsiv_polyval_horner();
457
458# void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
459# parameter 1: %rdi
460# parameter 2: %rsi
461$code.=<<___;
462.globl aes128gcmsiv_aes_ks
463.type aes128gcmsiv_aes_ks,\@function,2
464.align 16
465aes128gcmsiv_aes_ks:
466.cfi_startproc
467    _CET_ENDBR
468    vmovdqu (%rdi), %xmm1           # xmm1 = user key
469    vmovdqa %xmm1, (%rsi)           # rsi points to output
470
471    vmovdqa con1(%rip), %xmm0
472    vmovdqa mask(%rip), %xmm15
473
474    movq \$8, %rax
475
476.Lks128_loop:
477    addq \$16, %rsi                 # rsi points to the next round key
478    subq \$1, %rax
479    vpshufb %xmm15, %xmm1, %xmm2    # xmm2 = shuffled user key
480    vaesenclast %xmm0, %xmm2, %xmm2
481    vpslld \$1, %xmm0, %xmm0
482    vpslldq \$4, %xmm1, %xmm3
483    vpxor %xmm3, %xmm1, %xmm1
484    vpslldq \$4, %xmm3, %xmm3
485    vpxor %xmm3, %xmm1, %xmm1
486    vpslldq \$4, %xmm3, %xmm3
487    vpxor %xmm3, %xmm1, %xmm1
488    vpxor %xmm2, %xmm1, %xmm1
489    vmovdqa %xmm1, (%rsi)
490    jne .Lks128_loop
491
492    vmovdqa con2(%rip), %xmm0
493    vpshufb %xmm15, %xmm1, %xmm2
494    vaesenclast %xmm0, %xmm2, %xmm2
495    vpslld \$1, %xmm0, %xmm0
496    vpslldq \$4, %xmm1, %xmm3
497    vpxor %xmm3, %xmm1, %xmm1
498    vpslldq \$4, %xmm3, %xmm3
499    vpxor %xmm3, %xmm1, %xmm1
500    vpslldq \$4, %xmm3, %xmm3
501    vpxor %xmm3, %xmm1, %xmm1
502    vpxor %xmm2, %xmm1, %xmm1
503    vmovdqa %xmm1, 16(%rsi)
504
505    vpshufb %xmm15, %xmm1, %xmm2
506    vaesenclast %xmm0, %xmm2, %xmm2
507    vpslldq \$4, %xmm1, %xmm3
508    vpxor %xmm3, %xmm1, %xmm1
509    vpslldq \$4, %xmm3, %xmm3
510    vpxor %xmm3, %xmm1, %xmm1
511    vpslldq \$4, %xmm3, %xmm3
512    vpxor %xmm3, %xmm1, %xmm1
513    vpxor %xmm2, %xmm1, %xmm1
514    vmovdqa %xmm1, 32(%rsi)
515    ret
516.cfi_endproc
517.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
518___
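# The loop above derives AES-128 round keys without aeskeygenassist: the
# vpshufb broadcasts RotWord(w3) into every 32-bit lane, vaesenclast then
# applies SubBytes and XORs in the round constant (ShiftRows is a no-op when
# all four lanes are equal), and the vpslldq/vpxor chain forms the running
# XOR of the previous round key's words.  A C intrinsics sketch of one such
# round (illustrative only; the caller doubles con between rounds, with the
# 0x1b and 0x36 constants handled separately as above):
#
#   static __m128i ks128_round_ref(__m128i key, __m128i con, __m128i mask) {
#     __m128i t = _mm_shuffle_epi8(key, mask);  // RotWord(w3) in every lane
#     t = _mm_aesenclast_si128(t, con);         // SubBytes, then XOR rcon
#     __m128i s = _mm_slli_si128(key, 4);       // running XOR of w0..w3
#     key = _mm_xor_si128(key, s);
#     s = _mm_slli_si128(s, 4);
#     key = _mm_xor_si128(key, s);
#     s = _mm_slli_si128(s, 4);
#     key = _mm_xor_si128(key, s);
#     return _mm_xor_si128(key, t);             // next round key
#   }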
519
520# void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
521# parameter 1: %rdi
522# parameter 2: %rsi
523$code.=<<___;
524.globl aes256gcmsiv_aes_ks
525.type aes256gcmsiv_aes_ks,\@function,2
526.align 16
527aes256gcmsiv_aes_ks:
528.cfi_startproc
529    _CET_ENDBR
530    vmovdqu (%rdi), %xmm1
531    vmovdqu 16(%rdi), %xmm3
532    vmovdqa %xmm1, (%rsi)
533    vmovdqa %xmm3, 16(%rsi)
534    vmovdqa con1(%rip), %xmm0
535    vmovdqa mask(%rip), %xmm15
536    vpxor %xmm14, %xmm14, %xmm14
537    mov \$6, %rax
538
539.Lks256_loop:
540    add \$32, %rsi
541    subq \$1, %rax
542    vpshufb %xmm15, %xmm3, %xmm2
543    vaesenclast %xmm0, %xmm2, %xmm2
544    vpslld \$1, %xmm0, %xmm0
545    vpsllq \$32, %xmm1, %xmm4
546    vpxor %xmm4, %xmm1, %xmm1
547    vpshufb con3(%rip), %xmm1,  %xmm4
548    vpxor %xmm4, %xmm1, %xmm1
549    vpxor %xmm2, %xmm1, %xmm1
550    vmovdqa %xmm1, (%rsi)
551    vpshufd \$0xff, %xmm1, %xmm2
552    vaesenclast %xmm14, %xmm2, %xmm2
553    vpsllq \$32, %xmm3, %xmm4
554    vpxor %xmm4, %xmm3, %xmm3
555    vpshufb con3(%rip), %xmm3,  %xmm4
556    vpxor %xmm4, %xmm3, %xmm3
557    vpxor %xmm2, %xmm3, %xmm3
558    vmovdqa %xmm3, 16(%rsi)
559    jne .Lks256_loop
560
561    vpshufb %xmm15, %xmm3, %xmm2
562    vaesenclast %xmm0, %xmm2, %xmm2
563    vpsllq \$32, %xmm1, %xmm4
564    vpxor %xmm4, %xmm1, %xmm1
565    vpshufb con3(%rip), %xmm1,  %xmm4
566    vpxor %xmm4, %xmm1, %xmm1
567    vpxor %xmm2, %xmm1, %xmm1
568    vmovdqa %xmm1, 32(%rsi)
569    ret
570.cfi_endproc
571___
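# Each iteration of the AES-256 schedule above produces two round keys: the
# first half is derived as in AES-128 (RotWord + SubWord + round constant),
# while the second half applies SubWord to the last word of the freshly
# computed key with no rotation and no round constant (vpshufd with 0xff,
# then vaesenclast against an all-zero key).  A sketch of that second half
# (illustrative; the vpsllq/vpshufb pair in the assembly computes the same
# running XOR as the three byte-shifts here):
#
#   static __m128i ks256_even_half_ref(__m128i new_key_a, __m128i prev_key_b,
#                                      __m128i zero) {
#     __m128i t = _mm_shuffle_epi32(new_key_a, 0xff); // broadcast last word
#     t = _mm_aesenclast_si128(t, zero);              // SubBytes only
#     __m128i s = _mm_slli_si128(prev_key_b, 4);      // running XOR of words
#     prev_key_b = _mm_xor_si128(prev_key_b, s);
#     s = _mm_slli_si128(s, 4);
#     prev_key_b = _mm_xor_si128(prev_key_b, s);
#     s = _mm_slli_si128(s, 4);
#     prev_key_b = _mm_xor_si128(prev_key_b, s);
#     return _mm_xor_si128(prev_key_b, t);            // next "b" round key
#   }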
572
573sub aes128gcmsiv_aes_ks_enc_x1 {
574  my $KS1_REGA = "%xmm1";
575  my $KS1_REGB = "%xmm2";
576  my $BLOCK1 = "%xmm4";
577  my $AUXREG = "%xmm3";
578
579  my $KS_BLOCK = sub {
580    my ($reg, $reg2, $auxReg) = @_;
581    return <<___;
582    vpsllq \$32, $reg, $auxReg         #!!saving mov instruction to xmm3
583    vpxor $auxReg, $reg, $reg
584    vpshufb con3(%rip), $reg,  $auxReg
585    vpxor $auxReg, $reg, $reg
586    vpxor $reg2, $reg, $reg
587___
588  };
589
590  my $round = sub {
591    my ($i, $j) = @_;
592    return <<___;
593    vpshufb %xmm15, %xmm1, %xmm2      #!!saving mov instruction to xmm2
594    vaesenclast %xmm0, %xmm2, %xmm2
595    vpslld \$1, %xmm0, %xmm0
596    ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
597    vaesenc %xmm1, $BLOCK1, $BLOCK1
598    vmovdqa %xmm1, ${\eval(16*$i)}($j)
599___
600  };
601
602  my $roundlast = sub {
603    my ($i, $j) = @_;
604    return <<___;
605    vpshufb %xmm15, %xmm1, %xmm2      #!!saving mov instruction to xmm2
606    vaesenclast %xmm0, %xmm2, %xmm2
607    ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
608    vaesenclast %xmm1, $BLOCK1, $BLOCK1
609    vmovdqa %xmm1, ${\eval(16*$i)}($j)
610___
611  };
612
613# parameter 1: %rdi                         Pointer to PT
614# parameter 2: %rsi                         Pointer to CT
615# parameter 3: %rdx                         Pointer to keys
616# parameter 4: %rcx                         Pointer to initial key
617  $code.=<<___;
618.globl aes128gcmsiv_aes_ks_enc_x1
619.type aes128gcmsiv_aes_ks_enc_x1,\@function,4
620.align 16
621aes128gcmsiv_aes_ks_enc_x1:
622.cfi_startproc
623    _CET_ENDBR
624    vmovdqa (%rcx), %xmm1                 # xmm1 = first 16 bytes of random key
625    vmovdqa 0*16(%rdi), $BLOCK1
626
627    vmovdqa %xmm1, (%rdx)                 # KEY[0] = first 16 bytes of random key
628    vpxor %xmm1, $BLOCK1, $BLOCK1
629
630    vmovdqa con1(%rip), %xmm0             # xmm0  = 1,1,1,1
631    vmovdqa mask(%rip), %xmm15            # xmm15 = mask
632
633    ${\$round->(1, "%rdx")}
634    ${\$round->(2, "%rdx")}
635    ${\$round->(3, "%rdx")}
636    ${\$round->(4, "%rdx")}
637    ${\$round->(5, "%rdx")}
638    ${\$round->(6, "%rdx")}
639    ${\$round->(7, "%rdx")}
640    ${\$round->(8, "%rdx")}
641
642    vmovdqa con2(%rip), %xmm0
643
644    ${\$round->(9, "%rdx")}
645    ${\$roundlast->(10, "%rdx")}
646
647    vmovdqa $BLOCK1, 0*16(%rsi)
648    ret
649.cfi_endproc
650.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
651___
652}
653aes128gcmsiv_aes_ks_enc_x1();
654
655sub aes128gcmsiv_kdf {
656  my $BLOCK1 = "%xmm9";
657  my $BLOCK2 = "%xmm10";
658  my $BLOCK3 = "%xmm11";
659  my $BLOCK4 = "%xmm12";
660  my $BLOCK5 = "%xmm13";
661  my $BLOCK6 = "%xmm14";
662  my $ONE = "%xmm13";
663  my $KSp = "%rdx";
664  my $STATE_1 = "%xmm1";
665
666  my $enc_roundx4 = sub {
667    my ($i, $j) = @_;
668    return <<___;
669    vmovdqa ${\eval($i*16)}(%rdx), $j
670    vaesenc $j, $BLOCK1, $BLOCK1
671    vaesenc $j, $BLOCK2, $BLOCK2
672    vaesenc $j, $BLOCK3, $BLOCK3
673    vaesenc $j, $BLOCK4, $BLOCK4
674___
675  };
676
677  my $enc_roundlastx4 = sub {
678    my ($i, $j) = @_;
679    return <<___;
680    vmovdqa ${\eval($i*16)}(%rdx), $j
681    vaesenclast $j, $BLOCK1, $BLOCK1
682    vaesenclast $j, $BLOCK2, $BLOCK2
683    vaesenclast $j, $BLOCK3, $BLOCK3
684    vaesenclast $j, $BLOCK4, $BLOCK4
685___
686  };
687
688# void aes128gcmsiv_kdf(const uint8_t nonce[16],
689#                       uint8_t *out_key_material,
690#                       const uint8_t *key_schedule);
691  $code.=<<___;
692.globl aes128gcmsiv_kdf
693.type aes128gcmsiv_kdf,\@function,3
694.align 16
695aes128gcmsiv_kdf:
696.cfi_startproc
697    _CET_ENDBR
698# parameter 1: %rdi                         Pointer to NONCE
699# parameter 2: %rsi                         Pointer to output key material
700# parameter 3: %rdx                         Pointer to expanded key schedule
701
702    vmovdqa (%rdx), %xmm1                  # xmm1 = first 16 bytes of random key
703    vmovdqa 0*16(%rdi), $BLOCK1
704    vmovdqa and_mask(%rip), $BLOCK4
705    vmovdqa one(%rip), $ONE
706    vpshufd \$0x90, $BLOCK1, $BLOCK1
707    vpand $BLOCK4, $BLOCK1, $BLOCK1
708    vpaddd $ONE, $BLOCK1, $BLOCK2
709    vpaddd $ONE, $BLOCK2, $BLOCK3
710    vpaddd $ONE, $BLOCK3, $BLOCK4
711
712    vpxor %xmm1, $BLOCK1, $BLOCK1
713    vpxor %xmm1, $BLOCK2, $BLOCK2
714    vpxor %xmm1, $BLOCK3, $BLOCK3
715    vpxor %xmm1, $BLOCK4, $BLOCK4
716
717    ${\$enc_roundx4->(1, "%xmm1")}
718    ${\$enc_roundx4->(2, "%xmm2")}
719    ${\$enc_roundx4->(3, "%xmm1")}
720    ${\$enc_roundx4->(4, "%xmm2")}
721    ${\$enc_roundx4->(5, "%xmm1")}
722    ${\$enc_roundx4->(6, "%xmm2")}
723    ${\$enc_roundx4->(7, "%xmm1")}
724    ${\$enc_roundx4->(8, "%xmm2")}
725    ${\$enc_roundx4->(9, "%xmm1")}
726    ${\$enc_roundlastx4->(10, "%xmm2")}
727
728    vmovdqa $BLOCK1, 0*16(%rsi)
729    vmovdqa $BLOCK2, 1*16(%rsi)
730    vmovdqa $BLOCK3, 2*16(%rsi)
731    vmovdqa $BLOCK4, 3*16(%rsi)
732    ret
733.cfi_endproc
734.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
735___
736}
737aes128gcmsiv_kdf();
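# The KDF above encrypts four counter||nonce blocks with the expanded key
# passed as the third argument; the caller slices the 64 bytes of output into
# the per-record keys (RFC 8452).  A sketch of how the four input blocks are
# formed, matching the vpshufd/vpand sequence above (illustrative; only the
# first 12 bytes of the nonce buffer are used):
#
#   #include <stdint.h>
#   #include <string.h>
#
#   static void kdf_blocks_ref(uint8_t out[4][16], const uint8_t nonce[12]) {
#     for (uint32_t i = 0; i < 4; i++) {
#       out[i][0] = (uint8_t)i;                 // little-endian counter 0..3
#       out[i][1] = out[i][2] = out[i][3] = 0;
#       memcpy(&out[i][4], nonce, 12);          // 96-bit nonce in bytes 4..15
#     }
#   }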
738
739sub aes128gcmsiv_enc_msg_x4 {
740  my $CTR1 = "%xmm0";
741  my $CTR2 = "%xmm1";
742  my $CTR3 = "%xmm2";
743  my $CTR4 = "%xmm3";
744  my $ADDER = "%xmm4";
745
746  my $STATE1 = "%xmm5";
747  my $STATE2 = "%xmm6";
748  my $STATE3 = "%xmm7";
749  my $STATE4 = "%xmm8";
750
751  my $TMP = "%xmm12";
752  my $TMP2 = "%xmm13";
753  my $TMP3 = "%xmm14";
754  my $IV = "%xmm15";
755
756  my $PT = "%rdi";
757  my $CT = "%rsi";
758  my $TAG = "%rdx";
759  my $KS = "%rcx";
760  my $LEN = "%r8";
761
762  my $aes_round = sub {
763    my ($i) = @_;
764    return <<___;
765    vmovdqu ${\eval($i*16)}($KS), $TMP
766    vaesenc $TMP, $STATE1, $STATE1
767    vaesenc $TMP, $STATE2, $STATE2
768    vaesenc $TMP, $STATE3, $STATE3
769    vaesenc $TMP, $STATE4, $STATE4
770___
771  };
772
773  my $aes_lastround = sub {
774    my ($i) = @_;
775    return <<___;
776    vmovdqu ${\eval($i*16)}($KS), $TMP
777    vaesenclast $TMP, $STATE1, $STATE1
778    vaesenclast $TMP, $STATE2, $STATE2
779    vaesenclast $TMP, $STATE3, $STATE3
780    vaesenclast $TMP, $STATE4, $STATE4
781___
782  };
783
784# void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
785#                              unsigned char* TAG, unsigned char* KS,
786#                              size_t byte_len);
787# parameter 1: %rdi     #PT
788# parameter 2: %rsi     #CT
789# parameter 3: %rdx     #TAG  [127 126 ... 0]  IV=[127...32]
790# parameter 4: %rcx     #KS
791# parameter 5: %r8      #LEN MSG_length in bytes
792  $code.=<<___;
793.globl aes128gcmsiv_enc_msg_x4
794.type aes128gcmsiv_enc_msg_x4,\@function,5
795.align 16
796aes128gcmsiv_enc_msg_x4:
797.cfi_startproc
798    _CET_ENDBR
799    test $LEN, $LEN
800    jnz .L128_enc_msg_x4_start
801    ret
802
803.L128_enc_msg_x4_start:
804    pushq %r12
805.cfi_push %r12
806    pushq %r13
807.cfi_push %r13
808
809    shrq \$4, $LEN      # LEN = num of blocks
810    movq $LEN, %r10
811    shlq \$62, %r10
812    shrq \$62, %r10
813
814    # make IV from TAG
815    vmovdqa ($TAG), $IV
816    vpor OR_MASK(%rip), $IV, $IV  #IV = [1]TAG[126...32][00..00]
817
818    vmovdqu four(%rip), $ADDER     # Register to increment counters
819    vmovdqa $IV, $CTR1             # CTR1 = TAG[1][127...32][00..00]
820    vpaddd one(%rip), $IV, $CTR2   # CTR2 = TAG[1][127...32][00..01]
821    vpaddd two(%rip), $IV, $CTR3   # CTR3 = TAG[1][127...32][00..02]
822    vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
823
824    shrq \$2, $LEN
825    je .L128_enc_msg_x4_check_remainder
826
827    subq \$64, $CT
828    subq \$64, $PT
829
830.L128_enc_msg_x4_loop1:
831    addq \$64, $CT
832    addq \$64, $PT
833
834    vmovdqa $CTR1, $STATE1
835    vmovdqa $CTR2, $STATE2
836    vmovdqa $CTR3, $STATE3
837    vmovdqa $CTR4, $STATE4
838
839    vpxor ($KS), $STATE1, $STATE1
840    vpxor ($KS), $STATE2, $STATE2
841    vpxor ($KS), $STATE3, $STATE3
842    vpxor ($KS), $STATE4, $STATE4
843
844    ${\$aes_round->(1)}
845    vpaddd $ADDER, $CTR1, $CTR1
846    ${\$aes_round->(2)}
847    vpaddd $ADDER, $CTR2, $CTR2
848    ${\$aes_round->(3)}
849    vpaddd $ADDER, $CTR3, $CTR3
850    ${\$aes_round->(4)}
851    vpaddd $ADDER, $CTR4, $CTR4
852
853    ${\$aes_round->(5)}
854    ${\$aes_round->(6)}
855    ${\$aes_round->(7)}
856    ${\$aes_round->(8)}
857    ${\$aes_round->(9)}
858    ${\$aes_lastround->(10)}
859
860    # XOR with Plaintext
861    vpxor 0*16($PT), $STATE1, $STATE1
862    vpxor 1*16($PT), $STATE2, $STATE2
863    vpxor 2*16($PT), $STATE3, $STATE3
864    vpxor 3*16($PT), $STATE4, $STATE4
865
866    subq \$1, $LEN
867
868    vmovdqu $STATE1, 0*16($CT)
869    vmovdqu $STATE2, 1*16($CT)
870    vmovdqu $STATE3, 2*16($CT)
871    vmovdqu $STATE4, 3*16($CT)
872
873    jne .L128_enc_msg_x4_loop1
874
875    addq \$64,$CT
876    addq \$64,$PT
877
878.L128_enc_msg_x4_check_remainder:
879    cmpq \$0, %r10
880    je .L128_enc_msg_x4_out
881
882.L128_enc_msg_x4_loop2:
883    # enc each block separately
884    # CTR1 holds the next counter value (even if the main loop was not executed)
885    vmovdqa $CTR1, $STATE1
886    vpaddd one(%rip), $CTR1, $CTR1  # inc counter
887
888    vpxor ($KS), $STATE1, $STATE1
889    vaesenc 16($KS), $STATE1, $STATE1
890    vaesenc 32($KS), $STATE1, $STATE1
891    vaesenc 48($KS), $STATE1, $STATE1
892    vaesenc 64($KS), $STATE1, $STATE1
893    vaesenc 80($KS), $STATE1, $STATE1
894    vaesenc 96($KS), $STATE1, $STATE1
895    vaesenc 112($KS), $STATE1, $STATE1
896    vaesenc 128($KS), $STATE1, $STATE1
897    vaesenc 144($KS), $STATE1, $STATE1
898    vaesenclast 160($KS), $STATE1, $STATE1
899
900    # XOR with plaintext
901    vpxor ($PT), $STATE1, $STATE1
902    vmovdqu $STATE1, ($CT)
903
904    addq \$16, $PT
905    addq \$16, $CT
906
907    subq \$1, %r10
908    jne .L128_enc_msg_x4_loop2
909
910.L128_enc_msg_x4_out:
911    popq %r13
912.cfi_pop %r13
913    popq %r12
914.cfi_pop %r12
915    ret
916.cfi_endproc
917.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
918___
919}
920aes128gcmsiv_enc_msg_x4();
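# Both enc_msg variants (and the dec routines below) implement the same
# counter mode: the initial counter block is the tag with its most
# significant bit forced to 1 (OR_MASK), and only the low 32 bits are
# incremented, little-endian, once per 16-byte block.  A whole-block C sketch
# of that layer (illustrative; AES_encrypt/AES_KEY are the single-block
# primitives from <openssl/aes.h>):
#
#   #include <openssl/aes.h>
#   #include <stdint.h>
#   #include <string.h>
#
#   static void gcmsiv_ctr_ref(uint8_t *out, const uint8_t *in, size_t blocks,
#                              const uint8_t tag[16], const AES_KEY *key) {
#     uint8_t ctr[16];
#     memcpy(ctr, tag, 16);
#     ctr[15] |= 0x80;                          // OR_MASK sets bit 127
#     for (size_t b = 0; b < blocks; b++) {
#       uint8_t ks[16];
#       AES_encrypt(ctr, ks, key);
#       for (int i = 0; i < 16; i++) out[16 * b + i] = in[16 * b + i] ^ ks[i];
#       for (int i = 0; i < 4; i++)             // 32-bit little-endian increment
#         if (++ctr[i] != 0) break;
#     }
#   }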
921
922sub aes128gcmsiv_enc_msg_x8 {
923  my $STATE1 = "%xmm1";
924  my $STATE2 = "%xmm2";
925  my $STATE3 = "%xmm3";
926  my $STATE4 = "%xmm4";
927  my $STATE5 = "%xmm5";
928  my $STATE6 = "%xmm6";
929  my $STATE7 = "%xmm7";
930  my $STATE8 = "%xmm8";
931
932  my $CTR1 = "%xmm0";
933  my $CTR2 = "%xmm9";
934  my $CTR3 = "%xmm10";
935  my $CTR4 = "%xmm11";
936  my $CTR5 = "%xmm12";
937  my $CTR6 = "%xmm13";
938  my $CTR7 = "%xmm14";
939  my $SCHED = "%xmm15";
940
941  my $TMP1 = "%xmm1";
942  my $TMP2 = "%xmm2";
943
944  my $PT = "%rdi";
945  my $CT = "%rsi";
946  my $TAG = "%rdx";
947  my $KS = "%rcx";
948  my $LEN = "%r8";
949
950  my $aes_round8 = sub {
951    my ($i) = @_;
952    return <<___;
953    vmovdqu ${\eval($i*16)}($KS), $SCHED
954    vaesenc $SCHED, $STATE1, $STATE1
955    vaesenc $SCHED, $STATE2, $STATE2
956    vaesenc $SCHED, $STATE3, $STATE3
957    vaesenc $SCHED, $STATE4, $STATE4
958    vaesenc $SCHED, $STATE5, $STATE5
959    vaesenc $SCHED, $STATE6, $STATE6
960    vaesenc $SCHED, $STATE7, $STATE7
961    vaesenc $SCHED, $STATE8, $STATE8
962___
963  };
964
965  my $aes_lastround8 = sub {
966    my ($i) = @_;
967    return <<___;
968    vmovdqu ${\eval($i*16)}($KS), $SCHED
969    vaesenclast $SCHED, $STATE1, $STATE1
970    vaesenclast $SCHED, $STATE2, $STATE2
971    vaesenclast $SCHED, $STATE3, $STATE3
972    vaesenclast $SCHED, $STATE4, $STATE4
973    vaesenclast $SCHED, $STATE5, $STATE5
974    vaesenclast $SCHED, $STATE6, $STATE6
975    vaesenclast $SCHED, $STATE7, $STATE7
976    vaesenclast $SCHED, $STATE8, $STATE8
977___
978  };
979
980# void ENC_MSG_x8(unsigned char* PT,
981#                 unsigned char* CT,
982#                 unsigned char* TAG,
983#                 unsigned char* KS,
984#                 size_t byte_len);
985# parameter 1: %rdi     #PT
986# parameter 2: %rsi     #CT
987# parameter 3: %rdx     #TAG        [127 126 ... 0]  IV=[127...32]
988# parameter 4: %rcx     #KS
989# parameter 5: %r8      #LEN MSG_length in bytes
990  $code.=<<___;
991.globl aes128gcmsiv_enc_msg_x8
992.type aes128gcmsiv_enc_msg_x8,\@function,5
993.align 16
994aes128gcmsiv_enc_msg_x8:
995.cfi_startproc
996    _CET_ENDBR
997    test $LEN, $LEN
998    jnz .L128_enc_msg_x8_start
999    ret
1000
1001.L128_enc_msg_x8_start:
1002    pushq %r12
1003.cfi_push %r12
1004    pushq %r13
1005.cfi_push %r13
1006    pushq %rbp
1007.cfi_push %rbp
1008    movq %rsp, %rbp
1009.cfi_def_cfa_register rbp
1010
1011    # Allocate 64-byte-aligned scratch space on the stack
1012    subq \$128, %rsp
1013    andq \$-64, %rsp
1014
1015    shrq \$4, $LEN  # LEN = num of blocks
1016    movq $LEN, %r10
1017    shlq \$61, %r10
1018    shrq \$61, %r10
1019
1020    # make IV from TAG
1021    vmovdqu ($TAG), $TMP1
1022    vpor OR_MASK(%rip), $TMP1, $TMP1  # TMP1= IV = [1]TAG[126...32][00..00]
1023
1024    # store counter8 on the stack
1025    vpaddd seven(%rip), $TMP1, $CTR1
1026    vmovdqu $CTR1, (%rsp)             # CTR8 = TAG[127...32][00..07]
1027    vpaddd one(%rip), $TMP1, $CTR2    # CTR2 = TAG[127...32][00..01]
1028    vpaddd two(%rip), $TMP1, $CTR3    # CTR3 = TAG[127...32][00..02]
1029    vpaddd three(%rip), $TMP1, $CTR4  # CTR4 = TAG[127...32][00..03]
1030    vpaddd four(%rip), $TMP1, $CTR5   # CTR5 = TAG[127...32][00..04]
1031    vpaddd five(%rip), $TMP1, $CTR6   # CTR6 = TAG[127...32][00..05]
1032    vpaddd six(%rip), $TMP1, $CTR7    # CTR7 = TAG[127...32][00..06]
1033    vmovdqa $TMP1, $CTR1              # CTR1 = TAG[127...32][00..00]
1034
1035    shrq \$3, $LEN
1036    je .L128_enc_msg_x8_check_remainder
1037
1038    subq \$128, $CT
1039    subq \$128, $PT
1040
1041.L128_enc_msg_x8_loop1:
1042    addq \$128, $CT
1043    addq \$128, $PT
1044
1045    vmovdqa $CTR1, $STATE1
1046    vmovdqa $CTR2, $STATE2
1047    vmovdqa $CTR3, $STATE3
1048    vmovdqa $CTR4, $STATE4
1049    vmovdqa $CTR5, $STATE5
1050    vmovdqa $CTR6, $STATE6
1051    vmovdqa $CTR7, $STATE7
1052    # move from stack
1053    vmovdqu (%rsp), $STATE8
1054
1055    vpxor ($KS), $STATE1, $STATE1
1056    vpxor ($KS), $STATE2, $STATE2
1057    vpxor ($KS), $STATE3, $STATE3
1058    vpxor ($KS), $STATE4, $STATE4
1059    vpxor ($KS), $STATE5, $STATE5
1060    vpxor ($KS), $STATE6, $STATE6
1061    vpxor ($KS), $STATE7, $STATE7
1062    vpxor ($KS), $STATE8, $STATE8
1063
1064    ${\$aes_round8->(1)}
1065    vmovdqu (%rsp), $CTR7  # deal with CTR8
1066    vpaddd eight(%rip), $CTR7, $CTR7
1067    vmovdqu $CTR7, (%rsp)
1068    ${\$aes_round8->(2)}
1069    vpsubd one(%rip), $CTR7, $CTR7
1070    ${\$aes_round8->(3)}
1071    vpaddd eight(%rip), $CTR1, $CTR1
1072    ${\$aes_round8->(4)}
1073    vpaddd eight(%rip), $CTR2, $CTR2
1074    ${\$aes_round8->(5)}
1075    vpaddd eight(%rip), $CTR3, $CTR3
1076    ${\$aes_round8->(6)}
1077    vpaddd eight(%rip), $CTR4, $CTR4
1078    ${\$aes_round8->(7)}
1079    vpaddd eight(%rip), $CTR5, $CTR5
1080    ${\$aes_round8->(8)}
1081    vpaddd eight(%rip), $CTR6, $CTR6
1082    ${\$aes_round8->(9)}
1083    ${\$aes_lastround8->(10)}
1084
1085    # XOR with Plaintext
1086    vpxor 0*16($PT), $STATE1, $STATE1
1087    vpxor 1*16($PT), $STATE2, $STATE2
1088    vpxor 2*16($PT), $STATE3, $STATE3
1089    vpxor 3*16($PT), $STATE4, $STATE4
1090    vpxor 4*16($PT), $STATE5, $STATE5
1091    vpxor 5*16($PT), $STATE6, $STATE6
1092    vpxor 6*16($PT), $STATE7, $STATE7
1093    vpxor 7*16($PT), $STATE8, $STATE8
1094
1095    dec $LEN
1096
1097    vmovdqu $STATE1, 0*16($CT)
1098    vmovdqu $STATE2, 1*16($CT)
1099    vmovdqu $STATE3, 2*16($CT)
1100    vmovdqu $STATE4, 3*16($CT)
1101    vmovdqu $STATE5, 4*16($CT)
1102    vmovdqu $STATE6, 5*16($CT)
1103    vmovdqu $STATE7, 6*16($CT)
1104    vmovdqu $STATE8, 7*16($CT)
1105
1106    jne .L128_enc_msg_x8_loop1
1107
1108    addq \$128, $CT
1109    addq \$128, $PT
1110
1111.L128_enc_msg_x8_check_remainder:
1112    cmpq \$0, %r10
1113    je .L128_enc_msg_x8_out
1114
1115.L128_enc_msg_x8_loop2:
1116    # enc each block separately
1117    # CTR1 holds the next counter value (even if the main loop was not executed)
1118    vmovdqa $CTR1, $STATE1
1119    vpaddd one(%rip), $CTR1, $CTR1  # inc counter
1120
1121    vpxor ($KS), $STATE1, $STATE1
1122    vaesenc 16($KS), $STATE1, $STATE1
1123    vaesenc 32($KS), $STATE1, $STATE1
1124    vaesenc 48($KS), $STATE1, $STATE1
1125    vaesenc 64($KS), $STATE1, $STATE1
1126    vaesenc 80($KS), $STATE1, $STATE1
1127    vaesenc 96($KS), $STATE1, $STATE1
1128    vaesenc 112($KS), $STATE1, $STATE1
1129    vaesenc 128($KS), $STATE1, $STATE1
1130    vaesenc 144($KS), $STATE1, $STATE1
1131    vaesenclast 160($KS), $STATE1, $STATE1
1132
1133    # XOR with Plaintext
1134    vpxor ($PT), $STATE1, $STATE1
1135
1136    vmovdqu $STATE1, ($CT)
1137
1138    addq \$16, $PT
1139    addq \$16, $CT
1140
1141    decq %r10
1142    jne .L128_enc_msg_x8_loop2
1143
1144.L128_enc_msg_x8_out:
1145    movq %rbp, %rsp
1146.cfi_def_cfa_register %rsp
1147    popq %rbp
1148.cfi_pop %rbp
1149    popq %r13
1150.cfi_pop %r13
1151    popq %r12
1152.cfi_pop %r12
1153    ret
1154.cfi_endproc
1155.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
1156___
1157}
1158aes128gcmsiv_enc_msg_x8();
1159
1160sub aesgcmsiv_dec {
1161  my ($aes256) = @_;
1162
1163  my $T = "%xmm0";
1164  my $TMP0 = "%xmm1";
1165  my $TMP1 = "%xmm2";
1166  my $TMP2 = "%xmm3";
1167  my $TMP3 = "%xmm4";
1168  my $TMP4 = "%xmm5";
1169  my $TMP5 = "%xmm6";
1170  my $CTR1 = "%xmm7";
1171  my $CTR2 = "%xmm8";
1172  my $CTR3 = "%xmm9";
1173  my $CTR4 = "%xmm10";
1174  my $CTR5 = "%xmm11";
1175  my $CTR6 = "%xmm12";
1176  my $CTR = "%xmm15";
1177  my $CT = "%rdi";
1178  my $PT = "%rsi";
1179  my $POL = "%rdx";
1180  my $Htbl = "%rcx";
1181  my $KS = "%r8";
1182  my $LEN = "%r9";
1183  my $secureBuffer = "%rax";
1184  my $HTABLE_ROUNDS = "%xmm13";
1185
1186  my $labelPrefix = "128";
1187  if ($aes256) {
1188    $labelPrefix = "256";
1189  }
1190
1191  my $aes_round_dec = sub {
1192    my ($i) = @_;
1193    return <<___;
1194    vmovdqu ${\eval($i*16)}($KS), $TMP3
1195    vaesenc $TMP3, $CTR1, $CTR1
1196    vaesenc $TMP3, $CTR2, $CTR2
1197    vaesenc $TMP3, $CTR3, $CTR3
1198    vaesenc $TMP3, $CTR4, $CTR4
1199    vaesenc $TMP3, $CTR5, $CTR5
1200    vaesenc $TMP3, $CTR6, $CTR6
1201___
1202  };
1203
1204  my $aes_lastround_dec = sub {
1205    my ($i) = @_;
1206    return <<___;
1207    vmovdqu ${\eval($i*16)}($KS), $TMP3
1208    vaesenclast $TMP3, $CTR1, $CTR1
1209    vaesenclast $TMP3, $CTR2, $CTR2
1210    vaesenclast $TMP3, $CTR3, $CTR3
1211    vaesenclast $TMP3, $CTR4, $CTR4
1212    vaesenclast $TMP3, $CTR5, $CTR5
1213    vaesenclast $TMP3, $CTR6, $CTR6
1214___
1215  };
1216
1217  my $schoolbook = sub {
1218    my ($i) = @_;
1219    return <<___;
1220    vmovdqu ${\eval($i*16-32)}($secureBuffer), $TMP5
1221    vmovdqu ${\eval($i*16-32)}($Htbl), $HTABLE_ROUNDS
1222
1223    vpclmulqdq \$0x10, $HTABLE_ROUNDS, $TMP5, $TMP3
1224    vpxor $TMP3, $TMP0, $TMP0
1225    vpclmulqdq \$0x11, $HTABLE_ROUNDS, $TMP5, $TMP3
1226    vpxor $TMP3, $TMP1, $TMP1
1227    vpclmulqdq \$0x00, $HTABLE_ROUNDS, $TMP5, $TMP3
1228    vpxor $TMP3, $TMP2, $TMP2
1229    vpclmulqdq \$0x01, $HTABLE_ROUNDS, $TMP5, $TMP3
1230    vpxor $TMP3, $TMP0, $TMP0
1231___
1232  };
1233
1234  if ($aes256) {
1235    $code.=<<___;
1236.globl aes256gcmsiv_dec
1237.type aes256gcmsiv_dec,\@function,6
1238.align 16
1239aes256gcmsiv_dec:
1240___
1241  } else {
1242    $code.=<<___;
1243.globl aes128gcmsiv_dec
1244.type aes128gcmsiv_dec,\@function,6
1245.align 16
1246aes128gcmsiv_dec:
1247___
1248  }
1249
1250  $code.=<<___;
1251.cfi_startproc
1252    _CET_ENDBR
1253    test \$~15, $LEN
1254    jnz .L${labelPrefix}_dec_start
1255    ret
1256
1257.L${labelPrefix}_dec_start:
1258    vzeroupper
1259    vmovdqa ($POL), $T
1260    # The claimed tag is provided after the current calculated tag value.
1261    # The counter blocks are derived from it.
1262    vmovdqu 16($POL), $CTR
1263    vpor OR_MASK(%rip), $CTR, $CTR      # CTR = [1]TAG[126...32][00..00]
1264    movq $POL, $secureBuffer
1265
1266    leaq 32($secureBuffer), $secureBuffer
1267    leaq 32($Htbl), $Htbl
1268
1269    andq \$~15, $LEN
1270
1271    # If fewer than 6 blocks remain, process them one block at a time
1272    cmp \$96, $LEN
1273    jb .L${labelPrefix}_dec_loop2
1274
1275    # Decrypt the first six blocks
1276    sub \$96, $LEN
1277    vmovdqa $CTR, $CTR1
1278    vpaddd one(%rip), $CTR1, $CTR2
1279    vpaddd two(%rip), $CTR1, $CTR3
1280    vpaddd one(%rip), $CTR3, $CTR4
1281    vpaddd two(%rip), $CTR3, $CTR5
1282    vpaddd one(%rip), $CTR5, $CTR6
1283    vpaddd two(%rip), $CTR5, $CTR
1284
1285    vpxor ($KS), $CTR1, $CTR1
1286    vpxor ($KS), $CTR2, $CTR2
1287    vpxor ($KS), $CTR3, $CTR3
1288    vpxor ($KS), $CTR4, $CTR4
1289    vpxor ($KS), $CTR5, $CTR5
1290    vpxor ($KS), $CTR6, $CTR6
1291
1292    ${\$aes_round_dec->(1)}
1293    ${\$aes_round_dec->(2)}
1294    ${\$aes_round_dec->(3)}
1295    ${\$aes_round_dec->(4)}
1296    ${\$aes_round_dec->(5)}
1297    ${\$aes_round_dec->(6)}
1298    ${\$aes_round_dec->(7)}
1299    ${\$aes_round_dec->(8)}
1300    ${\$aes_round_dec->(9)}
1301___
1302
1303if ($aes256) {
1304$code.=<<___;
1305    ${\$aes_round_dec->(10)}
1306    ${\$aes_round_dec->(11)}
1307    ${\$aes_round_dec->(12)}
1308    ${\$aes_round_dec->(13)}
1309    ${\$aes_lastround_dec->(14)}
1310___
1311} else {
1312$code.=<<___;
1313    ${\$aes_lastround_dec->(10)}
1314___
1315}
1316
1317$code.=<<___;
1318    # XOR with CT
1319    vpxor 0*16($CT), $CTR1, $CTR1
1320    vpxor 1*16($CT), $CTR2, $CTR2
1321    vpxor 2*16($CT), $CTR3, $CTR3
1322    vpxor 3*16($CT), $CTR4, $CTR4
1323    vpxor 4*16($CT), $CTR5, $CTR5
1324    vpxor 5*16($CT), $CTR6, $CTR6
1325
1326    vmovdqu $CTR1, 0*16($PT)
1327    vmovdqu $CTR2, 1*16($PT)
1328    vmovdqu $CTR3, 2*16($PT)
1329    vmovdqu $CTR4, 3*16($PT)
1330    vmovdqu $CTR5, 4*16($PT)
1331    vmovdqu $CTR6, 5*16($PT)
1332
1333    addq \$96, $CT
1334    addq \$96, $PT
1335    jmp .L${labelPrefix}_dec_loop1
1336
1337# Decrypt 6 blocks each time while hashing previous 6 blocks
1338.align 64
1339.L${labelPrefix}_dec_loop1:
1340    cmp \$96, $LEN
1341    jb .L${labelPrefix}_dec_finish_96
1342    sub \$96, $LEN
1343
1344    vmovdqa $CTR6, $TMP5
1345    vmovdqa $CTR5, 1*16-32($secureBuffer)
1346    vmovdqa $CTR4, 2*16-32($secureBuffer)
1347    vmovdqa $CTR3, 3*16-32($secureBuffer)
1348    vmovdqa $CTR2, 4*16-32($secureBuffer)
1349    vmovdqa $CTR1, 5*16-32($secureBuffer)
1350
1351    vmovdqa $CTR, $CTR1
1352    vpaddd one(%rip), $CTR1, $CTR2
1353    vpaddd two(%rip), $CTR1, $CTR3
1354    vpaddd one(%rip), $CTR3, $CTR4
1355    vpaddd two(%rip), $CTR3, $CTR5
1356    vpaddd one(%rip), $CTR5, $CTR6
1357    vpaddd two(%rip), $CTR5, $CTR
1358
1359    vmovdqa ($KS), $TMP3
1360    vpxor $TMP3, $CTR1, $CTR1
1361    vpxor $TMP3, $CTR2, $CTR2
1362    vpxor $TMP3, $CTR3, $CTR3
1363    vpxor $TMP3, $CTR4, $CTR4
1364    vpxor $TMP3, $CTR5, $CTR5
1365    vpxor $TMP3, $CTR6, $CTR6
1366
1367    vmovdqu 0*16-32($Htbl), $TMP3
1368    vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
1369    vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
1370    vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0
1371    vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3
1372    vpxor $TMP3, $TMP0, $TMP0
1373
1374    ${\$aes_round_dec->(1)}
1375    ${\$schoolbook->(1)}
1376
1377    ${\$aes_round_dec->(2)}
1378    ${\$schoolbook->(2)}
1379
1380    ${\$aes_round_dec->(3)}
1381    ${\$schoolbook->(3)}
1382
1383    ${\$aes_round_dec->(4)}
1384    ${\$schoolbook->(4)}
1385
1386    ${\$aes_round_dec->(5)}
1387    ${\$aes_round_dec->(6)}
1388    ${\$aes_round_dec->(7)}
1389
1390    vmovdqa 5*16-32($secureBuffer), $TMP5
1391    vpxor $T, $TMP5, $TMP5
1392    vmovdqu 5*16-32($Htbl), $TMP4
1393
1394    vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
1395    vpxor $TMP3, $TMP0, $TMP0
1396    vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
1397    vpxor $TMP3, $TMP1, $TMP1
1398    vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
1399    vpxor $TMP3, $TMP2, $TMP2
1400    vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
1401    vpxor $TMP3, $TMP0, $TMP0
1402
1403    ${\$aes_round_dec->(8)}
1404
1405    vpsrldq \$8, $TMP0, $TMP3
1406    vpxor $TMP3, $TMP1, $TMP4
1407    vpslldq \$8, $TMP0, $TMP3
1408    vpxor $TMP3, $TMP2, $T
1409
1410    vmovdqa poly(%rip), $TMP2
1411
1412    ${\$aes_round_dec->(9)}
1413___
1414
1415if ($aes256) {
1416$code.=<<___;
1417    ${\$aes_round_dec->(10)}
1418    ${\$aes_round_dec->(11)}
1419    ${\$aes_round_dec->(12)}
1420    ${\$aes_round_dec->(13)}
1421    vmovdqu 14*16($KS), $TMP5
1422___
1423} else {
1424$code.=<<___;
1425    vmovdqu 10*16($KS), $TMP5
1426___
1427}
1428
1429$code.=<<___;
1430    vpalignr \$8, $T, $T, $TMP1
1431    vpclmulqdq \$0x10, $TMP2, $T, $T
1432    vpxor $T, $TMP1, $T
1433
1434    vpxor 0*16($CT), $TMP5, $TMP3
1435    vaesenclast $TMP3, $CTR1, $CTR1
1436    vpxor 1*16($CT), $TMP5, $TMP3
1437    vaesenclast $TMP3, $CTR2, $CTR2
1438    vpxor 2*16($CT), $TMP5, $TMP3
1439    vaesenclast $TMP3, $CTR3, $CTR3
1440    vpxor 3*16($CT), $TMP5, $TMP3
1441    vaesenclast $TMP3, $CTR4, $CTR4
1442    vpxor 4*16($CT), $TMP5, $TMP3
1443    vaesenclast $TMP3, $CTR5, $CTR5
1444    vpxor 5*16($CT), $TMP5, $TMP3
1445    vaesenclast $TMP3, $CTR6, $CTR6
1446
1447    vpalignr \$8, $T, $T, $TMP1
1448    vpclmulqdq \$0x10, $TMP2, $T, $T
1449    vpxor $T, $TMP1, $T
1450
1451    vmovdqu $CTR1, 0*16($PT)
1452    vmovdqu $CTR2, 1*16($PT)
1453    vmovdqu $CTR3, 2*16($PT)
1454    vmovdqu $CTR4, 3*16($PT)
1455    vmovdqu $CTR5, 4*16($PT)
1456    vmovdqu $CTR6, 5*16($PT)
1457
1458    vpxor $TMP4, $T, $T
1459
1460    lea 96($CT), $CT
1461    lea 96($PT), $PT
1462    jmp .L${labelPrefix}_dec_loop1
1463
1464.L${labelPrefix}_dec_finish_96:
1465    vmovdqa $CTR6, $TMP5
1466    vmovdqa $CTR5, 1*16-32($secureBuffer)
1467    vmovdqa $CTR4, 2*16-32($secureBuffer)
1468    vmovdqa $CTR3, 3*16-32($secureBuffer)
1469    vmovdqa $CTR2, 4*16-32($secureBuffer)
1470    vmovdqa $CTR1, 5*16-32($secureBuffer)
1471
1472    vmovdqu 0*16-32($Htbl), $TMP3
1473    vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0
1474    vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
1475    vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
1476    vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3
1477    vpxor $TMP3, $TMP0, $TMP0
1478
1479    ${\$schoolbook->(1)}
1480    ${\$schoolbook->(2)}
1481    ${\$schoolbook->(3)}
1482    ${\$schoolbook->(4)}
1483
1484    vmovdqu 5*16-32($secureBuffer), $TMP5
1485    vpxor $T, $TMP5, $TMP5
1486    vmovdqu 5*16-32($Htbl), $TMP4
1487    vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
1488    vpxor $TMP3, $TMP1, $TMP1
1489    vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
1490    vpxor $TMP3, $TMP2, $TMP2
1491    vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
1492    vpxor $TMP3, $TMP0, $TMP0
1493    vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
1494    vpxor $TMP3, $TMP0, $TMP0
1495
1496    vpsrldq \$8, $TMP0, $TMP3
1497    vpxor $TMP3, $TMP1, $TMP4
1498    vpslldq \$8, $TMP0, $TMP3
1499    vpxor $TMP3, $TMP2, $T
1500
1501    vmovdqa poly(%rip), $TMP2
1502
1503    vpalignr \$8, $T, $T, $TMP1
1504    vpclmulqdq \$0x10, $TMP2, $T, $T
1505    vpxor $T, $TMP1, $T
1506
1507    vpalignr \$8, $T, $T, $TMP1
1508    vpclmulqdq \$0x10, $TMP2, $T, $T
1509    vpxor $T, $TMP1, $T
1510
1511    vpxor $TMP4, $T, $T
1512
1513.L${labelPrefix}_dec_loop2:
1514    # Decrypt any remaining whole blocks one at a time.
1515
1516    # Exit once no whole blocks remain.
1517    cmp \$16, $LEN
1518    jb .L${labelPrefix}_dec_out
1519    sub \$16, $LEN
1520
1521    vmovdqa $CTR, $TMP1
1522    vpaddd one(%rip), $CTR, $CTR
1523
1524    vpxor 0*16($KS), $TMP1, $TMP1
1525    vaesenc 1*16($KS), $TMP1, $TMP1
1526    vaesenc 2*16($KS), $TMP1, $TMP1
1527    vaesenc 3*16($KS), $TMP1, $TMP1
1528    vaesenc 4*16($KS), $TMP1, $TMP1
1529    vaesenc 5*16($KS), $TMP1, $TMP1
1530    vaesenc 6*16($KS), $TMP1, $TMP1
1531    vaesenc 7*16($KS), $TMP1, $TMP1
1532    vaesenc 8*16($KS), $TMP1, $TMP1
1533    vaesenc 9*16($KS), $TMP1, $TMP1
1534___
1535if ($aes256) {
1536$code.=<<___;
1537    vaesenc 10*16($KS), $TMP1, $TMP1
1538    vaesenc 11*16($KS), $TMP1, $TMP1
1539    vaesenc 12*16($KS), $TMP1, $TMP1
1540    vaesenc 13*16($KS), $TMP1, $TMP1
1541    vaesenclast 14*16($KS), $TMP1, $TMP1
1542___
1543} else {
1544$code.=<<___;
1545    vaesenclast 10*16($KS), $TMP1, $TMP1
1546___
1547}
1548
1549$code.=<<___;
1550    vpxor ($CT), $TMP1, $TMP1
1551    vmovdqu $TMP1, ($PT)
1552    addq \$16, $CT
1553    addq \$16, $PT
1554
1555    vpxor $TMP1, $T, $T
1556    vmovdqa -32($Htbl), $TMP0
1557    call GFMUL
1558
1559    jmp .L${labelPrefix}_dec_loop2
1560
1561.L${labelPrefix}_dec_out:
1562    vmovdqu $T, ($POL)
1563    ret
1564.cfi_endproc
1565___
1566
1567  if ($aes256) {
1568    $code.=<<___;
1569.size aes256gcmsiv_dec, .-aes256gcmsiv_dec
1570___
1571  } else {
1572    $code.=<<___;
1573.size aes128gcmsiv_dec, .-aes128gcmsiv_dec
1574___
1575  }
1576}
1577
1578aesgcmsiv_dec(0);  # emit 128-bit version
1579
1580sub aes128gcmsiv_ecb_enc_block {
1581  my $STATE_1 = "%xmm1";
1582  my $KSp = "%rdx";
1583
1584  # parameter 1: PT            %rdi    (pointer to 128 bit)
1585  # parameter 2: CT            %rsi    (pointer to 128 bit)
1586  # parameter 3: ks            %rdx    (pointer to ks)
1587  $code.=<<___;
1588.globl aes128gcmsiv_ecb_enc_block
1589.type aes128gcmsiv_ecb_enc_block,\@function,3
1590.align 16
1591aes128gcmsiv_ecb_enc_block:
1592.cfi_startproc
1593    _CET_ENDBR
1594    vmovdqa (%rdi), $STATE_1
1595
1596    vpxor       ($KSp), $STATE_1, $STATE_1
1597    vaesenc 1*16($KSp), $STATE_1, $STATE_1
1598    vaesenc 2*16($KSp), $STATE_1, $STATE_1
1599    vaesenc 3*16($KSp), $STATE_1, $STATE_1
1600    vaesenc 4*16($KSp), $STATE_1, $STATE_1
1601    vaesenc 5*16($KSp), $STATE_1, $STATE_1
1602    vaesenc 6*16($KSp), $STATE_1, $STATE_1
1603    vaesenc 7*16($KSp), $STATE_1, $STATE_1
1604    vaesenc 8*16($KSp), $STATE_1, $STATE_1
1605    vaesenc 9*16($KSp), $STATE_1, $STATE_1
1606    vaesenclast 10*16($KSp), $STATE_1, $STATE_1    # STATE_1 == IV
1607
1608    vmovdqa $STATE_1, (%rsi)
1609
1610    ret
1611.cfi_endproc
1612.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
1613___
1614}
1615aes128gcmsiv_ecb_enc_block();
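# Intrinsics equivalent of the single-block helper above (a sketch; "ks"
# points at the eleven round keys written by aes128gcmsiv_aes_ks, and the
# AES-NI intrinsics come from <immintrin.h> as in the earlier notes):
#
#   static __m128i aes128_ecb_enc_block_ref(__m128i pt, const __m128i ks[11]) {
#     __m128i s = _mm_xor_si128(pt, ks[0]);
#     for (int i = 1; i < 10; i++) {
#       s = _mm_aesenc_si128(s, ks[i]);
#     }
#     return _mm_aesenclast_si128(s, ks[10]);
#   }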
1616
1617sub aes256gcmsiv_aes_ks_enc_x1 {
1618  my $KS = "%rdx";
1619  my $KEYp = "%rcx";
1620  my $CON_MASK = "%xmm0";
1621  my $MASK_256 = "%xmm15";
1622  my $KEY_1 = "%xmm1";
1623  my $KEY_2 = "%xmm3";
1624  my $BLOCK1 = "%xmm8";
1625  my $AUX_REG = "%xmm14";
1626  my $PT = "%rdi";
1627  my $CT = "%rsi";
1628
1629  my $round_double = sub {
1630    my ($i, $j) = @_;
1631    return <<___;
1632    vpshufb %xmm15, %xmm3, %xmm2
1633    vaesenclast %xmm0, %xmm2, %xmm2
1634    vpslld \$1, %xmm0, %xmm0
1635    vpslldq \$4, %xmm1, %xmm4
1636    vpxor %xmm4, %xmm1, %xmm1
1637    vpslldq \$4, %xmm4, %xmm4
1638    vpxor %xmm4, %xmm1, %xmm1
1639    vpslldq \$4, %xmm4, %xmm4
1640    vpxor %xmm4, %xmm1, %xmm1
1641    vpxor %xmm2, %xmm1, %xmm1
1642    vaesenc %xmm1, $BLOCK1, $BLOCK1
1643    vmovdqu %xmm1, ${\eval(16*$i)}($KS)
1644
1645    vpshufd \$0xff, %xmm1, %xmm2
1646    vaesenclast %xmm14, %xmm2, %xmm2
1647    vpslldq \$4, %xmm3, %xmm4
1648    vpxor %xmm4, %xmm3, %xmm3
1649    vpslldq \$4, %xmm4, %xmm4
1650    vpxor %xmm4, %xmm3, %xmm3
1651    vpslldq \$4, %xmm4, %xmm4
1652    vpxor %xmm4, %xmm3, %xmm3
1653    vpxor %xmm2, %xmm3, %xmm3
1654    vaesenc %xmm3, $BLOCK1, $BLOCK1
1655    vmovdqu %xmm3, ${\eval(16*$j)}($KS)
1656___
1657  };
1658
1659  my $round_last = sub {
1660    my ($i) = @_;
1661    return <<___;
1662    vpshufb %xmm15, %xmm3, %xmm2
1663    vaesenclast %xmm0, %xmm2, %xmm2
1664    vpslldq \$4, %xmm1, %xmm4
1665    vpxor %xmm4, %xmm1, %xmm1
1666    vpslldq \$4, %xmm4, %xmm4
1667    vpxor %xmm4, %xmm1, %xmm1
1668    vpslldq \$4, %xmm4, %xmm4
1669    vpxor %xmm4, %xmm1, %xmm1
1670    vpxor %xmm2, %xmm1, %xmm1
1671    vaesenclast %xmm1, $BLOCK1, $BLOCK1
1672    vmovdqu %xmm1, ${\eval(16*$i)}($KS)
1673___
1674  };
1675
1676  # parameter 1: %rdi         Pointer to PT1
1677  # parameter 2: %rsi         Pointer to CT1
1678  # parameter 3: %rdx         Pointer to KS
1679  # parameter 4: %rcx         Pointer to initial key
1680  $code.=<<___;
1681.globl aes256gcmsiv_aes_ks_enc_x1
1682.type aes256gcmsiv_aes_ks_enc_x1,\@function,4
1683.align 16
1684aes256gcmsiv_aes_ks_enc_x1:
1685.cfi_startproc
1686    _CET_ENDBR
1687    vmovdqa con1(%rip), $CON_MASK    # CON_MASK  = 1,1,1,1
1688    vmovdqa mask(%rip), $MASK_256    # MASK_256
1689    vmovdqa ($PT), $BLOCK1
1690    vmovdqa ($KEYp), $KEY_1          # KEY_1 || KEY_2 [0..7] = user key
1691    vmovdqa 16($KEYp), $KEY_2
1692    vpxor $KEY_1, $BLOCK1, $BLOCK1
1693    vaesenc $KEY_2, $BLOCK1, $BLOCK1
1694    vmovdqu $KEY_1, ($KS)            # First round key
1695    vmovdqu $KEY_2, 16($KS)
1696    vpxor $AUX_REG, $AUX_REG, $AUX_REG
1697
1698    ${\$round_double->(2, 3)}
1699    ${\$round_double->(4, 5)}
1700    ${\$round_double->(6, 7)}
1701    ${\$round_double->(8, 9)}
1702    ${\$round_double->(10, 11)}
1703    ${\$round_double->(12, 13)}
1704    ${\$round_last->(14)}
1705    vmovdqa $BLOCK1, ($CT)
1706    ret
1707.cfi_endproc
1708.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
1709___
1710}
1711aes256gcmsiv_aes_ks_enc_x1();
1712
1713sub aes256gcmsiv_ecb_enc_block {
1714  my $STATE_1 = "%xmm1";
1715  my $PT = "%rdi";
1716  my $CT = "%rsi";
1717  my $KSp = "%rdx";
1718
1719  # parameter 1: PT            %rdi    (pointer to 128 bit)
1720  # parameter 2: CT            %rsi    (pointer to 128 bit)
1721  # parameter 3: ks            %rdx    (pointer to ks)
1722  $code.=<<___;
1723.globl aes256gcmsiv_ecb_enc_block
1724.type aes256gcmsiv_ecb_enc_block,\@function,3
1725.align 16
1726aes256gcmsiv_ecb_enc_block:
1727.cfi_startproc
1728    _CET_ENDBR
1729    vmovdqa (%rdi), $STATE_1
1730    vpxor ($KSp), $STATE_1, $STATE_1
1731    vaesenc 1*16($KSp), $STATE_1, $STATE_1
1732    vaesenc 2*16($KSp), $STATE_1, $STATE_1
1733    vaesenc 3*16($KSp), $STATE_1, $STATE_1
1734    vaesenc 4*16($KSp), $STATE_1, $STATE_1
1735    vaesenc 5*16($KSp), $STATE_1, $STATE_1
1736    vaesenc 6*16($KSp), $STATE_1, $STATE_1
1737    vaesenc 7*16($KSp), $STATE_1, $STATE_1
1738    vaesenc 8*16($KSp), $STATE_1, $STATE_1
1739    vaesenc 9*16($KSp), $STATE_1, $STATE_1
1740    vaesenc 10*16($KSp), $STATE_1, $STATE_1
1741    vaesenc 11*16($KSp), $STATE_1, $STATE_1
1742    vaesenc 12*16($KSp), $STATE_1, $STATE_1
1743    vaesenc 13*16($KSp), $STATE_1, $STATE_1
1744    vaesenclast 14*16($KSp), $STATE_1, $STATE_1    # $STATE_1 == IV
1745    vmovdqa $STATE_1, (%rsi)
1746    ret
1747.cfi_endproc
1748.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
1749___
1750}
1751aes256gcmsiv_ecb_enc_block();
1752
1753sub aes256gcmsiv_enc_msg_x4 {
1754  my $CTR1 = "%xmm0";
1755  my $CTR2 = "%xmm1";
1756  my $CTR3 = "%xmm2";
1757  my $CTR4 = "%xmm3";
1758  my $ADDER = "%xmm4";
1759
1760  my $STATE1 = "%xmm5";
1761  my $STATE2 = "%xmm6";
1762  my $STATE3 = "%xmm7";
1763  my $STATE4 = "%xmm8";
1764
1765  my $TMP = "%xmm12";
1766  my $TMP2 = "%xmm13";
1767  my $TMP3 = "%xmm14";
1768  my $IV = "%xmm15";
1769
1770  my $PT = "%rdi";
1771  my $CT = "%rsi";
1772  my $TAG = "%rdx";
1773  my $KS = "%rcx";
1774  my $LEN = "%r8";
1775
1776  my $aes_round = sub {
1777    my ($i) = @_;
1778    return <<___;
1779    vmovdqu ${\eval($i*16)}($KS), $TMP
1780    vaesenc $TMP, $STATE1, $STATE1
1781    vaesenc $TMP, $STATE2, $STATE2
1782    vaesenc $TMP, $STATE3, $STATE3
1783    vaesenc $TMP, $STATE4, $STATE4
1784___
1785  };
1786
1787  my $aes_lastround = sub {
1788    my ($i) = @_;
1789    return <<___;
1790    vmovdqu ${\eval($i*16)}($KS), $TMP
1791    vaesenclast $TMP, $STATE1, $STATE1
1792    vaesenclast $TMP, $STATE2, $STATE2
1793    vaesenclast $TMP, $STATE3, $STATE3
1794    vaesenclast $TMP, $STATE4, $STATE4
1795___
1796  };
1797
1798  # void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
1799  #                              unsigned char* TAG, unsigned char* KS,
1800  #                              size_t byte_len);
1801  # parameter 1: %rdi     #PT
1802  # parameter 2: %rsi     #CT
1803  # parameter 3: %rdx     #TAG  [127 126 ... 0]  IV=[127...32]
1804  # parameter 4: %rcx     #KS
1805  # parameter 5: %r8      #LEN MSG_length in bytes
1806  $code.=<<___;
1807.globl aes256gcmsiv_enc_msg_x4
1808.type aes256gcmsiv_enc_msg_x4,\@function,5
1809.align 16
1810aes256gcmsiv_enc_msg_x4:
1811.cfi_startproc
1812    _CET_ENDBR
1813    test $LEN, $LEN
1814    jnz .L256_enc_msg_x4_start
1815    ret
1816
1817.L256_enc_msg_x4_start:
1818    movq $LEN, %r10
1819    shrq \$4, $LEN                       # LEN = num of blocks
1820    shlq \$60, %r10
1821    jz .L256_enc_msg_x4_start2
1822    addq \$1, $LEN
1823
1824.L256_enc_msg_x4_start2:
1825    movq $LEN, %r10
1826    shlq \$62, %r10
1827    shrq \$62, %r10
1828
1829    # make IV from TAG
1830    vmovdqa ($TAG), $IV
1831    vpor OR_MASK(%rip), $IV, $IV        # IV = [1]TAG[126...32][00..00]
1832
1833    vmovdqa four(%rip), $ADDER          # Register to increment counters
1834    vmovdqa $IV, $CTR1                  # CTR1 = TAG[1][127...32][00..00]
1835    vpaddd one(%rip), $IV, $CTR2        # CTR2 = TAG[1][127...32][00..01]
1836    vpaddd two(%rip), $IV, $CTR3        # CTR3 = TAG[1][127...32][00..02]
1837    vpaddd three(%rip), $IV, $CTR4      # CTR4 = TAG[1][127...32][00..03]
1838
1839    shrq \$2, $LEN
1840    je .L256_enc_msg_x4_check_remainder
1841
1842    subq \$64, $CT
1843    subq \$64, $PT
1844
1845.L256_enc_msg_x4_loop1:
1846    addq \$64, $CT
1847    addq \$64, $PT
1848
1849    vmovdqa $CTR1, $STATE1
1850    vmovdqa $CTR2, $STATE2
1851    vmovdqa $CTR3, $STATE3
1852    vmovdqa $CTR4, $STATE4
1853
1854    vpxor ($KS), $STATE1, $STATE1
1855    vpxor ($KS), $STATE2, $STATE2
1856    vpxor ($KS), $STATE3, $STATE3
1857    vpxor ($KS), $STATE4, $STATE4
1858
1859    ${\$aes_round->(1)}
1860    vpaddd $ADDER, $CTR1, $CTR1
1861    ${\$aes_round->(2)}
1862    vpaddd $ADDER, $CTR2, $CTR2
1863    ${\$aes_round->(3)}
1864    vpaddd $ADDER, $CTR3, $CTR3
1865    ${\$aes_round->(4)}
1866    vpaddd $ADDER, $CTR4, $CTR4
1867
1868    ${\$aes_round->(5)}
1869    ${\$aes_round->(6)}
1870    ${\$aes_round->(7)}
1871    ${\$aes_round->(8)}
1872    ${\$aes_round->(9)}
1873    ${\$aes_round->(10)}
1874    ${\$aes_round->(11)}
1875    ${\$aes_round->(12)}
1876    ${\$aes_round->(13)}
1877    ${\$aes_lastround->(14)}
1878
1879    # XOR with Plaintext
1880    vpxor 0*16($PT), $STATE1, $STATE1
1881    vpxor 1*16($PT), $STATE2, $STATE2
1882    vpxor 2*16($PT), $STATE3, $STATE3
1883    vpxor 3*16($PT), $STATE4, $STATE4
1884
1885    subq \$1, $LEN
1886
1887    vmovdqu $STATE1, 0*16($CT)
1888    vmovdqu $STATE2, 1*16($CT)
1889    vmovdqu $STATE3, 2*16($CT)
1890    vmovdqu $STATE4, 3*16($CT)
1891
1892    jne .L256_enc_msg_x4_loop1
1893
1894    addq \$64, $CT
1895    addq \$64, $PT
1896
1897.L256_enc_msg_x4_check_remainder:
1898    cmpq \$0, %r10
1899    je .L256_enc_msg_x4_out
1900
1901.L256_enc_msg_x4_loop2:
1902    # encrypt each block separately
1903    # CTR1 holds the next counter value (even if the main loop was not executed)
1904
1905    vmovdqa $CTR1, $STATE1
1906    vpaddd one(%rip), $CTR1, $CTR1      # inc counter
1907    vpxor ($KS), $STATE1, $STATE1
1908    vaesenc 16($KS), $STATE1, $STATE1
1909    vaesenc 32($KS), $STATE1, $STATE1
1910    vaesenc 48($KS), $STATE1, $STATE1
1911    vaesenc 64($KS), $STATE1, $STATE1
1912    vaesenc 80($KS), $STATE1, $STATE1
1913    vaesenc 96($KS), $STATE1, $STATE1
1914    vaesenc 112($KS), $STATE1, $STATE1
1915    vaesenc 128($KS), $STATE1, $STATE1
1916    vaesenc 144($KS), $STATE1, $STATE1
1917    vaesenc 160($KS), $STATE1, $STATE1
1918    vaesenc 176($KS), $STATE1, $STATE1
1919    vaesenc 192($KS), $STATE1, $STATE1
1920    vaesenc 208($KS), $STATE1, $STATE1
1921    vaesenclast 224($KS), $STATE1, $STATE1
1922
1923    # XOR with Plaintext
1924    vpxor ($PT), $STATE1, $STATE1
1925
1926    vmovdqu $STATE1, ($CT)
1927
1928    addq \$16, $PT
1929    addq \$16, $CT
1930
1931    subq \$1, %r10
1932    jne .L256_enc_msg_x4_loop2
1933
1934.L256_enc_msg_x4_out:
1935    ret
1936.cfi_endproc
1937.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
1938___
1939}
1940aes256gcmsiv_enc_msg_x4();
1941
1942sub aes256gcmsiv_enc_msg_x8 {
1943  my $STATE1 = "%xmm1";
1944  my $STATE2 = "%xmm2";
1945  my $STATE3 = "%xmm3";
1946  my $STATE4 = "%xmm4";
1947  my $STATE5 = "%xmm5";
1948  my $STATE6 = "%xmm6";
1949  my $STATE7 = "%xmm7";
1950  my $STATE8 = "%xmm8";
1951  my $CTR1 = "%xmm0";
1952  my $CTR2 = "%xmm9";
1953  my $CTR3 = "%xmm10";
1954  my $CTR4 = "%xmm11";
1955  my $CTR5 = "%xmm12";
1956  my $CTR6 = "%xmm13";
1957  my $CTR7 = "%xmm14";
1958  my $TMP1 = "%xmm1";
1959  my $TMP2 = "%xmm2";
1960  my $KS = "%rcx";
1961  my $LEN = "%r8";
1962  my $PT = "%rdi";
1963  my $CT = "%rsi";
1964  my $TAG = "%rdx";
1965  my $SCHED = "%xmm15";
1966
1967  my $aes_round8 = sub {
1968    my ($i) = @_;
1969    return <<___;
1970    vmovdqu ${\eval($i*16)}($KS), $SCHED
1971    vaesenc $SCHED, $STATE1, $STATE1
1972    vaesenc $SCHED, $STATE2, $STATE2
1973    vaesenc $SCHED, $STATE3, $STATE3
1974    vaesenc $SCHED, $STATE4, $STATE4
1975    vaesenc $SCHED, $STATE5, $STATE5
1976    vaesenc $SCHED, $STATE6, $STATE6
1977    vaesenc $SCHED, $STATE7, $STATE7
1978    vaesenc $SCHED, $STATE8, $STATE8
1979___
1980  };
1981
1982  my $aes_lastround8 = sub {
1983    my ($i) = @_;
1984    return <<___;
1985    vmovdqu ${\eval($i*16)}($KS), $SCHED
1986    vaesenclast $SCHED, $STATE1, $STATE1
1987    vaesenclast $SCHED, $STATE2, $STATE2
1988    vaesenclast $SCHED, $STATE3, $STATE3
1989    vaesenclast $SCHED, $STATE4, $STATE4
1990    vaesenclast $SCHED, $STATE5, $STATE5
1991    vaesenclast $SCHED, $STATE6, $STATE6
1992    vaesenclast $SCHED, $STATE7, $STATE7
1993    vaesenclast $SCHED, $STATE8, $STATE8
1994___
1995  };
1996
1997  # void ENC_MSG_x8(unsigned char* PT,
1998  #                 unsigned char* CT,
1999  #                 unsigned char* TAG,
2000  #                 unsigned char* KS,
2001  #                 size_t byte_len);
2002  # parameter 1: %rdi     #PT
2003  # parameter 2: %rsi     #CT
2004  # parameter 3: %rdx     #TAG        [127 126 ... 0]  IV=[127...32]
2005  # parameter 4: %rcx     #KS
2006  # parameter 5: %r8      #LEN MSG_length in bytes
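  # Same approach as the four-block version above, but eight counter blocks are
  # processed per iteration. Seven counters live in xmm registers; the eighth is
  # kept in a small stack slot because all sixteen xmm registers are otherwise
  # occupied by the eight AES states, seven counters and the round key.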
2007  $code.=<<___;
2008.globl aes256gcmsiv_enc_msg_x8
2009.type aes256gcmsiv_enc_msg_x8,\@function,5
2010.align 16
2011aes256gcmsiv_enc_msg_x8:
2012.cfi_startproc
2013    _CET_ENDBR
2014    test $LEN, $LEN
2015    jnz .L256_enc_msg_x8_start
2016    ret
2017
2018.L256_enc_msg_x8_start:
2019    # Place in stack
2020    movq %rsp, %r11
2021    subq \$16, %r11
2022    andq \$-64, %r11
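    # r11 = 64-byte-aligned, 16-byte scratch slot below rsp, used to spill the eighth counter (CTR8)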
2023
2024    movq $LEN, %r10
2025    shrq \$4, $LEN                       # LEN = num of blocks
2026    shlq \$60, %r10
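    # the low four bits of byte_len are now the top bits of r10; ZF is set iff the length is a multiple of 16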
2027    jz .L256_enc_msg_x8_start2
2028    addq \$1, $LEN
2029
2030.L256_enc_msg_x8_start2:
2031    movq $LEN, %r10
2032    shlq \$61, %r10
2033    shrq \$61, %r10
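    # r10 = block count mod 8: the blocks encrypted one at a time after the main loop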
2034
2035    # Make IV from TAG
2036    vmovdqa ($TAG), $TMP1
2037    vpor OR_MASK(%rip), $TMP1, $TMP1    # TMP1= IV = [1]TAG[126...32][00..00]
2038
2039    # store counter8 on the stack
2040    vpaddd seven(%rip), $TMP1, $CTR1
2041    vmovdqa $CTR1, (%r11)                # CTR8 = TAG[127...32][00..07]
2042    vpaddd one(%rip), $TMP1, $CTR2       # CTR2 = TAG[127...32][00..01]
2043    vpaddd two(%rip), $TMP1, $CTR3       # CTR3 = TAG[127...32][00..02]
2044    vpaddd three(%rip), $TMP1, $CTR4     # CTR4 = TAG[127...32][00..03]
2045    vpaddd four(%rip), $TMP1, $CTR5      # CTR5 = TAG[127...32][00..04]
2046    vpaddd five(%rip), $TMP1, $CTR6      # CTR6 = TAG[127...32][00..05]
2047    vpaddd six(%rip), $TMP1, $CTR7       # CTR7 = TAG[127...32][00..06]
2048    vmovdqa $TMP1, $CTR1                 # CTR1 = TAG[127...32][00..00]
2049
2050    shrq \$3, $LEN
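    # LEN now counts full eight-block iterations of the main loop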
2051    jz .L256_enc_msg_x8_check_remainder
2052
2053    subq \$128, $CT
2054    subq \$128, $PT
2055
2056.L256_enc_msg_x8_loop1:
2057    addq \$128, $CT
2058    addq \$128, $PT
2059
2060    vmovdqa $CTR1, $STATE1
2061    vmovdqa $CTR2, $STATE2
2062    vmovdqa $CTR3, $STATE3
2063    vmovdqa $CTR4, $STATE4
2064    vmovdqa $CTR5, $STATE5
2065    vmovdqa $CTR6, $STATE6
2066    vmovdqa $CTR7, $STATE7
2067    # move from stack
2068    vmovdqa (%r11), $STATE8
2069
2070    vpxor ($KS), $STATE1, $STATE1
2071    vpxor ($KS), $STATE2, $STATE2
2072    vpxor ($KS), $STATE3, $STATE3
2073    vpxor ($KS), $STATE4, $STATE4
2074    vpxor ($KS), $STATE5, $STATE5
2075    vpxor ($KS), $STATE6, $STATE6
2076    vpxor ($KS), $STATE7, $STATE7
2077    vpxor ($KS), $STATE8, $STATE8
2078
2079    ${\$aes_round8->(1)}
2080    vmovdqa (%r11), $CTR7                # deal with CTR8
2081    vpaddd eight(%rip), $CTR7, $CTR7
2082    vmovdqa $CTR7, (%r11)
2083    ${\$aes_round8->(2)}
2084    vpsubd one(%rip), $CTR7, $CTR7
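    # CTR7 was borrowed to advance the spilled CTR8 by eight; since CTR8 = CTR7 + 1, subtracting one leaves CTR7 advanced by eight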
2085    ${\$aes_round8->(3)}
2086    vpaddd eight(%rip), $CTR1, $CTR1
2087    ${\$aes_round8->(4)}
2088    vpaddd eight(%rip), $CTR2, $CTR2
2089    ${\$aes_round8->(5)}
2090    vpaddd eight(%rip), $CTR3, $CTR3
2091    ${\$aes_round8->(6)}
2092    vpaddd eight(%rip), $CTR4, $CTR4
2093    ${\$aes_round8->(7)}
2094    vpaddd eight(%rip), $CTR5, $CTR5
2095    ${\$aes_round8->(8)}
2096    vpaddd eight(%rip), $CTR6, $CTR6
2097    ${\$aes_round8->(9)}
2098    ${\$aes_round8->(10)}
2099    ${\$aes_round8->(11)}
2100    ${\$aes_round8->(12)}
2101    ${\$aes_round8->(13)}
2102    ${\$aes_lastround8->(14)}
2103
2104    # XOR with Plaintext
2105    vpxor 0*16($PT), $STATE1, $STATE1
2106    vpxor 1*16($PT), $STATE2, $STATE2
2107    vpxor 2*16($PT), $STATE3, $STATE3
2108    vpxor 3*16($PT), $STATE4, $STATE4
2109    vpxor 4*16($PT), $STATE5, $STATE5
2110    vpxor 5*16($PT), $STATE6, $STATE6
2111    vpxor 6*16($PT), $STATE7, $STATE7
2112    vpxor 7*16($PT), $STATE8, $STATE8
2113
2114    subq \$1, $LEN
2115
2116    vmovdqu $STATE1, 0*16($CT)
2117    vmovdqu $STATE2, 1*16($CT)
2118    vmovdqu $STATE3, 2*16($CT)
2119    vmovdqu $STATE4, 3*16($CT)
2120    vmovdqu $STATE5, 4*16($CT)
2121    vmovdqu $STATE6, 5*16($CT)
2122    vmovdqu $STATE7, 6*16($CT)
2123    vmovdqu $STATE8, 7*16($CT)
2124
2125    jne .L256_enc_msg_x8_loop1
2126
2127    addq \$128, $CT
2128    addq \$128, $PT
2129
2130.L256_enc_msg_x8_check_remainder:
2131    cmpq \$0, %r10
2132    je .L256_enc_msg_x8_out
2133
2134.L256_enc_msg_x8_loop2:
2135    # encrypt each block separately
2136    # CTR1 is the highest counter (even if no LOOP done)
2137    vmovdqa $CTR1, $STATE1
2138    vpaddd one(%rip), $CTR1, $CTR1
2139
2140    vpxor ($KS), $STATE1, $STATE1
2141    vaesenc 16($KS), $STATE1, $STATE1
2142    vaesenc 32($KS), $STATE1, $STATE1
2143    vaesenc 48($KS), $STATE1, $STATE1
2144    vaesenc 64($KS), $STATE1, $STATE1
2145    vaesenc 80($KS), $STATE1, $STATE1
2146    vaesenc 96($KS), $STATE1, $STATE1
2147    vaesenc 112($KS), $STATE1, $STATE1
2148    vaesenc 128($KS), $STATE1, $STATE1
2149    vaesenc 144($KS), $STATE1, $STATE1
2150    vaesenc 160($KS), $STATE1, $STATE1
2151    vaesenc 176($KS), $STATE1, $STATE1
2152    vaesenc 192($KS), $STATE1, $STATE1
2153    vaesenc 208($KS), $STATE1, $STATE1
2154    vaesenclast 224($KS), $STATE1, $STATE1
2155
2156    # XOR with Plaintext
2157    vpxor ($PT), $STATE1, $STATE1
2158
2159    vmovdqu $STATE1, ($CT)
2160
2161    addq \$16, $PT
2162    addq \$16, $CT
2163    subq \$1, %r10
2164    jnz .L256_enc_msg_x8_loop2
2165
2166.L256_enc_msg_x8_out:
2167    ret
2168
2169.cfi_endproc
2170.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
2171___
2172}
2173aes256gcmsiv_enc_msg_x8();
2174aesgcmsiv_dec(1);
2175
2176sub aes256gcmsiv_kdf {
2177  my $ONE = "%xmm8";
2178  my $BLOCK1 = "%xmm4";
2179  my $BLOCK2 = "%xmm6";
2180  my $BLOCK3 = "%xmm7";
2181  my $BLOCK4 = "%xmm11";
2182  my $BLOCK5 = "%xmm12";
2183  my $BLOCK6 = "%xmm13";
2184
2185  my $enc_roundx6 = sub {
2186    my ($i, $j) = @_;
2187    return <<___;
2188    vmovdqa ${\eval($i*16)}(%rdx), $j
2189    vaesenc $j, $BLOCK1, $BLOCK1
2190    vaesenc $j, $BLOCK2, $BLOCK2
2191    vaesenc $j, $BLOCK3, $BLOCK3
2192    vaesenc $j, $BLOCK4, $BLOCK4
2193    vaesenc $j, $BLOCK5, $BLOCK5
2194    vaesenc $j, $BLOCK6, $BLOCK6
2195___
2196  };
2197
2198  my $enc_roundlastx6 = sub {
2199    my ($i, $j) = @_;
2200    return <<___;
2201    vmovdqa ${\eval($i*16)}(%rdx), $j
2202    vaesenclast $j, $BLOCK1, $BLOCK1
2203    vaesenclast $j, $BLOCK2, $BLOCK2
2204    vaesenclast $j, $BLOCK3, $BLOCK3
2205    vaesenclast $j, $BLOCK4, $BLOCK4
2206    vaesenclast $j, $BLOCK5, $BLOCK5
2207    vaesenclast $j, $BLOCK6, $BLOCK6
2208___
2209  };
2210
2211  # void aes256gcmsiv_kdf(const uint8_t nonce[16],
2212  #                       uint8_t *out_key_material,
2213  #                       const uint8_t *key_schedule);
2214  $code.=<<___;
2215.globl aes256gcmsiv_kdf
2216.type aes256gcmsiv_kdf,\@function,3
2217.align 16
2218aes256gcmsiv_kdf:
2219.cfi_startproc
2220    _CET_ENDBR
2221# parameter 1: %rdi                         Pointer to NONCE
2222# parameter 2: %rsi                         Pointer to the output key material
2223# parameter 3: %rdx                         Pointer to the key schedule
2224
2225    vmovdqa (%rdx), %xmm1                  # xmm1 = round key 0 (first 16 bytes of the key)
2226    vmovdqa 0*16(%rdi), $BLOCK1
2227    vmovdqa and_mask(%rip), $BLOCK4
2228    vmovdqa one(%rip), $ONE
2229    vpshufd \$0x90, $BLOCK1, $BLOCK1
2230    vpand $BLOCK4, $BLOCK1, $BLOCK1
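    # BLOCK1 = first KDF block: dword 0 is the 32-bit counter (cleared here), dwords 1..3 hold the 96-bit nonce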
2231    vpaddd $ONE, $BLOCK1, $BLOCK2
2232    vpaddd $ONE, $BLOCK2, $BLOCK3
2233    vpaddd $ONE, $BLOCK3, $BLOCK4
2234    vpaddd $ONE, $BLOCK4, $BLOCK5
2235    vpaddd $ONE, $BLOCK5, $BLOCK6
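    # BLOCK1..BLOCK6 now hold counter values 0 through 5, each alongside the nonce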
2236
2237    vpxor %xmm1, $BLOCK1, $BLOCK1
2238    vpxor %xmm1, $BLOCK2, $BLOCK2
2239    vpxor %xmm1, $BLOCK3, $BLOCK3
2240    vpxor %xmm1, $BLOCK4, $BLOCK4
2241    vpxor %xmm1, $BLOCK5, $BLOCK5
2242    vpxor %xmm1, $BLOCK6, $BLOCK6
2243
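    # round keys are loaded into xmm1 and xmm2 alternately, presumably so each key load can overlap the previous round of aesenc instructions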
2244    ${\$enc_roundx6->(1, "%xmm1")}
2245    ${\$enc_roundx6->(2, "%xmm2")}
2246    ${\$enc_roundx6->(3, "%xmm1")}
2247    ${\$enc_roundx6->(4, "%xmm2")}
2248    ${\$enc_roundx6->(5, "%xmm1")}
2249    ${\$enc_roundx6->(6, "%xmm2")}
2250    ${\$enc_roundx6->(7, "%xmm1")}
2251    ${\$enc_roundx6->(8, "%xmm2")}
2252    ${\$enc_roundx6->(9, "%xmm1")}
2253    ${\$enc_roundx6->(10, "%xmm2")}
2254    ${\$enc_roundx6->(11, "%xmm1")}
2255    ${\$enc_roundx6->(12, "%xmm2")}
2256    ${\$enc_roundx6->(13, "%xmm1")}
2257    ${\$enc_roundlastx6->(14, "%xmm2")}
2258
2259    vmovdqa $BLOCK1, 0*16(%rsi)
2260    vmovdqa $BLOCK2, 1*16(%rsi)
2261    vmovdqa $BLOCK3, 2*16(%rsi)
2262    vmovdqa $BLOCK4, 3*16(%rsi)
2263    vmovdqa $BLOCK5, 4*16(%rsi)
2264    vmovdqa $BLOCK6, 5*16(%rsi)
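    # all six encrypted counter blocks (96 bytes) are written to out_key_material, which must be
    # 16-byte aligned for vmovdqa; the caller is expected to extract the derived keys from this buffer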
2265    ret
2266.cfi_endproc
2267.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
2268___
2269}
2270aes256gcmsiv_kdf();
2271
2272print $code;
2273
2274close STDOUT or die "error closing STDOUT: $!";
2275