• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env perl
2
3# Copyright (c) 2015, CloudFlare Ltd.
4#
5# Permission to use, copy, modify, and/or distribute this software for any
6# purpose with or without fee is hereby granted, provided that the above
7# copyright notice and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
12# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
14# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17##############################################################################
18#                                                                            #
19# Author:  Vlad Krasnov                                                      #
20#                                                                            #
21##############################################################################
22
# Command line: [flavour] output
# The first argument is the perlasm flavour and the second the output file.
# If the first argument contains a dot it is taken to be the output file and
# the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Set for nasm/masm/mingw64 flavours, or when the output file ends in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the x86_64-xlate.pl translator next to this script first, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# Fail loudly if the pipe cannot be started; otherwise the generated code
# would be silently lost.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Values above 1 make the entry points emit the BMI2+AVX2 dispatch check.
$avx = 2;
38
# Constant pool referenced (RIP-relative) by the generated code: the ChaCha20
# "expand 32-byte k" constant, the pshufb masks used for the 8- and 16-bit
# rotations, counter increments, the Poly1305 clamp mask and the and_masks
# table.
$code.=<<___;
.text
.extern OPENSSL_ia32cap_P

chacha20_poly1305_constants:

.align 64
.chacha20_consts:
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.rol8:
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.rol16:
.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
.avx2_init:
.long 0,0,0,0
.sse_inc:
.long 1,0,0,0
.avx2_inc:
.long 2,0,0,0,2,0,0,0
.clamp:
.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
.align 16
.and_masks:
___

# .and_masks holds sixteen 16-byte rows; row k (k = 1..16) has its first k
# bytes set to 0xff and the remaining bytes set to 0x00.
for my $ones (1 .. 16) {
    $code .= ".byte " . join(",", ("0xff") x $ones, ("0x00") x (16 - $ones)) . "\n";
}
83
# General purpose register assignment. Note that $adp and $itr1 both name
# %rcx, and $keyp and $t3 both name %r9.
my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=qw(%rdi %rsi %rbx %rcx %r9 %rcx %r8);
# Poly1305 accumulator limbs and multiplication temporaries.
my ($acc0,$acc1,$acc2)=("%r10","%r11","%r12");
my ($t0,$t1,$t2,$t3)=qw(%r13 %r14 %r15 %r9);
# Four ChaCha20 states, one row per register: A* = %xmm0-3, B* = %xmm4-7,
# C* = %xmm8-11, D* = %xmm12-15.
my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)
    =map(sprintf('%%xmm%d',$_),0..15);
# The temporaries alias the registers of state 3.
my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
# Ten consecutive 16-byte slots addressed off %rbp, slot i at i*16(%rbp).
my ($r_store,$s_store,$len_store,$state1_store,$state2_store,
    $tmp_store,$ctr0_store,$ctr1_store,$ctr2_store,$ctr3_store)
    =map(sprintf('%d*16(%%rbp)',$_),0..9);
99
# Append one ChaCha20 quarter-round step over a single state to $code.
#
#   $va,$vb,$vc,$vd  registers holding the four rows of the state
#   $vt              scratch register
#   $mode            string of options, matched by substring:
#                      "store"  save $vt to $tmp_store before the round
#                      "left"   rotate rows b/c/d by 4/8/12 bytes afterwards
#                      "right"  rotate rows b/c/d by 12/8/4 bytes afterwards
#                      "load"   reload $vt from $tmp_store at the end
sub chacha_qr {
    my ($va, $vb, $vc, $vd, $vt, $mode) = @_;
    my $sep = "\n" . (' ' x 8);

    # The two halves differ only in the pshufb mask and the shift pair used
    # for the rotation of row b.
    my @core;
    for my $half (['.rol16(%rip)', 12, 20], ['.rol8(%rip)', 7, 25]) {
        my ($mask, $lsh, $rsh) = @$half;
        push @core,
            "paddd $vb, $va",
            "pxor $va, $vd",
            "pshufb $mask, $vd",
            "paddd $vd, $vc",
            "pxor $vc, $vb",
            "movdqa $vb, $vt",
            "pslld \$$lsh, $vt",
            "psrld \$$rsh, $vb",
            "pxor $vt, $vb";
    }

    my @rotations;
    push @rotations, [4, 8, 12] if $mode =~ /left/;
    push @rotations, [12, 8, 4] if $mode =~ /right/;

    $code .= "movdqa $vt, $tmp_store\n" if $mode =~ /store/;
    $code .= join($sep, @core) . "\n";
    for my $rot (@rotations) {
        $code .= join($sep,
            "palignr \$$rot->[0], $vb, $vb",
            "palignr \$$rot->[1], $vc, $vc",
            "palignr \$$rot->[2], $vd, $vd") . "\n";
    }
    $code .= "movdqa $tmp_store, $vt\n" if $mode =~ /load/;
}
129
# Append code that adds the 16-byte block at memory operand $mem into the
# Poly1305 accumulator ($acc0:$acc1), and adds 1 plus the carry into $acc2.
sub poly_add {
    my ($mem) = @_;
    my @insns = (
        "add $mem, $acc0",
        "adc 8+$mem, $acc1",
        "adc \$1, $acc2",
    );
    $code .= join("\n" . (' ' x 8), @insns) . "\n";
}
136
# First stage of the Poly1305 multiply: multiply the accumulator by the low
# 64 bits of r (the quadword at 0+$r_store).
# Afterwards $t0 holds the low half of r0*acc0, $t1 the high half of that
# product plus the low half of r0*acc1, and $t2 holds r0*acc2 plus the high
# half of r0*acc1 plus carry.
sub poly_stage1 {
    my @insns = (
        "mov 0+$r_store, %rax",
        "mov %rax, $t2",
        "mul $acc0",
        "mov %rax, $t0",
        "mov %rdx, $t1",
        "mov 0+$r_store, %rax",
        "mul $acc1",
        "imulq $acc2, $t2",
        "add %rax, $t1",
        "adc %rdx, $t2",
    );
    $code .= join("\n" . (' ' x 8), @insns) . "\n";
}
149
# Second stage of the Poly1305 multiply: multiply by the high 64 bits of r
# (the quadword at 8+$r_store), which is also copied into $t3 for stage 3.
# The low half of r1*acc0 is added to $t1 and its high half (plus carry) is
# parked in $acc0; the low half of r1*acc1 is added to $t2 and its high half
# (plus carry) is left in %rdx for poly_stage3.
sub poly_stage2 {
    my @insns = (
        "mov 8+$r_store, %rax",
        "mov %rax, $t3",
        "mul $acc0",
        "add %rax, $t1",
        "adc \$0, %rdx",
        "mov %rdx, $acc0",
        "mov 8+$r_store, %rax",
        "mul $acc1",
        "add %rax, $t2",
        "adc \$0, %rdx",
    );
    $code .= join("\n" . (' ' x 8), @insns) . "\n";
}
162
# Third stage of the Poly1305 multiply: $t3 (set by poly_stage2) is multiplied
# by $acc2, then the value parked in $acc0 is added to $t2 and %rdx plus carry
# is added to $t3.
sub poly_stage3 {
    my @insns = (
        "imulq $acc2, $t3",
        "add $acc0, $t2",
        "adc %rdx, $t3",
    );
    $code .= join("\n" . (' ' x 8), @insns) . "\n";
}
168
# Final stage of the Poly1305 multiply: fold the product in $t0..$t3 back
# into the accumulator.
# The accumulator becomes ($t0, $t1, $t2 & 3). The remaining high part
# ($t2 with its low two bits cleared, $t3) is then added twice: once as is
# and once shifted right by two bits.
sub poly_reduce_stage {
    my @insns = (
        # Low part of the product becomes the new accumulator.
        "mov $t0, $acc0",
        "mov $t1, $acc1",
        "mov $t2, $acc2",
        "and \$3, $acc2",
        # High part with the low two bits cleared, and the same shifted by 2.
        "mov $t2, $t0",
        "and \$-4, $t0",
        "mov $t3, $t1",
        "shrd \$2, $t3, $t2",
        "shr \$2, $t3",
        # Add both back in.
        "add $t0, $acc0",
        "adc $t1, $acc1",
        "adc \$0, $acc2",
        "add $t2, $acc0",
        "adc $t3, $acc1",
        "adc \$0, $acc2",
    );
    $code .= join("\n" . (' ' x 8), @insns) . "\n";
}
186
# Append a complete Poly1305 multiply-and-reduce of the accumulator by r:
# the three multiplication stages followed by the reduction, back to back.
sub poly_mul {
    for my $stage (\&poly_stage1, \&poly_stage2, \&poly_stage3, \&poly_reduce_stage) {
        $stage->();
    }
}
193
# Append code that loads $n fresh ChaCha20 states into registers.
#
#   $n  number of states to prepare; callers in this file pass 1..4.
#
# State 0 is loaded from the constants and from $state1_store/$state2_store,
# and its A/B/C rows are copied to states 1..$n-1. The counter rows are derived
# from $ctr0_store by adding .sse_inc once per state; the highest-numbered
# state gets the first increment and state 0 the last. Each counter is written
# back to its own ctrN slot.
#
# $n is compared numerically. The string operators used previously (ge/eq)
# only gave the right answer for single-digit values.
sub prep_state {
    my ($n) = @_;
    my @a_regs    = ($A0, $A1, $A2, $A3);
    my @b_regs    = ($B0, $B1, $B2, $B3);
    my @c_regs    = ($C0, $C1, $C2, $C3);
    my @d_regs    = ($D0, $D1, $D2, $D3);
    my @ctr_slots = ($ctr0_store, $ctr1_store, $ctr2_store, $ctr3_store);
    my $sep       = "\n" . (' ' x 8);

    # Rows A, B and C of state 0.
    $code .= join($sep,
        "movdqa .chacha20_consts(%rip), $A0",
        "movdqa $state1_store, $B0",
        "movdqa $state2_store, $C0") . "\n";

    # Rows A, B and C of the other states are copies of state 0.
    for my $lane (1 .. 3) {
        next if $n < $lane + 1;
        $code .= join($sep,
            "movdqa $A0, $a_regs[$lane]",
            "movdqa $B0, $b_regs[$lane]",
            "movdqa $C0, $c_regs[$lane]") . "\n";
    }

    # Counter rows are only generated for the supported state counts.
    return unless $n == 1 || $n == 2 || $n == 3 || $n == 4;

    my $top   = $n - 1;
    my @insns = (
        "movdqa $ctr0_store, $d_regs[$top]",
        "paddd .sse_inc(%rip), $d_regs[$top]",
    );
    for my $lane (reverse 0 .. $top - 1) {
        my $prev = $d_regs[$lane + 1];
        push @insns,
            "movdqa $prev, $d_regs[$lane]",
            "paddd .sse_inc(%rip), $d_regs[$lane]";
    }
    push @insns, map { "movdqa $d_regs[$_], $ctr_slots[$_]" } 0 .. $top;
    $code .= join($sep, @insns) . "\n";
}
239
# Append code that adds the initial state back into $n ChaCha20 states.
#
#   $n  number of states to finalize; callers in this file pass 1..4.
#
# For each state the constants, $state1_store, $state2_store and that state's
# saved counter slot are added to rows A, B, C and D. States are emitted
# highest-numbered first; state 0 is always emitted.
#
# $n is compared numerically. The string operators used previously (eq/ge)
# only gave the right answer for single-digit values.
sub finalize_state {
    my ($n) = @_;
    my @lanes = (
        [$A0, $B0, $C0, $D0, $ctr0_store],
        [$A1, $B1, $C1, $D1, $ctr1_store],
        [$A2, $B2, $C2, $D2, $ctr2_store],
        [$A3, $B3, $C3, $D3, $ctr3_store],
    );
    # Same conditions as before: state 3 only for exactly four states.
    my @wanted = (1, $n >= 2, $n >= 3, $n == 4);

    for my $idx (reverse 0 .. 3) {
        next unless $wanted[$idx];
        my ($ra, $rb, $rc, $rd, $ctr) = @{ $lanes[$idx] };
        $code .= join("\n" . (' ' x 8),
            "paddd .chacha20_consts(%rip), $ra",
            "paddd $state1_store, $rb",
            "paddd $state2_store, $rc",
            "paddd $ctr, $rd") . "\n";
    }
}
259
# Append code that XORs 64 bytes at $offset($inp) with the key stream held in
# $A,$B,$C,$D and writes the result to $offset($oup).
#
# The input is loaded into $A3,$B3,$C3,$D3, so those registers are clobbered.
# The first three results are left in $A,$B,$C; the fourth is computed into
# $D3, so $D is only ever read.
sub xor_stream {
    my ($A, $B, $C, $D, $offset) = @_;
    my @input  = ($A3, $B3, $C3, $D3);
    my @stream = ($A, $B, $C);

    my @insns = map { "movdqu $_*16 + $offset($inp), $input[$_]" } 0 .. 3;
    push @insns, map { "pxor $input[$_], $stream[$_]" } 0 .. 2;
    push @insns, "pxor $D, $D3";
    push @insns, map { "movdqu $stream[$_], $_*16 + $offset($oup)" } 0 .. 2;
    push @insns, "movdqu $D3, 3*16 + $offset($oup)";

    $code .= join("\n" . (' ' x 8), @insns) . "\n";
}
275
# Append code that XORs 64 bytes at $offset($inp) with the key stream held in
# $A,$B,$C,$D and writes the result to $offset($oup), using only $temp as
# scratch.
#
# $temp is first saved to $tmp_store and is NOT restored here. The key stream
# registers themselves are left unmodified.
sub xor_stream_using_temp {
    my ($A, $B, $C, $D, $offset, $temp) = @_;
    my @insns = ("movdqa $temp, $tmp_store");
    my $row   = 0;
    for my $stream ($A, $B, $C, $D) {
        push @insns,
            "movdqu $row*16 + $offset($inp), $temp",
            "pxor $stream, $temp",
            "movdqu $temp, $row*16 + $offset($oup)";
        $row++;
    }
    $code .= join("\n" . (' ' x 8), @insns) . "\n";
}
292
# Build, and return as a string, half of a ChaCha20 round applied to all four
# states at once. Nothing is appended to $code; the caller splits the result
# into lines and interleaves them with other instructions.
#
#   $rot1   right-shift amount used to rotate the B rows (callers pass 20 or
#           25); the matching left shift is emitted as "32-$rot1" and left
#           for the assembler to evaluate
#   $rot2   memory operand of the pshufb mask used to rotate the D rows
#   $shift  optional; "left" or "right" appends the palignr sequence that
#           rotates rows B/C/D by 4/8/12 or 12/8/4 bytes respectively
#
# All sixteen xmm registers are in use, so $C0 doubles as the scratch
# register and its real value is parked in $tmp_store. With $rot1 == 20 the
# sequence begins by saving $C0; otherwise $C0 is assumed to already be in
# $tmp_store (as a preceding $rot1 == 20 call without $shift leaves it).
#
# The number of lines returned matters to callers that consume the result a
# fixed number of lines at a time: 39 lines, plus 1 when $rot1 == 20, plus 13
# when $shift is given.
sub gen_chacha_round {
    my ($rot1, $rot2, $shift) = @_;
    $shift = '' unless defined $shift;

    my $sep   = "\n" . (' ' x 9);
    my @lanes = (
        [$A3, $B3, $C3, $D3],
        [$A2, $B2, $C2, $D2],
        [$A1, $B1, $C1, $D1],
        [$A0, $B0, $C0, $D0],
    );

    my @body = ("movdqa $rot2, $C0");
    push @body, map { "paddd $_->[1], $_->[0]" } @lanes;
    push @body, map { "pxor $_->[0], $_->[3]" } @lanes;
    push @body, map { "pshufb $C0, $_->[3]" } @lanes;
    push @body, "movdqa $tmp_store, $C0";
    push @body, map { "paddd $_->[3], $_->[2]" } @lanes;
    push @body, map { "pxor $_->[2], $_->[1]" } @lanes;
    push @body, "movdqa $C0, $tmp_store";
    for my $lane (@lanes) {
        my $breg = $lane->[1];
        push @body,
            "movdqa $breg, $C0",
            "psrld \$$rot1, $C0",
            "pslld \$32-$rot1, $breg",
            "pxor $C0, $breg";
    }

    my $round = ($rot1 == 20) ? "movdqa $C0, $tmp_store\n" : "";
    $round .= join($sep, @body) . "\n";

    # Byte rotation amounts for rows B, C and D. Kept lexical so that nothing
    # leaks into package variables between calls.
    my @amounts;
    @amounts = (4, 8, 12) if $shift =~ /left/;
    @amounts = (12, 8, 4) if $shift =~ /right/;
    if (@amounts) {
        my ($s1, $s2, $s3) = @amounts;
        my @tail = ("movdqa $tmp_store, $C0");
        for my $lane (@lanes) {
            my (undef, $breg, $creg, $dreg) = @$lane;
            push @tail,
                "palignr \$$s1, $breg, $breg",
                "palignr \$$s2, $creg, $creg",
                "palignr \$$s3, $dreg, $dreg";
        }
        $round .= join($sep, @tail) . "\n";
    }

    return $round;
}
354
# One full ChaCha20 double round over all four states, as a single string:
# the first round followed by the left rotation, then the second round
# followed by the right rotation.
$chacha_body = join("", map { gen_chacha_round(@$_) } (
    [20, ".rol16(%rip)"],
    [25, ".rol8(%rip)", "left"],
    [20, ".rol16(%rip)"],
    [25, ".rol8(%rip)", "right"],
));

# The same instructions, one per element, consumed front to back by emit_body.
my @loop_body = split /\n/, $chacha_body;
361
# Move the next $n instructions from the front of @loop_body to the end of
# $code, one per line. Callers use this to interleave slices of the ChaCha20
# round with other instruction sequences.
sub emit_body {
    my ($n) = @_;
    my $remaining = $n;
    while ($remaining-- > 0) {
        $code .= shift(@loop_body) . "\n";
    }
}
368
{
################################################################################
# void poly_hash_ad_internal();
#
# Emits the internal helper that hashes the additional data into the Poly1305
# accumulator. It does not use the C calling convention; it expects:
#   $adp        pointer to the additional data
#   $itr2       length of the additional data in bytes
#   $r_store    the Poly1305 r value, read by poly_mul
# The accumulator ($acc0, $acc1, $acc2) is zeroed on entry and holds the hash
# state on return. $adp and $itr2 are modified, and $t0-$t3, %rax and %rdx are
# used as scratch.
$code.="
.type poly_hash_ad_internal,\@function,2
.align 64
poly_hash_ad_internal:
.cfi_startproc
    xor $acc0, $acc0
    xor $acc1, $acc1
    xor $acc2, $acc2
    cmp \$13,  $itr2
    jne hash_ad_loop
poly_fast_tls_ad:
    # Special treatment for the TLS case of 13 bytes
    mov ($adp), $acc0
    mov 5($adp), $acc1
    shr \$24, $acc1
    mov \$1, $acc2\n";
    # Fast path: bytes 0-7 were loaded directly; bytes 8-12 come from an
    # 8-byte load at offset 5 with the low three bytes shifted out. One
    # multiply finishes the single block.
    &poly_mul(); $code.="
    ret
hash_ad_loop:
        # Hash in 16 byte chunk
        cmp \$16, $itr2
        jb hash_ad_tail\n";
        # General path: absorb and multiply one full 16-byte block per
        # iteration.
        &poly_add("0($adp)");
        &poly_mul(); $code.="
        lea 1*16($adp), $adp
        sub \$16, $itr2
    jmp hash_ad_loop
hash_ad_tail:
    cmp \$0, $itr2
    je 1f
    # Hash last < 16 byte tail
    xor $t0, $t0
    xor $t1, $t1
    xor $t2, $t2
    add $itr2, $adp
hash_ad_tail_loop:
        shld \$8, $t0, $t1
        shl \$8, $t0
        movzxb -1($adp), $t2
        xor $t2, $t0
        dec $adp
        dec $itr2
    jne hash_ad_tail_loop

    add $t0, $acc0
    adc $t1, $acc1
    adc \$1, $acc2\n";
    # The tail was collected into $t1:$t0 by reading the remaining bytes from
    # last to first; it has just been added like a full block and is now
    # multiplied in.
    &poly_mul(); $code.="
    # Finished AD
1:
    ret
.cfi_endproc
.size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
}
426
427{
428################################################################################
429# void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
430$code.="
431.globl chacha20_poly1305_open
432.type chacha20_poly1305_open,\@function,2
433.align 64
434chacha20_poly1305_open:
435.cfi_startproc
436    push %rbp
437.cfi_adjust_cfa_offset 8
438    push %rbx
439.cfi_adjust_cfa_offset 8
440    push %r12
441.cfi_adjust_cfa_offset 8
442    push %r13
443.cfi_adjust_cfa_offset 8
444    push %r14
445.cfi_adjust_cfa_offset 8
446    push %r15
447.cfi_adjust_cfa_offset 8
448    # We write the calculated authenticator back to keyp at the end, so save
449    # the pointer on the stack too.
450    push $keyp
451.cfi_adjust_cfa_offset 8
452    sub \$288 + 32, %rsp
453.cfi_adjust_cfa_offset 288 + 32
454.cfi_offset rbp, -16
455.cfi_offset rbx, -24
456.cfi_offset r12, -32
457.cfi_offset r13, -40
458.cfi_offset r14, -48
459.cfi_offset r15, -56
460    lea 32(%rsp), %rbp
461    and \$-32, %rbp
462    mov %rdx, 8+$len_store
463    mov %r8, 0+$len_store
464    mov %rdx, $inl\n"; $code.="
465    mov OPENSSL_ia32cap_P+8(%rip), %eax
466    and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
467    xor \$`(1<<5) + (1<<8)`, %eax
468    jz  chacha20_poly1305_open_avx2\n" if ($avx>1);
469$code.="
4701:
471    cmp \$128, $inl
472    jbe open_sse_128
473    # For long buffers, prepare the poly key first
474    movdqa .chacha20_consts(%rip), $A0
475    movdqu 0*16($keyp), $B0
476    movdqu 1*16($keyp), $C0
477    movdqu 2*16($keyp), $D0
478    movdqa $D0, $T1
479    # Store on stack, to free keyp
480    movdqa $B0, $state1_store
481    movdqa $C0, $state2_store
482    movdqa $D0, $ctr0_store
483    mov \$10, $acc0
4841:  \n";
485        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
486        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
487        dec $acc0
488    jne 1b
489    # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
490    paddd .chacha20_consts(%rip), $A0
491    paddd $state1_store, $B0
492    # Clamp and store the key
493    pand .clamp(%rip), $A0
494    movdqa $A0, $r_store
495    movdqa $B0, $s_store
496    # Hash
497    mov %r8, $itr2
498    call poly_hash_ad_internal
499open_sse_main_loop:
500        cmp \$16*16, $inl
501        jb 2f
502        # Load state, increment counter blocks\n";
503        &prep_state(4); $code.="
504        # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we
505        # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
506        mov \$4, $itr1
507        mov $inp, $itr2
5081:  \n";
509            &emit_body(20);
510            &poly_add("0($itr2)"); $code.="
511            lea 2*8($itr2), $itr2\n";
512            &emit_body(20);
513            &poly_stage1();
514            &emit_body(20);
515            &poly_stage2();
516            &emit_body(20);
517            &poly_stage3();
518            &emit_body(20);
519            &poly_reduce_stage();
520            foreach $l (@loop_body) {$code.=$l."\n";}
521            @loop_body = split /\n/, $chacha_body; $code.="
522            dec $itr1
523        jge 1b\n";
524            &poly_add("0($itr2)");
525            &poly_mul(); $code.="
526            lea 2*8($itr2), $itr2
527            cmp \$-6, $itr1
528        jg 1b\n";
529        &finalize_state(4);
530        &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
531        &xor_stream($A2, $B2, $C2, $D2, "4*16");
532        &xor_stream($A1, $B1, $C1, $D1, "8*16");
533        &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.="
534        lea 16*16($inp), $inp
535        lea 16*16($oup), $oup
536        sub \$16*16, $inl
537    jmp open_sse_main_loop
5382:
539    # Handle the various tail sizes efficiently
540    test $inl, $inl
541    jz open_sse_finalize
542    cmp \$4*16, $inl
543    ja 3f\n";
544###############################################################################
545    # At most 64 bytes are left
546    &prep_state(1); $code.="
547    xor $itr2, $itr2
548    mov $inl, $itr1
549    cmp \$16, $itr1
550    jb 2f
5511:  \n";
552        &poly_add("0($inp, $itr2)");
553        &poly_mul(); $code.="
554        sub \$16, $itr1
5552:
556        add \$16, $itr2\n";
557        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
558        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
559        cmp \$16, $itr1
560    jae 1b
561        cmp \$10*16, $itr2
562    jne 2b\n";
563    &finalize_state(1); $code.="
564    jmp open_sse_tail_64_dec_loop
5653:
566    cmp \$8*16, $inl
567    ja 3f\n";
568###############################################################################
569    # 65 - 128 bytes are left
570    &prep_state(2); $code.="
571    mov $inl, $itr1
572    and \$-16, $itr1
573    xor $itr2, $itr2
5741:  \n";
575        &poly_add("0($inp, $itr2)");
576        &poly_mul(); $code.="
5772:
578        add \$16, $itr2\n";
579        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
580        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
581        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
582        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
583        cmp $itr1, $itr2
584    jb 1b
585        cmp \$10*16, $itr2
586    jne 2b\n";
587    &finalize_state(2);
588    &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
589    sub \$4*16, $inl
590    lea 4*16($inp), $inp
591    lea 4*16($oup), $oup
592    jmp open_sse_tail_64_dec_loop
5933:
594    cmp \$12*16, $inl
595    ja 3f\n";
596###############################################################################
597    # 129 - 192 bytes are left
598    &prep_state(3); $code.="
599    mov $inl, $itr1
600    mov \$10*16, $itr2
601    cmp \$10*16, $itr1
602    cmovg $itr2, $itr1
603    and \$-16, $itr1
604    xor $itr2, $itr2
6051:  \n";
606        &poly_add("0($inp, $itr2)");
607        &poly_mul(); $code.="
6082:
609        add \$16, $itr2\n";
610        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
611        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
612        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
613        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
614        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
615        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
616        cmp $itr1, $itr2
617    jb 1b
618        cmp \$10*16, $itr2
619    jne 2b
620    cmp \$11*16, $inl
621    jb 1f\n";
622    &poly_add("10*16($inp)");
623    &poly_mul(); $code.="
624    cmp \$12*16, $inl
625    jb 1f\n";
626    &poly_add("11*16($inp)");
627    &poly_mul(); $code.="
6281:  \n";
629    &finalize_state(3);
630    &xor_stream($A2, $B2, $C2, $D2, "0*16");
631    &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
632    sub \$8*16, $inl
633    lea 8*16($inp), $inp
634    lea 8*16($oup), $oup
635    jmp open_sse_tail_64_dec_loop
6363:
637###############################################################################\n";
638    # 193 - 255 bytes are left
639    &prep_state(4); $code.="
640    xor $itr2, $itr2
6411:  \n";
642        &poly_add("0($inp, $itr2)");
643        &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
644        &chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
645        &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
646        &poly_stage1();
647        &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
648        &poly_stage2();
649        &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
650        &chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
651        &poly_stage3();
652        &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
653        &poly_reduce_stage();
654        &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
655        add \$16, $itr2
656        cmp \$10*16, $itr2
657    jb 1b
658    mov $inl, $itr1
659    and \$-16, $itr1
6601:  \n";
661        &poly_add("0($inp, $itr2)");
662        &poly_mul(); $code.="
663        add \$16, $itr2
664        cmp $itr1, $itr2
665    jb 1b\n";
666    &finalize_state(4);
667    &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
668    &xor_stream($A2, $B2, $C2, $D2, "4*16");
669    &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
670    movdqa $tmp_store, $D0
671    sub \$12*16, $inl
672    lea 12*16($inp), $inp
673    lea 12*16($oup), $oup
674###############################################################################
675    # Decrypt the remaining data, 16B at a time, using existing stream
676open_sse_tail_64_dec_loop:
677    cmp \$16, $inl
678    jb 1f
679        sub \$16, $inl
680        movdqu ($inp), $T0
681        pxor $T0, $A0
682        movdqu $A0, ($oup)
683        lea 16($inp), $inp
684        lea 16($oup), $oup
685        movdqa $B0, $A0
686        movdqa $C0, $B0
687        movdqa $D0, $C0
688    jmp open_sse_tail_64_dec_loop
6891:
690    movdqa $A0, $A1
691
692    # Decrypt up to 16 bytes at the end.
693open_sse_tail_16:
694    test $inl, $inl
695    jz open_sse_finalize
696
697    # Read the final bytes into $T0. They need to be read in reverse order so
698    # that they end up in the correct order in $T0.
699    pxor $T0, $T0
700    lea -1($inp, $inl), $inp
701    movq $inl, $itr2
7022:
703        pslldq \$1, $T0
704        pinsrb \$0, ($inp), $T0
705        sub \$1, $inp
706        sub \$1, $itr2
707        jnz 2b
708
7093:
710    movq $T0, $t0
711    pextrq \$1, $T0, $t1
712    # The final bytes of keystream are in $A1.
713    pxor $A1, $T0
714
715    # Copy the plaintext bytes out.
7162:
717        pextrb \$0, $T0, ($oup)
718        psrldq \$1, $T0
719        add \$1, $oup
720        sub \$1, $inl
721    jne 2b
722
723    add $t0, $acc0
724    adc $t1, $acc1
725    adc \$1, $acc2\n";
726    &poly_mul(); $code.="
727
728open_sse_finalize:\n";
729    &poly_add($len_store);
730    &poly_mul(); $code.="
731    # Final reduce
732    mov $acc0, $t0
733    mov $acc1, $t1
734    mov $acc2, $t2
735    sub \$-5, $acc0
736    sbb \$-1, $acc1
737    sbb \$3, $acc2
738    cmovc $t0, $acc0
739    cmovc $t1, $acc1
740    cmovc $t2, $acc2
741    # Add in s part of the key
742    add 0+$s_store, $acc0
743    adc 8+$s_store, $acc1
744
745    add \$288 + 32, %rsp
746.cfi_adjust_cfa_offset -(288 + 32)
747    pop $keyp
748.cfi_adjust_cfa_offset -8
749    movq $acc0, ($keyp)
750    movq $acc1, 8($keyp)
751
752    pop %r15
753.cfi_adjust_cfa_offset -8
754    pop %r14
755.cfi_adjust_cfa_offset -8
756    pop %r13
757.cfi_adjust_cfa_offset -8
758    pop %r12
759.cfi_adjust_cfa_offset -8
760    pop %rbx
761.cfi_adjust_cfa_offset -8
762    pop %rbp
763.cfi_adjust_cfa_offset -8
764    ret
765.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
766###############################################################################
767open_sse_128:
768    movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
769    movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
770    movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
771    movdqu 2*16($keyp), $D0
772    movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
773    movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2
774    movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
775    mov \$10, $acc0
7761:  \n";
777        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
778        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
779        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
780        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
781        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
782        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
783    dec $acc0
784    jnz 1b
785    paddd .chacha20_consts(%rip), $A0
786    paddd .chacha20_consts(%rip), $A1
787    paddd .chacha20_consts(%rip), $A2
788    paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
789    paddd $T2, $C1\npaddd $T2, $C2
790    paddd $T3, $D1
791    paddd .sse_inc(%rip), $T3
792    paddd $T3, $D2
793    # Clamp and store the key
794    pand .clamp(%rip), $A0
795    movdqa $A0, $r_store
796    movdqa $B0, $s_store
797    # Hash
798    mov %r8, $itr2
799    call poly_hash_ad_internal
8001:
801        cmp \$16, $inl
802        jb open_sse_tail_16
803        sub \$16, $inl\n";
804        # Load for hashing
805        &poly_add("0*8($inp)"); $code.="
806        # Load for decryption
807        movdqu 0*16($inp), $T0
808        pxor $T0, $A1
809        movdqu $A1, 0*16($oup)
810        lea 1*16($inp), $inp
811        lea 1*16($oup), $oup\n";
812        &poly_mul(); $code.="
813        # Shift the stream left
814        movdqa $B1, $A1
815        movdqa $C1, $B1
816        movdqa $D1, $C1
817        movdqa $A2, $D1
818        movdqa $B2, $A2
819        movdqa $C2, $B2
820        movdqa $D2, $C2
821    jmp 1b
822    jmp open_sse_tail_16
823.size chacha20_poly1305_open, .-chacha20_poly1305_open
824.cfi_endproc
825
826################################################################################
827################################################################################
828# void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
829.globl  chacha20_poly1305_seal
830.type chacha20_poly1305_seal,\@function,2
831.align 64
832chacha20_poly1305_seal:
833.cfi_startproc
834    push %rbp
835.cfi_adjust_cfa_offset 8
836    push %rbx
837.cfi_adjust_cfa_offset 8
838    push %r12
839.cfi_adjust_cfa_offset 8
840    push %r13
841.cfi_adjust_cfa_offset 8
842    push %r14
843.cfi_adjust_cfa_offset 8
844    push %r15
845.cfi_adjust_cfa_offset 8
846    # We write the calculated authenticator back to keyp at the end, so save
847    # the pointer on the stack too.
848    push $keyp
849.cfi_adjust_cfa_offset 8
850    sub \$288 + 32, %rsp
851.cfi_adjust_cfa_offset 288 + 32
852.cfi_offset rbp, -16
853.cfi_offset rbx, -24
854.cfi_offset r12, -32
855.cfi_offset r13, -40
856.cfi_offset r14, -48
857.cfi_offset r15, -56
858    lea 32(%rsp), %rbp
859    and \$-32, %rbp
860    mov 56($keyp), $inl  # extra_in_len
861    addq %rdx, $inl
862    mov $inl, 8+$len_store
863    mov %r8, 0+$len_store
864    mov %rdx, $inl\n"; $code.="
865    mov OPENSSL_ia32cap_P+8(%rip), %eax
866    and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
867    xor \$`(1<<5) + (1<<8)`, %eax
868    jz  chacha20_poly1305_seal_avx2\n" if ($avx>1);
869$code.="
870    cmp \$128, $inl
871    jbe seal_sse_128
872    # For longer buffers, prepare the poly key + some stream
873    movdqa .chacha20_consts(%rip), $A0
874    movdqu 0*16($keyp), $B0
875    movdqu 1*16($keyp), $C0
876    movdqu 2*16($keyp), $D0
877    movdqa $A0, $A1
878    movdqa $A0, $A2
879    movdqa $A0, $A3
880    movdqa $B0, $B1
881    movdqa $B0, $B2
882    movdqa $B0, $B3
883    movdqa $C0, $C1
884    movdqa $C0, $C2
885    movdqa $C0, $C3
886    movdqa $D0, $D3
887    paddd .sse_inc(%rip), $D0
888    movdqa $D0, $D2
889    paddd .sse_inc(%rip), $D0
890    movdqa $D0, $D1
891    paddd .sse_inc(%rip), $D0
892    # Store on stack
893    movdqa $B0, $state1_store
894    movdqa $C0, $state2_store
895    movdqa $D0, $ctr0_store
896    movdqa $D1, $ctr1_store
897    movdqa $D2, $ctr2_store
898    movdqa $D3, $ctr3_store
899    mov \$10, $acc0
9001:  \n";
901        foreach $l (@loop_body) {$code.=$l."\n";}
902        @loop_body = split /\n/, $chacha_body; $code.="
903        dec $acc0
904    jnz 1b\n";
905    &finalize_state(4); $code.="
906    # Clamp and store the key
907    pand .clamp(%rip), $A3
908    movdqa $A3, $r_store
909    movdqa $B3, $s_store
910    # Hash
911    mov %r8, $itr2
912    call poly_hash_ad_internal\n";
913    &xor_stream($A2,$B2,$C2,$D2,"0*16");
914    &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
915    cmp \$12*16, $inl
916    ja 1f
917    mov \$8*16, $itr1
918    sub \$8*16, $inl
919    lea 8*16($inp), $inp
920    jmp seal_sse_128_seal_hash
9211:  \n";
922    &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
923    mov \$12*16, $itr1
924    sub \$12*16, $inl
925    lea 12*16($inp), $inp
926    mov \$2, $itr1
927    mov \$8, $itr2
928    cmp \$4*16, $inl
929    jbe seal_sse_tail_64
930    cmp \$8*16, $inl
931    jbe seal_sse_tail_128
932    cmp \$12*16, $inl
933    jbe seal_sse_tail_192
934
9351:  \n";
936    # The main loop
937        &prep_state(4); $code.="
9382:  \n";
939            &emit_body(20);
940            &poly_add("0($oup)");
941            &emit_body(20);
942            &poly_stage1();
943            &emit_body(20);
944            &poly_stage2();
945            &emit_body(20);
946            &poly_stage3();
947            &emit_body(20);
948            &poly_reduce_stage();
949            foreach $l (@loop_body) {$code.=$l."\n";}
950            @loop_body = split /\n/, $chacha_body; $code.="
951            lea 16($oup), $oup
952            dec $itr2
953        jge 2b\n";
954            &poly_add("0*8($oup)");
955            &poly_mul(); $code.="
956            lea 16($oup), $oup
957            dec $itr1
958        jg 2b\n";
959
960        &finalize_state(4);$code.="
961        movdqa $D2, $tmp_store\n";
962        &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.="
963        movdqa $tmp_store, $D2\n";
964        &xor_stream($A2,$B2,$C2,$D2, 4*16);
965        &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
966        cmp \$16*16, $inl
967        ja 3f
968
969        mov \$12*16, $itr1
970        sub \$12*16, $inl
971        lea 12*16($inp), $inp
972        jmp seal_sse_128_seal_hash
9733:  \n";
974        &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
975        lea 16*16($inp), $inp
976        sub \$16*16, $inl
977        mov \$6, $itr1
978        mov \$4, $itr2
979        cmp \$12*16, $inl
980    jg 1b
981    mov $inl, $itr1
982    test $inl, $inl
983    je seal_sse_128_seal_hash
984    mov \$6, $itr1
985    cmp \$4*16, $inl
986    jg 3f
987###############################################################################
988seal_sse_tail_64:\n";
989    &prep_state(1); $code.="
9901:  \n";
991        &poly_add("0($oup)");
992        &poly_mul(); $code.="
993        lea 16($oup), $oup
9942:  \n";
995        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
996        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
997        &poly_add("0($oup)");
998        &poly_mul(); $code.="
999        lea 16($oup), $oup
1000    dec $itr1
1001    jg 1b
1002    dec $itr2
1003    jge 2b\n";
1004    &finalize_state(1); $code.="
1005    jmp seal_sse_128_seal
10063:
1007    cmp \$8*16, $inl
1008    jg 3f
1009###############################################################################
1010seal_sse_tail_128:\n";
1011    &prep_state(2); $code.="
10121:  \n";
1013        &poly_add("0($oup)");
1014        &poly_mul(); $code.="
1015        lea 16($oup), $oup
10162:  \n";
1017        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
1018        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
1019        &poly_add("0($oup)");
1020        &poly_mul();
1021        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
1022        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
1023        lea 16($oup), $oup
1024    dec $itr1
1025    jg 1b
1026    dec $itr2
1027    jge 2b\n";
1028    &finalize_state(2);
1029    &xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
1030    mov \$4*16, $itr1
1031    sub \$4*16, $inl
1032    lea 4*16($inp), $inp
1033    jmp seal_sse_128_seal_hash
10343:
1035###############################################################################
1036seal_sse_tail_192:\n";
1037    &prep_state(3); $code.="
10381:  \n";
1039        &poly_add("0($oup)");
1040        &poly_mul(); $code.="
1041        lea 16($oup), $oup
10422:  \n";
1043        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
1044        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
1045        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
1046        &poly_add("0($oup)");
1047        &poly_mul();
1048        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
1049        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
1050        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
1051        lea 16($oup), $oup
1052    dec $itr1
1053    jg 1b
1054    dec $itr2
1055    jge 2b\n";
1056    &finalize_state(3);
1057    &xor_stream($A2,$B2,$C2,$D2,0*16);
1058    &xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
1059    mov \$8*16, $itr1
1060    sub \$8*16, $inl
1061    lea 8*16($inp), $inp
1062###############################################################################
1063seal_sse_128_seal_hash:
1064        cmp \$16, $itr1
1065        jb seal_sse_128_seal\n";
1066        &poly_add("0($oup)");
1067        &poly_mul(); $code.="
1068        sub \$16, $itr1
1069        lea 16($oup), $oup
1070    jmp seal_sse_128_seal_hash
1071
1072seal_sse_128_seal:
1073        cmp \$16, $inl
1074        jb seal_sse_tail_16
1075        sub \$16, $inl
1076        # Load for decryption
1077        movdqu 0*16($inp), $T0
1078        pxor $T0, $A0
1079        movdqu $A0, 0*16($oup)
1080        # Then hash
1081        add 0*8($oup), $acc0
1082        adc 1*8($oup), $acc1
1083        adc \$1, $acc2
1084        lea 1*16($inp), $inp
1085        lea 1*16($oup), $oup\n";
1086        &poly_mul(); $code.="
1087        # Shift the stream left
1088        movdqa $B0, $A0
1089        movdqa $C0, $B0
1090        movdqa $D0, $C0
1091        movdqa $A1, $D0
1092        movdqa $B1, $A1
1093        movdqa $C1, $B1
1094        movdqa $D1, $C1
1095    jmp seal_sse_128_seal
1096
1097seal_sse_tail_16:
1098    test $inl, $inl
1099    jz process_blocks_of_extra_in
1100    # We can only load the PT one byte at a time to avoid buffer overread
1101    mov $inl, $itr2
1102    mov $inl, $itr1
1103    lea -1($inp, $inl), $inp
1104    pxor $T3, $T3
11051:
1106        pslldq \$1, $T3
1107        pinsrb \$0, ($inp), $T3
1108        lea -1($inp), $inp
1109        dec $itr1
1110        jne 1b
1111
1112    # XOR the keystream with the plaintext.
1113    pxor $A0, $T3
1114
1115    # Write ciphertext out, byte-by-byte.
1116    movq $inl, $itr1
1117    movdqu $T3, $A0
11182:
1119        pextrb \$0, $A0, ($oup)
1120        psrldq \$1, $A0
1121        add \$1, $oup
1122        sub \$1, $itr1
1123        jnz 2b
1124
1125    # $T3 contains the final (partial, non-empty) block of ciphertext which
1126    # needs to be fed into the Poly1305 state. The right-most $inl bytes of it
1127    # are valid. We need to fill it with extra_in bytes until full, or until we
1128    # run out of bytes.
1129    #
1130    # $keyp points to the tag output, which is actually a struct with the
1131    # extra_in pointer and length at offset 48.
1132    movq 288+32(%rsp), $keyp
1133    movq 56($keyp), $t1  # extra_in_len
1134    movq 48($keyp), $t0  # extra_in
1135    test $t1, $t1
1136    jz process_partial_block  # Common case: no bytes of extra_in
1137
1138    movq \$16, $t2
1139    subq $inl, $t2  # 16-$inl is the number of bytes that fit into $T3.
1140    cmpq $t2, $t1   # if extra_in_len < 16-$inl, only copy extra_in_len
1141                    # (note that AT&T syntax reverses the arguments)
1142    jge load_extra_in
1143    movq $t1, $t2
1144
1145load_extra_in:
1146    # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load
1147    # into $T3. They are loaded in reverse order.
1148    leaq -1($t0, $t2), $inp
1149    # Update extra_in and extra_in_len to reflect the bytes that are about to
1150    # be read.
1151    addq $t2, $t0
1152    subq $t2, $t1
1153    movq $t0, 48($keyp)
1154    movq $t1, 56($keyp)
1155
1156    # Update $itr2, which is used to select the mask later on, to reflect the
1157    # extra bytes about to be added.
1158    addq $t2, $itr2
1159
1160    # Load $t2 bytes of extra_in into $T2.
1161    pxor $T2, $T2
11623:
1163        pslldq \$1, $T2
1164        pinsrb \$0, ($inp), $T2
1165        lea -1($inp), $inp
1166        sub \$1, $t2
1167        jnz 3b
1168
1169    # Shift $T2 up the length of the remainder from the main encryption. Sadly,
1170    # the shift for an XMM register has to be a constant, thus we loop to do
1171    # this.
1172    movq $inl, $t2
1173
11744:
1175        pslldq \$1, $T2
1176        sub \$1, $t2
1177        jnz 4b
1178
1179    # Mask $T3 (the remainder from the main encryption) so that superfluous
1180    # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are
1181    # disjoint and so we can merge them with an OR.
1182    lea .and_masks(%rip), $t2
1183    shl \$4, $inl
1184    pand -16($t2, $inl), $T3
1185
1186    # Merge $T2 into $T3, forming the remainder block.
1187    por $T2, $T3
1188
1189    # The block of ciphertext + extra_in is ready to be included in the
1190    # Poly1305 state.
1191    movq $T3, $t0
1192    pextrq \$1, $T3, $t1
1193    add $t0, $acc0
1194    adc $t1, $acc1
1195    adc \$1, $acc2\n";
1196    &poly_mul(); $code.="
1197
1198process_blocks_of_extra_in:
1199    # There may be additional bytes of extra_in to process.
1200    movq 288+32(%rsp), $keyp
1201    movq 48($keyp), $inp   # extra_in
1202    movq 56($keyp), $itr2  # extra_in_len
1203    movq $itr2, $itr1
1204    shr \$4, $itr2         # number of blocks
1205
12065:
1207        jz process_extra_in_trailer\n";
1208        &poly_add("0($inp)");
1209        &poly_mul(); $code.="
1210        leaq 16($inp), $inp
1211        subq \$1, $itr2
1212        jmp 5b
1213
1214process_extra_in_trailer:
1215    andq \$15, $itr1       # remaining num bytes (<16) of extra_in
1216    movq $itr1, $inl
1217    jz do_length_block
1218    leaq -1($inp, $itr1), $inp
1219
12206:
1221        pslldq \$1, $T3
1222        pinsrb \$0, ($inp), $T3
1223        lea -1($inp), $inp
1224        sub \$1, $itr1
1225        jnz 6b
1226
1227process_partial_block:
1228    # $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0
1229    lea .and_masks(%rip), $t2
1230    shl \$4, $inl
1231    pand -16($t2, $inl), $T3
1232    movq $T3, $t0
1233    pextrq \$1, $T3, $t1
1234    add $t0, $acc0
1235    adc $t1, $acc1
1236    adc \$1, $acc2\n";
1237    &poly_mul(); $code.="
1238
1239do_length_block:\n";
1240    &poly_add($len_store);
1241    &poly_mul(); $code.="
1242    # Final reduce
1243    mov $acc0, $t0
1244    mov $acc1, $t1
1245    mov $acc2, $t2
1246    sub \$-5, $acc0
1247    sbb \$-1, $acc1
1248    sbb \$3, $acc2
1249    cmovc $t0, $acc0
1250    cmovc $t1, $acc1
1251    cmovc $t2, $acc2
1252    # Add in s part of the key
1253    add 0+$s_store, $acc0
1254    adc 8+$s_store, $acc1
1255
1256    add \$288 + 32, %rsp
1257.cfi_adjust_cfa_offset -(288 + 32)
1258    pop $keyp
1259.cfi_adjust_cfa_offset -8
1260    mov $acc0, 0*8($keyp)
1261    mov $acc1, 1*8($keyp)
1262
1263    pop %r15
1264.cfi_adjust_cfa_offset -8
1265    pop %r14
1266.cfi_adjust_cfa_offset -8
1267    pop %r13
1268.cfi_adjust_cfa_offset -8
1269    pop %r12
1270.cfi_adjust_cfa_offset -8
1271    pop %rbx
1272.cfi_adjust_cfa_offset -8
1273    pop %rbp
1274.cfi_adjust_cfa_offset -8
1275    ret
1276.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
1277################################################################################
1278seal_sse_128:
1279    movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
1280    movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
1281    movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
1282    movdqu 2*16($keyp), $D2
1283    movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0
1284    movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
1285    movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
1286    mov \$10, $acc0
12871:\n";
1288        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
1289        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
1290        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
1291        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
1292        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
1293        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
1294        dec $acc0
1295    jnz 1b
1296    paddd .chacha20_consts(%rip), $A0
1297    paddd .chacha20_consts(%rip), $A1
1298    paddd .chacha20_consts(%rip), $A2
1299    paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
1300    paddd $T2, $C0\npaddd $T2, $C1
1301    paddd $T3, $D0
1302    paddd .sse_inc(%rip), $T3
1303    paddd $T3, $D1
1304    # Clamp and store the key
1305    pand .clamp(%rip), $A2
1306    movdqa $A2, $r_store
1307    movdqa $B2, $s_store
1308    # Hash
1309    mov %r8, $itr2
1310    call poly_hash_ad_internal
1311    jmp seal_sse_128_seal
1312.size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n";
1313}
1314
1315# There should have been a cfi_endproc at the end of that function, but the two
1316# following blocks of code are jumped to without a stack frame and the CFI
1317# context which they are used in happens to match the CFI context at the end of
1318# the previous function. So the CFI table is just extended to the end of them.
1319
1320if ($avx>1) {
1321
# AVX2 register map. The sixteen ChaCha20 state names are rebound to
# %ymm0..%ymm15; the "x"-suffixed names are the same-numbered %xmm registers.
1322($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
1323my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
# $T0..$T3 are not extra registers: they name the same registers as
# $A3,$B3,$C3,$D3, so using them as scratch overwrites that state.
1324($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
# Stack slots, addressed relative to %rbp in 32-byte units.
1325$state1_store="2*32(%rbp)";
1326$state2_store="3*32(%rbp)";
1327$tmp_store="4*32(%rbp)";
1328$ctr0_store="5*32(%rbp)";
1329$ctr1_store="6*32(%rbp)";
1330$ctr2_store="7*32(%rbp)";
1331$ctr3_store="8*32(%rbp)";
1332
# Append one AVX2 ChaCha20 quarter-round to the global $code.
#
# Positional arguments:
#   a, b, c, d - register names of the four state rows
#   t          - scratch register used by the two shift-based rotations
#   dir        - option string; each of these substrings is tested separately:
#                  "store" - first spill t to $tmp_store
#                  "left"  - afterwards vpalignr d/c/b by 12/8/4 bytes
#                  "right" - afterwards vpalignr d/c/b by 4/8/12 bytes
#                  "load"  - finally reload t from $tmp_store
sub chacha_qr_avx2 {
    my ($row_a, $row_b, $row_c, $row_d, $scratch, $opts) = @_;
    my @insns;

    push @insns, "vmovdqa $scratch, $tmp_store" if $opts =~ /store/;

    push @insns, (
        # a += b; d ^= a; d is then shuffled with the .rol16 mask
        "vpaddd $row_b, $row_a, $row_a",
        "vpxor $row_a, $row_d, $row_d",
        "vpshufb .rol16(%rip), $row_d, $row_d",
        # c += d; b ^= c; b = (b >> 20) ^ (b << 12)
        "vpaddd $row_d, $row_c, $row_c",
        "vpxor $row_c, $row_b, $row_b",
        "vpsrld \$20, $row_b, $scratch",
        "vpslld \$12, $row_b, $row_b",
        "vpxor $scratch, $row_b, $row_b",
        # a += b; d ^= a; d is then shuffled with the .rol8 mask
        "vpaddd $row_b, $row_a, $row_a",
        "vpxor $row_a, $row_d, $row_d",
        "vpshufb .rol8(%rip), $row_d, $row_d",
        # c += d; b ^= c; b = (b << 7) ^ (b >> 25)
        "vpaddd $row_d, $row_c, $row_c",
        "vpxor $row_c, $row_b, $row_b",
        "vpslld \$7, $row_b, $scratch",
        "vpsrld \$25, $row_b, $row_b",
        "vpxor $scratch, $row_b, $row_b",
    );

    if ($opts =~ /left/) {
        push @insns, (
            "vpalignr \$12, $row_d, $row_d, $row_d",
            "vpalignr \$8, $row_c, $row_c, $row_c",
            "vpalignr \$4, $row_b, $row_b, $row_b",
        );
    }
    if ($opts =~ /right/) {
        push @insns, (
            "vpalignr \$4, $row_d, $row_d, $row_d",
            "vpalignr \$8, $row_c, $row_c, $row_c",
            "vpalignr \$12, $row_b, $row_b, $row_b",
        );
    }

    push @insns, "vmovdqa $tmp_store, $scratch" if $opts =~ /load/;

    # Every emitted instruction carries the same four-space indent.
    $code .= "    $_\n" for @insns;
}
1370
# Append code to $code that initialises $n AVX2 ChaCha20 states (1 <= $n <= 4).
#
# State 0 is loaded from .chacha20_consts, $state1_store and $state2_store;
# states 1..$n-1 are register copies of state 0. The counter rows are built
# by repeatedly adding .avx2_inc to the value in $ctr0_store: the
# highest-numbered state gets ctr0+inc, each lower state gets one more
# increment, and every counter row is written back to its $ctrN_store slot
# ($ctr0_store is therefore advanced by $n increments).
#
# $n is compared numerically. The previous string operators (ge/eq) only
# happened to work for single-digit values written without padding.
sub prep_state_avx2 {
    my ($n) = @_;

    # Rows A..C of state 0.
    $code.=<<___;
    vmovdqa .chacha20_consts(%rip), $A0
    vmovdqa $state1_store, $B0
    vmovdqa $state2_store, $C0
___
    # Rows A..C of the additional states are copies of state 0.
    $code.=<<___ if ($n >= 2);
    vmovdqa $A0, $A1
    vmovdqa $B0, $B1
    vmovdqa $C0, $C1
___
    $code.=<<___ if ($n >= 3);
    vmovdqa $A0, $A2
    vmovdqa $B0, $B2
    vmovdqa $C0, $C2
___
    $code.=<<___ if ($n >= 4);
    vmovdqa $A0, $A3
    vmovdqa $B0, $B3
    vmovdqa $C0, $C3
___
    # Counter rows: exactly one of the following variants is emitted.
    $code.=<<___ if ($n == 1);
    vmovdqa .avx2_inc(%rip), $D0
    vpaddd $ctr0_store, $D0, $D0
    vmovdqa $D0, $ctr0_store
___
    $code.=<<___ if ($n == 2);
    vmovdqa .avx2_inc(%rip), $D0
    vpaddd $ctr0_store, $D0, $D1
    vpaddd $D1, $D0, $D0
    vmovdqa $D0, $ctr0_store
    vmovdqa $D1, $ctr1_store
___
    $code.=<<___ if ($n == 3);
    vmovdqa .avx2_inc(%rip), $D0
    vpaddd $ctr0_store, $D0, $D2
    vpaddd $D2, $D0, $D1
    vpaddd $D1, $D0, $D0
    vmovdqa $D0, $ctr0_store
    vmovdqa $D1, $ctr1_store
    vmovdqa $D2, $ctr2_store
___
    $code.=<<___ if ($n == 4);
    vmovdqa .avx2_inc(%rip), $D0
    vpaddd $ctr0_store, $D0, $D3
    vpaddd $D3, $D0, $D2
    vpaddd $D2, $D0, $D1
    vpaddd $D1, $D0, $D0
    vmovdqa $D3, $ctr3_store
    vmovdqa $D2, $ctr2_store
    vmovdqa $D1, $ctr1_store
    vmovdqa $D0, $ctr0_store
___
}
1426
# Append code to $code that adds the initial state back into $n AVX2 ChaCha20
# states (1 <= $n <= 4) after the rounds have been run.
#
# For each active state i the four rows are incremented by .chacha20_consts,
# $state1_store, $state2_store and the state's own counter slot $ctr<i>_store.
# States are emitted highest-numbered first; state 0 is always finalised.
#
# $n is compared numerically. The previous string operators (ge/eq) only
# happened to work for single-digit values written without padding.
sub finalize_state_avx2 {
    my ($n) = @_;

    $code.=<<___ if ($n == 4);
    vpaddd .chacha20_consts(%rip), $A3, $A3
    vpaddd $state1_store, $B3, $B3
    vpaddd $state2_store, $C3, $C3
    vpaddd $ctr3_store, $D3, $D3
___
    $code.=<<___ if ($n >= 3);
    vpaddd .chacha20_consts(%rip), $A2, $A2
    vpaddd $state1_store, $B2, $B2
    vpaddd $state2_store, $C2, $C2
    vpaddd $ctr2_store, $D2, $D2
___
    $code.=<<___ if ($n >= 2);
    vpaddd .chacha20_consts(%rip), $A1, $A1
    vpaddd $state1_store, $B1, $B1
    vpaddd $state2_store, $C1, $C1
    vpaddd $ctr1_store, $D1, $D1
___
    $code.=<<___;
    vpaddd .chacha20_consts(%rip), $A0, $A0
    vpaddd $state1_store, $B0, $B0
    vpaddd $state2_store, $C0, $C0
    vpaddd $ctr0_store, $D0, $D0
___
}
1454
# Append code to $code that XORs 128 bytes of input with one AVX2 state and
# writes the result out.
#
# Positional arguments:
#   A, B, C, D - registers holding the four rows of the state
#   offset     - displacement added to both $inp and $oup
#   hlp        - spare register that receives the first 32 bytes of keystream
#
# Four vperm2i128 shuffles recombine the 128-bit halves of the rows; the
# result is left in (hlp, A, B, C), which is then XORed with the input at
# 0*32, 1*32, 2*32 and 3*32 past offset($inp) and stored at the same
# displacements from $oup.
sub xor_stream_avx2 {
    my ($row_a, $row_b, $row_c, $row_d, $disp, $spare) = @_;

    # Register that holds the i-th 32-byte chunk after the shuffles.
    my @chunk_reg = ($spare, $row_a, $row_b, $row_c);

    my @insns = (
        "vperm2i128 \$0x02, $row_a, $row_b, $spare",
        "vperm2i128 \$0x13, $row_a, $row_b, $row_b",
        "vperm2i128 \$0x02, $row_c, $row_d, $row_a",
        "vperm2i128 \$0x13, $row_c, $row_d, $row_c",
    );
    for my $i (0 .. 3) {
        push @insns, "vpxor ${i}*32+${disp}($inp), $chunk_reg[$i], $chunk_reg[$i]";
    }
    for my $i (0 .. 3) {
        push @insns, "vmovdqu $chunk_reg[$i], ${i}*32+${disp}($oup)";
    }

    $code .= "    $_\n" for @insns;
}
1472
# Append code to $code that rearranges the last AVX2 state (rows A, B, C, D)
# with four vperm2i128 shuffles plus one move, using hlp as a temporary.
# The callers in this file follow it with code that consumes the keystream
# 32 bytes at a time from A, then B, then C, then D.
sub finish_stream_avx2 {
    my ($row_a, $row_b, $row_c, $row_d, $spare) = @_;

    my @insns = (
        "vperm2i128 \$0x13, $row_a, $row_b, $spare",
        "vperm2i128 \$0x02, $row_a, $row_b, $row_a",
        "vperm2i128 \$0x02, $row_c, $row_d, $row_b",
        "vperm2i128 \$0x13, $row_c, $row_d, $row_d",
        # The chunk parked in the spare register ends up in row C.
        "vmovdqa $spare, $row_c",
    );

    $code .= "    $_\n" for @insns;
}
1483
# First stage of the MULX-based Poly1305 multiply: appends code to $code that
# loads the low quadword of $r_store into %rdx and multiplies it by
# $acc0, $acc1 (mulx) and $acc2 (imulq), accumulating into $t0, $t1, $t2.
# %rax and %rdx are used as temporaries.
sub poly_stage1_mulx {
    my @insns = (
        "mov 0+$r_store, %rdx",
        "mov %rdx, $t2",
        "mulx $acc0, $t0, $t1",
        "mulx $acc1, %rax, %rdx",
        "imulq $acc2, $t2",
        "add %rax, $t1",
        "adc %rdx, $t2",
    );
    $code .= join("", map { "    $_\n" } @insns);
}
1495
# Second stage of the MULX-based Poly1305 multiply: appends code to $code that
# loads the quadword at 8+$r_store into %rdx, multiplies it by $acc0 and
# $acc1 (mulx, overwriting both accumulators with the low halves) and adds the
# products into $t1, $t2, $t3. The final imulq leaves $acc2 * %rdx in %rdx and
# the high half of the first product stays in %rax.
sub poly_stage2_mulx {
    my @insns = (
        "mov 8+$r_store, %rdx",
        "mulx $acc0, $acc0, %rax",
        "add $acc0, $t1",
        "mulx $acc1, $acc1, $t3",
        "adc $acc1, $t2",
        "adc \$0, $t3",
        "imulq $acc2, %rdx",
    );
    $code .= join("", map { "    $_\n" } @insns);
}
1507
# Third stage of the MULX-based Poly1305 multiply: appends code to $code that
# adds %rax into $t2 and, with carry, %rdx into $t3.
sub poly_stage3_mulx {
    $code .= "    add %rax, $t2\n"
           . "    adc %rdx, $t3\n";
}
1514
# Emit a complete MULX-based Poly1305 multiply: the three multiply stages
# followed by poly_reduce_stage (defined elsewhere in this file), in order
# and each with an empty argument list.
sub poly_mul_mulx {
    my @stages = (
        \&poly_stage1_mulx,
        \&poly_stage2_mulx,
        \&poly_stage3_mulx,
        \&poly_reduce_stage,
    );
    for my $stage (@stages) {
        $stage->();
    }
}
1521
# Build, as a string with one instruction per line, half of a ChaCha20 round
# applied to all four AVX2 states at once, and return it (nothing is appended
# to $code here).
#
# Positional arguments:
#   rot1  - right-shift count for the shift-based rotation of the B rows
#           (the matching left shift is written as "32-rot1"); when it is 20,
#           $C0 is first spilled to $tmp_store
#   rot2  - memory operand of the vpshufb mask applied to the D rows
#   shift - optional; if it contains "left" the B/C/D rows are rotated with
#           vpalignr by 4/8/12 bytes, if it contains "right" by 12/8/4 bytes;
#           otherwise no vpalignr block is generated
#
# $C0 is used as the scratch register throughout and is saved to and restored
# from $tmp_store around each use. The number of lines in the returned string
# matters to callers that interleave it with other code line by line.
#
# The vpalignr byte counts are held in lexicals; previously they were written
# to the package globals $s1/$s2/$s3 and leaked out of the sub. rot1 is now
# compared numerically instead of with "eq".
sub gen_chacha_round_avx2 {
    my ($rot1, $rot2, $shift) = @_;
    $shift = "" unless defined $shift;

    my $round = "";
    $round .= "vmovdqa $C0, $tmp_store\n" if ($rot1 == 20);
    $round .= "vmovdqa $rot2, $C0
                vpaddd $B3, $A3, $A3
                vpaddd $B2, $A2, $A2
                vpaddd $B1, $A1, $A1
                vpaddd $B0, $A0, $A0
                vpxor $A3, $D3, $D3
                vpxor $A2, $D2, $D2
                vpxor $A1, $D1, $D1
                vpxor $A0, $D0, $D0
                vpshufb $C0, $D3, $D3
                vpshufb $C0, $D2, $D2
                vpshufb $C0, $D1, $D1
                vpshufb $C0, $D0, $D0
                vmovdqa $tmp_store, $C0
                vpaddd $D3, $C3, $C3
                vpaddd $D2, $C2, $C2
                vpaddd $D1, $C1, $C1
                vpaddd $D0, $C0, $C0
                vpxor $C3, $B3, $B3
                vpxor $C2, $B2, $B2
                vpxor $C1, $B1, $B1
                vpxor $C0, $B0, $B0
                vmovdqa $C0, $tmp_store
                vpsrld \$$rot1, $B3, $C0
                vpslld \$32-$rot1, $B3, $B3
                vpxor $C0, $B3, $B3
                vpsrld \$$rot1, $B2, $C0
                vpslld \$32-$rot1, $B2, $B2
                vpxor $C0, $B2, $B2
                vpsrld \$$rot1, $B1, $C0
                vpslld \$32-$rot1, $B1, $B1
                vpxor $C0, $B1, $B1
                vpsrld \$$rot1, $B0, $C0
                vpslld \$32-$rot1, $B0, $B0
                vpxor $C0, $B0, $B0\n";

    # Byte counts for the optional vpalignr block; "right" wins if both match,
    # as in the original assignment order.
    my ($s1, $s2, $s3);
    ($s1, $s2, $s3) = (4, 8, 12) if ($shift =~ /left/);
    ($s1, $s2, $s3) = (12, 8, 4) if ($shift =~ /right/);

    if (($shift =~ /left/) || ($shift =~ /right/)) {
        $round .= "vmovdqa $tmp_store, $C0
                vpalignr \$$s1, $B3, $B3, $B3
                vpalignr \$$s2, $C3, $C3, $C3
                vpalignr \$$s3, $D3, $D3, $D3
                vpalignr \$$s1, $B2, $B2, $B2
                vpalignr \$$s2, $C2, $C2, $C2
                vpalignr \$$s3, $D2, $D2, $D2
                vpalignr \$$s1, $B1, $B1, $B1
                vpalignr \$$s2, $C1, $C1, $C1
                vpalignr \$$s3, $D1, $D1, $D1
                vpalignr \$$s1, $B0, $B0, $B0
                vpalignr \$$s2, $C0, $C0, $C0
                vpalignr \$$s3, $D0, $D0, $D0\n";
    }

    return $round;
}
1579
# Text of the four-state AVX2 round body, built from four
# gen_chacha_round_avx2 pieces: (20, .rol16), (25, .rol8) + "left" vpalignr
# block, (20, .rol16), (25, .rol8) + "right" vpalignr block.
1580$chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") .
1581               &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") .
1582               &gen_chacha_round_avx2(20, ".rol16(%rip)") .
1583               &gen_chacha_round_avx2(25, ".rol8(%rip)", "right");
1584
# The same text split into one instruction per line. The code below re-splits
# $chacha_body into @loop_body after each interleaved loop; emit_body (defined
# elsewhere) presumably consumes lines from this array - TODO confirm.
1585@loop_body = split /\n/, $chacha_body;
1586
# Emit chacha20_poly1305_open_avx2: entry point, Poly1305 key generation and
# the main loop that handles 512 bytes of input per iteration.
# Inputs of at most 6*32 bytes branch to open_avx2_192 and inputs of at most
# 10*32 bytes to open_avx2_320; longer inputs continue here.
1587$code.="
1588###############################################################################
1589.type chacha20_poly1305_open_avx2,\@function,2
1590.align 64
1591chacha20_poly1305_open_avx2:
1592    vzeroupper
1593    vmovdqa .chacha20_consts(%rip), $A0
1594    vbroadcasti128 0*16($keyp), $B0
1595    vbroadcasti128 1*16($keyp), $C0
1596    vbroadcasti128 2*16($keyp), $D0
1597    vpaddd .avx2_init(%rip), $D0, $D0
1598    cmp \$6*32, $inl
1599    jbe open_avx2_192
1600    cmp \$10*32, $inl
1601    jbe open_avx2_320
1602
1603    vmovdqa $B0, $state1_store
1604    vmovdqa $C0, $state2_store
1605    vmovdqa $D0, $ctr0_store
1606    mov \$10, $acc0
16071:  \n";
        # Body of the 10-iteration loop counted down in $acc0: one "left" and
        # one "right" quarter-round on state 0 per iteration.
1608        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1609        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
1610        dec $acc0
1611    jne 1b
1612    vpaddd .chacha20_consts(%rip), $A0, $A0
1613    vpaddd $state1_store, $B0, $B0
1614    vpaddd $state2_store, $C0, $C0
1615    vpaddd $ctr0_store, $D0, $D0
1616
1617    vperm2i128 \$0x02, $A0, $B0, $T0
1618    # Clamp and store key
1619    vpand .clamp(%rip), $T0, $T0
1620    vmovdqa $T0, $r_store
1621    # Stream for the first 64 bytes
1622    vperm2i128 \$0x13, $A0, $B0, $A0
1623    vperm2i128 \$0x13, $C0, $D0, $B0
1624    # Hash AD + first 64 bytes
1625    mov %r8, $itr2
1626    call poly_hash_ad_internal
1627    xor $itr1, $itr1
1628    # Hash first 64 bytes
16291:  \n";
       # One 16-byte Poly1305 block at $inp + $itr1; the loop around it steps
       # $itr1 by 16 until it reaches 2*32.
1630       &poly_add("0($inp, $itr1)");
1631       &poly_mul(); $code.="
1632       add \$16, $itr1
1633       cmp \$2*32, $itr1
1634    jne 1b
1635    # Decrypt first 64 bytes
1636    vpxor 0*32($inp), $A0, $A0
1637    vpxor 1*32($inp), $B0, $B0
1638    vmovdqu $A0, 0*32($oup)
1639    vmovdqu $B0, 1*32($oup)
1640    lea 2*32($inp), $inp
1641    lea 2*32($oup), $oup
1642    sub \$2*32, $inl
16431:
1644        # Hash and decrypt 512 bytes each iteration
1645        cmp \$16*32, $inl
1646        jb 3f\n";
        # Main loop: four fresh states per 512-byte chunk.
1647        &prep_state_avx2(4); $code.="
1648        xor $itr1, $itr1
16492:  \n";
            # Inner loop body. Each pass hashes three 16-byte blocks (offsets
            # 0, 2*8 and 4*8 from $inp + $itr1, then $itr1 += 6*8) with the
            # Poly1305 stages split up between emit_body() calls, which
            # presumably emit that many lines of the ChaCha round body from
            # @loop_body - TODO confirm against emit_body's definition.
1650            &poly_add("0*8($inp, $itr1)");
1651            &emit_body(10);
1652            &poly_stage1_mulx();
1653            &emit_body(9);
1654            &poly_stage2_mulx();
1655            &emit_body(12);
1656            &poly_stage3_mulx();
1657            &emit_body(10);
1658            &poly_reduce_stage();
1659            &emit_body(9);
1660            &poly_add("2*8($inp, $itr1)");
1661            &emit_body(8);
1662            &poly_stage1_mulx();
1663            &emit_body(18);
1664            &poly_stage2_mulx();
1665            &emit_body(18);
1666            &poly_stage3_mulx();
1667            &emit_body(9);
1668            &poly_reduce_stage();
1669            &emit_body(8);
1670            &poly_add("4*8($inp, $itr1)"); $code.="
1671            lea 6*8($itr1), $itr1\n";
1672            &emit_body(18);
1673            &poly_stage1_mulx();
1674            &emit_body(8);
1675            &poly_stage2_mulx();
1676            &emit_body(8);
1677            &poly_stage3_mulx();
1678            &emit_body(18);
1679            &poly_reduce_stage();
            # Emit whatever is left in @loop_body, then refill it for the next
            # interleaved loop. The inner loop ends when $itr1 == 10*6*8.
1680            foreach $l (@loop_body) {$code.=$l."\n";}
1681            @loop_body = split /\n/, $chacha_body; $code.="
1682            cmp \$10*6*8, $itr1
1683        jne 2b\n";
        # $A0 is parked in $tmp_store so it can serve as the helper register
        # of the first xor_stream_avx2. The two 16-byte blocks at offsets
        # 10*6*8 and 10*6*8+2*8 are hashed in between the XOR/store sequences.
1684        &finalize_state_avx2(4); $code.="
1685        vmovdqa $A0, $tmp_store\n";
1686        &poly_add("10*6*8($inp)");
1687        &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
1688        vmovdqa $tmp_store, $A0\n";
1689        &poly_mul();
1690        &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
1691        &poly_add("10*6*8+2*8($inp)");
1692        &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
1693        &poly_mul();
1694        &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
1695        lea 16*32($inp), $inp
1696        lea 16*32($oup), $oup
1697        sub \$16*32, $inl
1698    jmp 1b
16993:
1700    test $inl, $inl
1701    vzeroupper
1702    je open_sse_finalize
17033:
1704    cmp \$4*32, $inl
1705    ja 3f\n";
1706###############################################################################
1707    # 1-128 bytes left
    # One state is used. $itr1 = $inl rounded down to a multiple of 16 (the
    # bytes to hash); $itr2 counts up in steps of 16. While $itr2 < $itr1 each
    # double round is preceded by hashing one 16-byte block; after that the
    # double rounds continue alone until $itr2 reaches 160.
1708    &prep_state_avx2(1); $code.="
1709    xor $itr2, $itr2
1710    mov $inl, $itr1
1711    and \$-16, $itr1
1712    test $itr1, $itr1
1713    je 2f
17141:  \n";
1715        &poly_add("0*8($inp, $itr2)");
1716        &poly_mul(); $code.="
17172:
1718        add \$16, $itr2\n";
1719        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1720        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
1721        cmp $itr1, $itr2
1722    jb 1b
1723        cmp \$160, $itr2
1724    jne 2b\n";
    # Finalise the state and lay the keystream out for open_avx2_tail_loop.
1725    &finalize_state_avx2(1);
1726    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
1727    jmp open_avx2_tail_loop
17283:
1729    cmp \$8*32, $inl
1730    ja 3f\n";
1731###############################################################################
1732    # 129-256 bytes left
    # Two states are used. $inl is saved in $tmp_store and the register is
    # reused as the hashing pointer (initialised from $inp).
    # $itr1 = min(($inl - 4*32) / 16, 10) is the number of double rounds that
    # are preceded by hashing one 16-byte block; $itr2 counts double rounds up
    # to 10.
1733    &prep_state_avx2(2); $code.="
1734    mov $inl, $tmp_store
1735    mov $inl, $itr1
1736    sub \$4*32, $itr1
1737    shr \$4, $itr1
1738    mov \$10, $itr2
1739    cmp \$10, $itr1
1740    cmovg $itr2, $itr1
1741    mov $inp, $inl
1742    xor $itr2, $itr2
17431:  \n";
1744        &poly_add("0*8($inl)");
1745        &poly_mul_mulx(); $code.="
1746        lea 16($inl), $inl
17472:  \n";
1748        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1749        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
1750        inc $itr2\n";
        # NOTE(review): the third call below works on state 2, but
        # prep_state_avx2(2) above and finalize_state_avx2(2) /
        # xor_stream_avx2 / finish_stream_avx2 below only touch states 0
        # and 1, and there is no matching "left" call for state 2. It looks
        # redundant - confirm before removing.
1751        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
1752        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
1753        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
1754        cmp $itr1, $itr2
1755    jb 1b
1756        cmp \$10, $itr2
1757    jne 2b
1758    mov $inl, $itr2
1759    sub $inp, $inl
1760    mov $inl, $itr1
1761    mov $tmp_store, $inl
17621:
1763        add \$16, $itr1
1764        cmp $inl, $itr1
1765        jg 1f\n";
        # Hash the remaining whole 16-byte blocks: $itr2 is the hashing
        # pointer, $itr1 the number of bytes hashed so far plus 16, and $inl
        # has been restored from $tmp_store.
1766        &poly_add("0*8($itr2)");
1767        &poly_mul_mulx(); $code.="
1768        lea 16($itr2), $itr2
1769    jmp 1b
17701:  \n";
    # State 1 covers the first 4*32 bytes; state 0 is left in registers for
    # open_avx2_tail_loop.
1771    &finalize_state_avx2(2);
1772    &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
1773    &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
1774    lea 4*32($inp), $inp
1775    lea 4*32($oup), $oup
1776    sub \$4*32, $inl
1777    jmp open_avx2_tail_loop
17783:
1779    cmp \$12*32, $inl
1780    ja 3f\n";
1781###############################################################################
1782    # 257-384 bytes left
    # Three states are used. $inl is saved in $tmp_store and the register is
    # reused as the hashing pointer (initialised from $inp).
    # $itr1 = min(($inl - 8*32) / 16 + 6, 10) is the number of double rounds
    # that are preceded by an extra 16-byte hash; every double round also
    # hashes one 16-byte block between its "left" and "right" halves.
1783    &prep_state_avx2(3); $code.="
1784    mov $inl, $tmp_store
1785    mov $inl, $itr1
1786    sub \$8*32, $itr1
1787    shr \$4, $itr1
1788    add \$6, $itr1
1789    mov \$10, $itr2
1790    cmp \$10, $itr1
1791    cmovg $itr2, $itr1
1792    mov $inp, $inl
1793    xor $itr2, $itr2
17941:  \n";
1795        &poly_add("0*8($inl)");
1796        &poly_mul_mulx(); $code.="
1797        lea 16($inl), $inl
17982:  \n";
1799        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
1800        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
1801        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1802        &poly_add("0*8($inl)");
1803        &poly_mul(); $code.="
1804        lea 16($inl), $inl
1805        inc $itr2\n";
1806        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right");
1807        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
1808        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
1809        cmp $itr1, $itr2
1810    jb 1b
1811        cmp \$10, $itr2
1812    jne 2b
1813    mov $inl, $itr2
1814    sub $inp, $inl
1815    mov $inl, $itr1
1816    mov $tmp_store, $inl
18171:
1818        add \$16, $itr1
1819        cmp $inl, $itr1
1820        jg 1f\n";
        # Hash the remaining whole 16-byte blocks: $itr2 is the hashing
        # pointer, $itr1 the number of bytes hashed so far plus 16, and $inl
        # has been restored from $tmp_store.
1821        &poly_add("0*8($itr2)");
1822        &poly_mul_mulx(); $code.="
1823        lea 16($itr2), $itr2
1824    jmp 1b
18251:  \n";
    # States 2 and 1 cover the first 8*32 bytes; state 0 is left in registers
    # for open_avx2_tail_loop.
1826    &finalize_state_avx2(3);
1827    &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
1828    &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
1829    &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
1830    lea 8*32($inp), $inp
1831    lea 8*32($oup), $oup
1832    sub \$8*32, $inl
1833    jmp open_avx2_tail_loop
18343:  \n";
1835###############################################################################
1836    # 385-511 bytes left
    # Four states are used; $itr2 is the hashing pointer (initialised from
    # $inp) and $itr1 counts the 10 passes over the round body. The first four
    # passes are each preceded by one extra 16-byte hash; every pass hashes
    # two further 16-byte blocks interleaved with the round body.
1837    &prep_state_avx2(4); $code.="
1838    xor $itr1, $itr1
1839    mov $inp, $itr2
18401:  \n";
1841        &poly_add("0*8($itr2)");
1842        &poly_mul(); $code.="
1843        lea 2*8($itr2), $itr2
18442:  \n";
1845        &emit_body(37);
1846        &poly_add("0*8($itr2)");
1847        &poly_mul_mulx();
1848        &emit_body(48);
1849        &poly_add("2*8($itr2)");
1850        &poly_mul_mulx(); $code.="
1851        lea 4*8($itr2), $itr2\n";
        # Emit whatever is left in @loop_body, then refill it for the next
        # interleaved loop.
1852        foreach $l (@loop_body) {$code.=$l."\n";}
1853        @loop_body = split /\n/, $chacha_body; $code.="
1854        inc $itr1
1855        cmp \$4, $itr1
1856    jl  1b
1857        cmp \$10, $itr1
1858    jne 2b
1859    mov $inl, $itr1
1860    sub \$12*32, $itr1
1861    and \$-16, $itr1
18621:
1863        test $itr1, $itr1
1864        je 1f\n";
        # Hash the remaining whole 16-byte blocks; $itr1 holds
        # ($inl - 12*32) rounded down to a multiple of 16 and is counted down.
1865        &poly_add("0*8($itr2)");
1866        &poly_mul_mulx(); $code.="
1867        lea 2*8($itr2), $itr2
1868        sub \$2*8, $itr1
1869    jmp 1b
18701:  \n";
    # States 3, 2 and 1 cover the first 12*32 bytes. $A0 is parked in
    # $tmp_store while it serves as the helper register of the first XOR.
    # The string opened after finish_stream_avx2 also emits
    # open_avx2_tail_loop, open_avx2_tail and the start of open_avx2_192.
1871    &finalize_state_avx2(4); $code.="
1872    vmovdqa $A0, $tmp_store\n";
1873    &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
1874    vmovdqa $tmp_store, $A0\n";
1875    &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
1876    &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
1877    &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
1878    lea 12*32($inp), $inp
1879    lea 12*32($oup), $oup
1880    sub \$12*32, $inl
1881open_avx2_tail_loop:
1882    cmp \$32, $inl
1883    jb open_avx2_tail
1884        sub \$32, $inl
1885        vpxor ($inp), $A0, $A0
1886        vmovdqu $A0, ($oup)
1887        lea 1*32($inp), $inp
1888        lea 1*32($oup), $oup
1889        vmovdqa $B0, $A0
1890        vmovdqa $C0, $B0
1891        vmovdqa $D0, $C0
1892    jmp open_avx2_tail_loop
1893open_avx2_tail:
1894    cmp \$16, $inl
1895    vmovdqa $A0x, $A1x
1896    jb 1f
1897    sub \$16, $inl
1898    #load for decryption
1899    vpxor ($inp), $A0x, $A1x
1900    vmovdqu $A1x, ($oup)
1901    lea 1*16($inp), $inp
1902    lea 1*16($oup), $oup
1903    vperm2i128 \$0x11, $A0, $A0, $A0
1904    vmovdqa $A0x, $A1x
19051:
1906    vzeroupper
1907    jmp open_sse_tail_16
1908###############################################################################
1909open_avx2_192:
1910    vmovdqa $A0, $A1
1911    vmovdqa $A0, $A2
1912    vmovdqa $B0, $B1
1913    vmovdqa $B0, $B2
1914    vmovdqa $C0, $C1
1915    vmovdqa $C0, $C2
1916    vpaddd .avx2_inc(%rip), $D0, $D1
1917    vmovdqa $D0, $T2
1918    vmovdqa $D1, $T3
1919    mov \$10, $acc0
19201:  \n";
# open_avx2_192 round loop (10 iterations counted down in $acc0, two states),
# followed by the shared short-input path (open_avx2_short,
# open_avx2_hash_and_xor_loop, open_avx2_short_tail_32) and the register setup
# of open_avx2_320.
1921        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
1922        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
1923        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
1924        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
1925        dec $acc0
1926    jne 1b
1927    vpaddd $A2, $A0, $A0
1928    vpaddd $A2, $A1, $A1
1929    vpaddd $B2, $B0, $B0
1930    vpaddd $B2, $B1, $B1
1931    vpaddd $C2, $C0, $C0
1932    vpaddd $C2, $C1, $C1
1933    vpaddd $T2, $D0, $D0
1934    vpaddd $T3, $D1, $D1
1935    vperm2i128 \$0x02, $A0, $B0, $T0
1936    # Clamp and store the key
1937    vpand .clamp(%rip), $T0, $T0
1938    vmovdqa $T0, $r_store
1939    # Stream for up to 192 bytes
1940    vperm2i128 \$0x13, $A0, $B0, $A0
1941    vperm2i128 \$0x13, $C0, $D0, $B0
1942    vperm2i128 \$0x02, $A1, $B1, $C0
1943    vperm2i128 \$0x02, $C1, $D1, $D0
1944    vperm2i128 \$0x13, $A1, $B1, $A1
1945    vperm2i128 \$0x13, $C1, $D1, $B1
1946open_avx2_short:
1947    mov %r8, $itr2
1948    call poly_hash_ad_internal
1949open_avx2_hash_and_xor_loop:
1950        cmp \$32, $inl
1951        jb open_avx2_short_tail_32
1952        sub \$32, $inl\n";
1953        # Load + hash
        # Two 16-byte blocks of input are hashed before the same 32 bytes are
        # XORed with the keystream in $A0.
1954        &poly_add("0*8($inp)");
1955        &poly_mul();
1956        &poly_add("2*8($inp)");
1957        &poly_mul(); $code.="
1958        # Load + decrypt
1959        vpxor ($inp), $A0, $A0
1960        vmovdqu $A0, ($oup)
1961        lea 1*32($inp), $inp
1962        lea 1*32($oup), $oup
1963        # Shift stream
1964        vmovdqa $B0, $A0
1965        vmovdqa $C0, $B0
1966        vmovdqa $D0, $C0
1967        vmovdqa $A1, $D0
1968        vmovdqa $B1, $A1
1969        vmovdqa $C1, $B1
1970        vmovdqa $D1, $C1
1971        vmovdqa $A2, $D1
1972        vmovdqa $B2, $A2
1973    jmp open_avx2_hash_and_xor_loop
1974open_avx2_short_tail_32:
1975    cmp \$16, $inl
1976    vmovdqa $A0x, $A1x
1977    jb 1f
1978    sub \$16, $inl\n";
    # A remaining 16-byte block is hashed, then decrypted with the low half of
    # $A0.
1979    &poly_add("0*8($inp)");
1980    &poly_mul(); $code.="
1981    vpxor ($inp), $A0x, $A3x
1982    vmovdqu $A3x, ($oup)
1983    lea 1*16($inp), $inp
1984    lea 1*16($oup), $oup
1985    vextracti128 \$1, $A0, $A1x
19861:
1987    vzeroupper
1988    jmp open_sse_tail_16
1989###############################################################################
1990open_avx2_320:
1991    vmovdqa $A0, $A1
1992    vmovdqa $A0, $A2
1993    vmovdqa $B0, $B1
1994    vmovdqa $B0, $B2
1995    vmovdqa $C0, $C1
1996    vmovdqa $C0, $C2
1997    vpaddd .avx2_inc(%rip), $D0, $D1
1998    vpaddd .avx2_inc(%rip), $D1, $D2
1999    vmovdqa $B0, $T1
2000    vmovdqa $C0, $T2
2001    vmovdqa $D0, $ctr0_store
2002    vmovdqa $D1, $ctr1_store
2003    vmovdqa $D2, $ctr2_store
2004    mov \$10, $acc0
20051:  \n";
2006        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
2007        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
2008        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
2009        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
2010        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
2011        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
2012        dec $acc0
2013    jne 1b
2014    vpaddd .chacha20_consts(%rip), $A0, $A0
2015    vpaddd .chacha20_consts(%rip), $A1, $A1
2016    vpaddd .chacha20_consts(%rip), $A2, $A2
2017    vpaddd $T1, $B0, $B0
2018    vpaddd $T1, $B1, $B1
2019    vpaddd $T1, $B2, $B2
2020    vpaddd $T2, $C0, $C0
2021    vpaddd $T2, $C1, $C1
2022    vpaddd $T2, $C2, $C2
2023    vpaddd $ctr0_store, $D0, $D0
2024    vpaddd $ctr1_store, $D1, $D1
2025    vpaddd $ctr2_store, $D2, $D2
2026    vperm2i128 \$0x02, $A0, $B0, $T0
2027    # Clamp and store the key
2028    vpand .clamp(%rip), $T0, $T0
2029    vmovdqa $T0, $r_store
2030    # Stream for up to 320 bytes
2031    vperm2i128 \$0x13, $A0, $B0, $A0
2032    vperm2i128 \$0x13, $C0, $D0, $B0
2033    vperm2i128 \$0x02, $A1, $B1, $C0
2034    vperm2i128 \$0x02, $C1, $D1, $D0
2035    vperm2i128 \$0x13, $A1, $B1, $A1
2036    vperm2i128 \$0x13, $C1, $D1, $B1
2037    vperm2i128 \$0x02, $A2, $B2, $C1
2038    vperm2i128 \$0x02, $C2, $D2, $D1
2039    vperm2i128 \$0x13, $A2, $B2, $A2
2040    vperm2i128 \$0x13, $C2, $D2, $B2
2041    jmp open_avx2_short
2042.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
2043###############################################################################
2044###############################################################################
2045.type chacha20_poly1305_seal_avx2,\@function,2
2046.align 64
2047chacha20_poly1305_seal_avx2:
2048    vzeroupper
2049    vmovdqa .chacha20_consts(%rip), $A0
2050    vbroadcasti128 0*16($keyp), $B0
2051    vbroadcasti128 1*16($keyp), $C0
2052    vbroadcasti128 2*16($keyp), $D0
2053    vpaddd .avx2_init(%rip), $D0, $D0
2054    cmp \$6*32, $inl
2055    jbe seal_avx2_192
2056    cmp \$10*32, $inl
2057    jbe seal_avx2_320
2058    vmovdqa $A0, $A1
2059    vmovdqa $A0, $A2
2060    vmovdqa $A0, $A3
2061    vmovdqa $B0, $B1
2062    vmovdqa $B0, $B2
2063    vmovdqa $B0, $B3
2064    vmovdqa $B0, $state1_store
2065    vmovdqa $C0, $C1
2066    vmovdqa $C0, $C2
2067    vmovdqa $C0, $C3
2068    vmovdqa $C0, $state2_store
2069    vmovdqa $D0, $D3
2070    vpaddd .avx2_inc(%rip), $D3, $D2
2071    vpaddd .avx2_inc(%rip), $D2, $D1
2072    vpaddd .avx2_inc(%rip), $D1, $D0
2073    vmovdqa $D0, $ctr0_store
2074    vmovdqa $D1, $ctr1_store
2075    vmovdqa $D2, $ctr2_store
2076    vmovdqa $D3, $ctr3_store
2077    mov \$10, $acc0
20781:  \n";
2079        foreach $l (@loop_body) {$code.=$l."\n";}
2080        @loop_body = split /\n/, $chacha_body; $code.="
2081        dec $acc0
2082        jnz 1b\n";
2083    &finalize_state_avx2(4); $code.="
2084    vperm2i128 \$0x13, $C3, $D3, $C3
2085    vperm2i128 \$0x02, $A3, $B3, $D3
2086    vperm2i128 \$0x13, $A3, $B3, $A3
2087    vpand .clamp(%rip), $D3, $D3
2088    vmovdqa $D3, $r_store
2089    mov %r8, $itr2
2090    call poly_hash_ad_internal
2091    # Safely store 320 bytes (otherwise would handle with optimized call)
2092    vpxor 0*32($inp), $A3, $A3
2093    vpxor 1*32($inp), $C3, $C3
2094    vmovdqu $A3, 0*32($oup)
2095    vmovdqu $C3, 1*32($oup)\n";
2096    &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
2097    &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
2098    &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
2099    lea 10*32($inp), $inp
2100    sub \$10*32, $inl
2101    mov \$10*32, $itr1
2102    cmp \$4*32, $inl
2103    jbe seal_avx2_hash
2104    vpxor 0*32($inp), $A0, $A0
2105    vpxor 1*32($inp), $B0, $B0
2106    vpxor 2*32($inp), $C0, $C0
2107    vpxor 3*32($inp), $D0, $D0
2108    vmovdqu $A0, 10*32($oup)
2109    vmovdqu $B0, 11*32($oup)
2110    vmovdqu $C0, 12*32($oup)
2111    vmovdqu $D0, 13*32($oup)
2112    lea 4*32($inp), $inp
2113    sub \$4*32, $inl
2114    mov \$8, $itr1
2115    mov \$2, $itr2
2116    cmp \$4*32, $inl
2117    jbe seal_avx2_tail_128
2118    cmp \$8*32, $inl
2119    jbe seal_avx2_tail_256
2120    cmp \$12*32, $inl
2121    jbe seal_avx2_tail_384
2122    cmp \$16*32, $inl
2123    jbe seal_avx2_tail_512\n";
2124    # We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
2125    &prep_state_avx2(4);
2126    foreach $l (@loop_body) {$code.=$l."\n";}
2127    @loop_body = split /\n/, $chacha_body;
2128    &emit_body(41);
2129    @loop_body = split /\n/, $chacha_body; $code.="
2130    sub \$16, $oup
2131    mov \$9, $itr1
2132    jmp 4f
21331:  \n";
2134        &prep_state_avx2(4); $code.="
2135        mov \$10, $itr1
21362:  \n";
2137            &poly_add("0*8($oup)");
2138            &emit_body(10);
2139            &poly_stage1_mulx();
2140            &emit_body(9);
2141            &poly_stage2_mulx();
2142            &emit_body(12);
2143            &poly_stage3_mulx();
2144            &emit_body(10);
2145            &poly_reduce_stage(); $code.="
21464:  \n";
2147            &emit_body(9);
2148            &poly_add("2*8($oup)");
2149            &emit_body(8);
2150            &poly_stage1_mulx();
2151            &emit_body(18);
2152            &poly_stage2_mulx();
2153            &emit_body(18);
2154            &poly_stage3_mulx();
2155            &emit_body(9);
2156            &poly_reduce_stage();
2157            &emit_body(8);
2158            &poly_add("4*8($oup)"); $code.="
2159            lea 6*8($oup), $oup\n";
2160            &emit_body(18);
2161            &poly_stage1_mulx();
2162            &emit_body(8);
2163            &poly_stage2_mulx();
2164            &emit_body(8);
2165            &poly_stage3_mulx();
2166            &emit_body(18);
2167            &poly_reduce_stage();
2168            foreach $l (@loop_body) {$code.=$l."\n";}
2169            @loop_body = split /\n/, $chacha_body; $code.="
2170            dec $itr1
2171        jne 2b\n";
2172        &finalize_state_avx2(4); $code.="
2173        lea 4*8($oup), $oup
2174        vmovdqa $A0, $tmp_store\n";
2175        &poly_add("-4*8($oup)");
2176        &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
2177        vmovdqa $tmp_store, $A0\n";
2178        &poly_mul();
2179        &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
2180        &poly_add("-2*8($oup)");
2181        &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
2182        &poly_mul();
2183        &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
2184        lea 16*32($inp), $inp
2185        sub \$16*32, $inl
2186        cmp \$16*32, $inl
2187    jg 1b\n";
2188    &poly_add("0*8($oup)");
2189    &poly_mul();
2190    &poly_add("2*8($oup)");
2191    &poly_mul(); $code.="
2192    lea 4*8($oup), $oup
2193    mov \$10, $itr1
2194    xor $itr2, $itr2
2195    cmp \$4*32, $inl
2196    ja 3f
2197###############################################################################
2198seal_avx2_tail_128:\n";
2199    &prep_state_avx2(1); $code.="
22001:  \n";
2201        &poly_add("0($oup)");
2202        &poly_mul(); $code.="
2203        lea 2*8($oup), $oup
22042:  \n";
2205        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
2206        &poly_add("0*8($oup)");
2207        &poly_mul();
2208        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
2209        &poly_add("2*8($oup)");
2210        &poly_mul(); $code.="
2211        lea 4*8($oup), $oup
2212        dec $itr1
2213    jg 1b
2214        dec $itr2
2215    jge 2b\n";
2216    &finalize_state_avx2(1);
2217    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
2218    jmp seal_avx2_short_loop
22193:
2220    cmp \$8*32, $inl
2221    ja 3f
2222###############################################################################
2223seal_avx2_tail_256:\n";
2224    &prep_state_avx2(2); $code.="
22251:  \n";
2226        &poly_add("0($oup)");
2227        &poly_mul(); $code.="
2228        lea 2*8($oup), $oup
22292:  \n";
2230        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
2231        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
2232        &poly_add("0*8($oup)");
2233        &poly_mul();
2234        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
2235        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
2236        &poly_add("2*8($oup)");
2237        &poly_mul(); $code.="
2238        lea 4*8($oup), $oup
2239        dec $itr1
2240    jg 1b
2241        dec $itr2
2242    jge 2b\n";
2243    &finalize_state_avx2(2);
2244    &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
2245    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
2246    mov \$4*32, $itr1
2247    lea 4*32($inp), $inp
2248    sub \$4*32, $inl
2249    jmp seal_avx2_hash
22503:
2251    cmp \$12*32, $inl
2252    ja seal_avx2_tail_512
2253###############################################################################
2254seal_avx2_tail_384:\n";
2255    &prep_state_avx2(3); $code.="
22561:  \n";
2257        &poly_add("0($oup)");
2258        &poly_mul(); $code.="
2259        lea 2*8($oup), $oup
22602:  \n";
2261        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
2262        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
2263        &poly_add("0*8($oup)");
2264        &poly_mul();
2265        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
2266        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
2267        &poly_add("2*8($oup)");
2268        &poly_mul();
2269        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
2270        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
2271        lea 4*8($oup), $oup
2272        dec $itr1
2273    jg 1b
2274        dec $itr2
2275    jge 2b\n";
2276    &finalize_state_avx2(3);
2277    &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
2278    &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
2279    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
2280    mov \$8*32, $itr1
2281    lea 8*32($inp), $inp
2282    sub \$8*32, $inl
2283    jmp seal_avx2_hash
2284###############################################################################
2285seal_avx2_tail_512:\n";
2286    &prep_state_avx2(4); $code.="
22871:  \n";
2288        &poly_add("0($oup)");
2289        &poly_mul_mulx(); $code.="
2290        lea 2*8($oup), $oup
22912:  \n";
2292        &emit_body(20);
2293        &poly_add("0*8($oup)");
2294        &emit_body(20);
2295        &poly_stage1_mulx();
2296        &emit_body(20);
2297        &poly_stage2_mulx();
2298        &emit_body(20);
2299        &poly_stage3_mulx();
2300        &emit_body(20);
2301        &poly_reduce_stage();
2302        &emit_body(20);
2303        &poly_add("2*8($oup)");
2304        &emit_body(20);
2305        &poly_stage1_mulx();
2306        &emit_body(20);
2307        &poly_stage2_mulx();
2308        &emit_body(20);
2309        &poly_stage3_mulx();
2310        &emit_body(20);
2311        &poly_reduce_stage();
2312        foreach $l (@loop_body) {$code.=$l."\n";}
2313        @loop_body = split /\n/, $chacha_body; $code.="
2314        lea 4*8($oup), $oup
2315        dec $itr1
2316    jg 1b
2317        dec $itr2
2318    jge 2b\n";
2319    &finalize_state_avx2(4); $code.="
2320    vmovdqa $A0, $tmp_store\n";
2321    &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
2322    vmovdqa $tmp_store, $A0\n";
2323    &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
2324    &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
2325    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
2326    mov \$12*32, $itr1
2327    lea 12*32($inp), $inp
2328    sub \$12*32, $inl
2329    jmp seal_avx2_hash
2330################################################################################
2331seal_avx2_320:
2332    vmovdqa $A0, $A1
2333    vmovdqa $A0, $A2
2334    vmovdqa $B0, $B1
2335    vmovdqa $B0, $B2
2336    vmovdqa $C0, $C1
2337    vmovdqa $C0, $C2
2338    vpaddd .avx2_inc(%rip), $D0, $D1
2339    vpaddd .avx2_inc(%rip), $D1, $D2
2340    vmovdqa $B0, $T1
2341    vmovdqa $C0, $T2
2342    vmovdqa $D0, $ctr0_store
2343    vmovdqa $D1, $ctr1_store
2344    vmovdqa $D2, $ctr2_store
2345    mov \$10, $acc0
23461:  \n";
2347        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
2348        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
2349        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
2350        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
2351        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
2352        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
2353        dec $acc0
2354    jne 1b
2355    vpaddd .chacha20_consts(%rip), $A0, $A0
2356    vpaddd .chacha20_consts(%rip), $A1, $A1
2357    vpaddd .chacha20_consts(%rip), $A2, $A2
2358    vpaddd $T1, $B0, $B0
2359    vpaddd $T1, $B1, $B1
2360    vpaddd $T1, $B2, $B2
2361    vpaddd $T2, $C0, $C0
2362    vpaddd $T2, $C1, $C1
2363    vpaddd $T2, $C2, $C2
2364    vpaddd $ctr0_store, $D0, $D0
2365    vpaddd $ctr1_store, $D1, $D1
2366    vpaddd $ctr2_store, $D2, $D2
2367    vperm2i128 \$0x02, $A0, $B0, $T0
2368    # Clamp and store the key
2369    vpand .clamp(%rip), $T0, $T0
2370    vmovdqa $T0, $r_store
2371    # Stream for up to 320 bytes
2372    vperm2i128 \$0x13, $A0, $B0, $A0
2373    vperm2i128 \$0x13, $C0, $D0, $B0
2374    vperm2i128 \$0x02, $A1, $B1, $C0
2375    vperm2i128 \$0x02, $C1, $D1, $D0
2376    vperm2i128 \$0x13, $A1, $B1, $A1
2377    vperm2i128 \$0x13, $C1, $D1, $B1
2378    vperm2i128 \$0x02, $A2, $B2, $C1
2379    vperm2i128 \$0x02, $C2, $D2, $D1
2380    vperm2i128 \$0x13, $A2, $B2, $A2
2381    vperm2i128 \$0x13, $C2, $D2, $B2
2382    jmp seal_avx2_short
2383################################################################################
2384seal_avx2_192:
2385    vmovdqa $A0, $A1
2386    vmovdqa $A0, $A2
2387    vmovdqa $B0, $B1
2388    vmovdqa $B0, $B2
2389    vmovdqa $C0, $C1
2390    vmovdqa $C0, $C2
2391    vpaddd .avx2_inc(%rip), $D0, $D1
2392    vmovdqa $D0, $T2
2393    vmovdqa $D1, $T3
2394    mov \$10, $acc0
23951:  \n";
2396        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
2397        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
2398        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
2399        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
2400        dec $acc0
2401    jne 1b
2402    vpaddd $A2, $A0, $A0
2403    vpaddd $A2, $A1, $A1
2404    vpaddd $B2, $B0, $B0
2405    vpaddd $B2, $B1, $B1
2406    vpaddd $C2, $C0, $C0
2407    vpaddd $C2, $C1, $C1
2408    vpaddd $T2, $D0, $D0
2409    vpaddd $T3, $D1, $D1
2410    vperm2i128 \$0x02, $A0, $B0, $T0
2411    # Clamp and store the key
2412    vpand .clamp(%rip), $T0, $T0
2413    vmovdqa $T0, $r_store
2414    # Stream for up to 192 bytes
2415    vperm2i128 \$0x13, $A0, $B0, $A0
2416    vperm2i128 \$0x13, $C0, $D0, $B0
2417    vperm2i128 \$0x02, $A1, $B1, $C0
2418    vperm2i128 \$0x02, $C1, $D1, $D0
2419    vperm2i128 \$0x13, $A1, $B1, $A1
2420    vperm2i128 \$0x13, $C1, $D1, $B1
2421seal_avx2_short:
2422    mov %r8, $itr2
2423    call poly_hash_ad_internal
2424    xor $itr1, $itr1
2425seal_avx2_hash:
2426        cmp \$16, $itr1
2427        jb seal_avx2_short_loop\n";
2428        &poly_add("0($oup)");
2429        &poly_mul(); $code.="
2430        sub \$16, $itr1
2431        add \$16, $oup
2432    jmp seal_avx2_hash
2433seal_avx2_short_loop:
2434        cmp \$32, $inl
2435        jb seal_avx2_short_tail
2436        sub \$32, $inl
2437        # Encrypt
2438        vpxor ($inp), $A0, $A0
2439        vmovdqu $A0, ($oup)
2440        lea 1*32($inp), $inp
2441        # Load + hash\n";
2442        &poly_add("0*8($oup)");
2443        &poly_mul();
2444        &poly_add("2*8($oup)");
2445        &poly_mul(); $code.="
2446        lea 1*32($oup), $oup
2447        # Shift stream
2448        vmovdqa $B0, $A0
2449        vmovdqa $C0, $B0
2450        vmovdqa $D0, $C0
2451        vmovdqa $A1, $D0
2452        vmovdqa $B1, $A1
2453        vmovdqa $C1, $B1
2454        vmovdqa $D1, $C1
2455        vmovdqa $A2, $D1
2456        vmovdqa $B2, $A2
2457    jmp seal_avx2_short_loop
2458seal_avx2_short_tail:
2459    cmp \$16, $inl
2460    jb 1f
2461    sub \$16, $inl
2462    vpxor ($inp), $A0x, $A3x
2463    vmovdqu $A3x, ($oup)
2464    lea 1*16($inp), $inp\n";
2465    &poly_add("0*8($oup)");
2466    &poly_mul(); $code.="
2467    lea 1*16($oup), $oup
2468    vextracti128 \$1, $A0, $A0x
24691:
2470    vzeroupper
2471    jmp seal_sse_tail_16
2472.cfi_endproc
2473";
2474}
2475
# Emit the generated assembly. STDOUT was aliased near the top of this
# script to a pipe feeding the x86_64-xlate.pl translator, so everything
# printed below is consumed by that child process.
if (!$win64) {
  # Replace every `...` span in the accumulated assembly text with the
  # result of evaluating its contents as a Perl expression.
  $code =~ s/\`([^\`]*)\`/eval $1/gem;
  print $code;
} else {
  # Win64 flavours do not receive the generated code; only a placeholder
  # symbol consisting of a single `ret` is emitted.
  print <<___;
.text
.globl dummy_chacha20_poly1305_asm
.type dummy_chacha20_poly1305_asm,\@abi-omnipotent
dummy_chacha20_poly1305_asm:
    ret
___
}

# Closing a piped handle waits for the child. It fails either on an OS-level
# error (reported in $!) or because the child exited non-zero, in which case
# $! is 0 and the wait status is in $?. Report whichever applies so a failed
# build shows its cause.
close STDOUT
  or die "error closing STDOUT: "
       . ($! != 0 ? "$!" : "translator wait status $?");
2490