1#!/usr/bin/env perl
2
3# Copyright (c) 2020, CloudFlare Ltd.
4#
5# Permission to use, copy, modify, and/or distribute this software for any
6# purpose with or without fee is hereby granted, provided that the above
7# copyright notice and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
12# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
14# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17##############################################################################
18#                                                                            #
19# Author:  Vlad Krasnov                                                      #
20#                                                                            #
21##############################################################################
22
23$flavour = shift;
24while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
25
26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
28( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
29die "can't locate arm-xlate.pl";
30
31open OUT,"| \"$^X\" $xlate $flavour $output";
32*STDOUT=*OUT;
33
34my ($oup,$inp,$inl,$adp,$adl,$keyp,$itr1,$itr2) = ("x0","x1","x2","x3","x4","x5","x6","x7");
35my ($acc0,$acc1,$acc2) = map("x$_",(8..10));
36my ($t0,$t1,$t2,$t3) = map("x$_",(11..14));
37my ($one, $r0, $r1) = ("x15","x16","x17");
38my ($t0w) = $t0 =~ s/x/w/r;
39
40my ($A0,$A1,$A2,$A3,$A4,$B0,$B1,$B2,$B3,$B4,$C0,$C1,$C2,$C3,$C4,$D0,$D1,$D2,$D3,$D4) = map("v$_",(0..19));
41my ($T0,$T1,$T2,$T3) = map("v$_",(20..23));
42
43my $CONSTS = "v24";
44my $INC = "v25";
45my $ROL8 = "v26";
46my $CLAMP = "v27";
47
48my ($B_STORE, $C_STORE, $D_STORE) = map("v$_",(28..30));
49
50my $S_STORE = $CLAMP;
51my $LEN_STORE = "v31";
52
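# chacha_qr emits one vectorized ChaCha20 quarter round over whole 128-bit
# registers.  For reference, the scalar quarter round it mirrors is:
#
#   a += b; d ^= a; d = rotl32(d, 16);
#   c += d; b ^= c; b = rotl32(b, 12);
#   a += b; d ^= a; d = rotl32(d, 8);
#   c += d; b ^= c; b = rotl32(b, 7);
#
# The rotate by 16 is a rev32 on halfwords, the rotate by 8 a tbl through
# $ROL8, and the rotates by 12 and 7 use ushr+sli into a spare register,
# after which $t and $b are swapped in Perl so no extra mov is needed.  The
# trailing ext instructions move the b/c/d rows into diagonal position after
# the column round ("left") and back again after the diagonal round ("right").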
53sub chacha_qr {
54my ($a,$b,$c,$d,$t,$dir)=@_;
55my ($shift_b,$shift_d) = $dir =~ /left/ ? ("#4","#12") : ("#12","#4");
56$code.=<<___;
57    add   $a.4s, $a.4s, $b.4s
58    eor   $d.16b, $d.16b, $a.16b
59    rev32 $d.8h, $d.8h
60
61    add   $c.4s, $c.4s, $d.4s
62    eor   $b.16b, $b.16b, $c.16b
63    ushr  $t.4s, $b.4s, #20
64    sli   $t.4s, $b.4s, #12
65___
66    ($t,$b) = ($b,$t);
67$code.=<<___;
68    add   $a.4s, $a.4s, $b.4s
69    eor   $d.16b, $d.16b, $a.16b
70    tbl   $d.16b, {$d.16b}, $ROL8.16b
71
72    add   $c.4s, $c.4s, $d.4s
73    eor   $b.16b, $b.16b, $c.16b
74    ushr  $t.4s, $b.4s, #25
75    sli   $t.4s, $b.4s, #7
76___
77    ($t,$b) = ($b,$t);
78$code.=<<___;
79    ext $b.16b, $b.16b, $b.16b, $shift_b
80    ext $c.16b, $c.16b, $c.16b, #8
81    ext $d.16b, $d.16b, $d.16b, $shift_d
82___
83}
84
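# poly_add and poly_add_vec fold one 16-byte block into the Poly1305
# accumulator [acc2:acc1:acc0].  $one holds the constant 1, so the final adc
# also adds the mandatory 1 bit at position 128 of each block.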
85sub poly_add {
86my ($src)=@_;
87$code.="ldp  $t0, $t1, [$src], 16
88        adds $acc0, $acc0, $t0
89        adcs $acc1, $acc1, $t1
90        adc  $acc2, $acc2, $one\n";
91}
92
93sub poly_add_vec {
94my ($src)=@_;
95$code.="mov  $t0, $src.d[0]
96        mov  $t1, $src.d[1]
97        adds $acc0, $acc0, $t0
98        adcs $acc1, $acc1, $t1
99        adc  $acc2, $acc2, $one\n";
100}
101
102sub poly_stage1 {
103$code.="mul   $t0, $acc0, $r0     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
104        umulh $t1, $acc0, $r0
105        mul   $t2, $acc1, $r0
106        umulh $t3, $acc1, $r0
107        adds  $t1, $t1, $t2
108        mul   $t2, $acc2, $r0
109        adc   $t2, $t2, $t3\n";
110}
111
112sub poly_stage2 {
113$code.="mul   $t3, $acc0, $r1       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
114        umulh $acc0, $acc0, $r1
115        adds  $t1, $t1, $t3
116        mul   $t3, $acc1, $r1
117        umulh $acc1, $acc1, $r1
118        adcs  $t3, $t3, $acc0
119        mul   $acc2, $acc2, $r1
120        adc   $acc2, $acc2, $acc1
121        adds  $t2, $t2, $t3
122        adc   $t3, $acc2, xzr\n";
123}
124
# At the beginning of the reduce stage, t = [t3:t2:t1:t0] is the product of
# r = [r1:r0] and acc = [acc2:acc1:acc0].
# r is at most 124 bits (due to clamping) and acc is at most 131 bits
# (acc2 is at most 4 before the addition and can be at most 6 once the next
# block is added in), therefore t is at most 255 bits wide and t3 is at most 63 bits.
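#
# The reduction relies on 2^130 = 5 (mod 2^130 - 5).  Writing
# t = t_lo + 2^130 * t_hi (with t_lo the low 130 bits), we get
# t = t_lo + 5*t_hi = t_lo + 4*t_hi + t_hi (mod 2^130 - 5).
# As a rough big-integer sketch of what the limb code below computes
# (illustrative only, not emitted):
#
#   $t_lo = $t & ((1 << 130) - 1);
#   $t_hi = $t >> 130;
#   $acc  = $t_lo + 4*$t_hi + $t_hi;   # fits in 131 bits, reduced fully at the end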
130sub poly_reduce_stage {
131$code.="and  $acc2, $t2, #3         // At this point acc2 is 2 bits at most (value of 3)
132        and  $acc0, $t2, #-4
133        extr $t2, $t3, $t2, #2
134        adds $acc0, $acc0, $t0
135        lsr  $t0, $t3, #2
136        adc  $acc1, $t3, $t0        // No carry out since t0 is 61 bits and t3 is 63 bits
137        adds $acc0, $acc0, $t2
138        adcs $acc1, $acc1, $t1
139        adc  $acc2, $acc2, xzr      // At this point acc2 has the value of 4 at most \n";
140}
141
142sub poly_mul {
143    &poly_stage1();
144    &poly_stage2();
145    &poly_reduce_stage();
146}
147
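# Three-way interleaved quarter round, used on the small-buffer (up to 128
# bytes) paths and on the 192-byte open tail: three independent blocks are
# processed at once with their instructions interleaved to hide latency.  As
# in chacha_qr, $dir selects the column ("left") or diagonal ("right") half
# of a double round.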
148sub chacha_qr_x3 {
149my ($dir)=@_;
150my ($shift_b,$shift_d) = $dir =~ /left/ ? ("#4","#12") : ("#12","#4");
151$code.=<<___;
152    add   $A0.4s, $A0.4s, $B0.4s
153    add   $A1.4s, $A1.4s, $B1.4s
154    add   $A2.4s, $A2.4s, $B2.4s
155    eor   $D0.16b, $D0.16b, $A0.16b
156    eor   $D1.16b, $D1.16b, $A1.16b
157    eor   $D2.16b, $D2.16b, $A2.16b
158    rev32 $D0.8h, $D0.8h
159    rev32 $D1.8h, $D1.8h
160    rev32 $D2.8h, $D2.8h
161
162    add   $C0.4s, $C0.4s, $D0.4s
163    add   $C1.4s, $C1.4s, $D1.4s
164    add   $C2.4s, $C2.4s, $D2.4s
165    eor   $B0.16b, $B0.16b, $C0.16b
166    eor   $B1.16b, $B1.16b, $C1.16b
167    eor   $B2.16b, $B2.16b, $C2.16b
168    ushr  $T0.4s, $B0.4s, #20
169    sli   $T0.4s, $B0.4s, #12
170    ushr  $B0.4s, $B1.4s, #20
171    sli   $B0.4s, $B1.4s, #12
172    ushr  $B1.4s, $B2.4s, #20
173    sli   $B1.4s, $B2.4s, #12
174
175    add   $A0.4s, $A0.4s, $T0.4s
176    add   $A1.4s, $A1.4s, $B0.4s
177    add   $A2.4s, $A2.4s, $B1.4s
178    eor   $D0.16b, $D0.16b, $A0.16b
179    eor   $D1.16b, $D1.16b, $A1.16b
180    eor   $D2.16b, $D2.16b, $A2.16b
181    tbl   $D0.16b, {$D0.16b}, $ROL8.16b
182    tbl   $D1.16b, {$D1.16b}, $ROL8.16b
183    tbl   $D2.16b, {$D2.16b}, $ROL8.16b
184
185    add   $C0.4s, $C0.4s, $D0.4s
186    add   $C1.4s, $C1.4s, $D1.4s
187    add   $C2.4s, $C2.4s, $D2.4s
188    eor   $T0.16b, $T0.16b, $C0.16b
189    eor   $B0.16b, $B0.16b, $C1.16b
190    eor   $B1.16b, $B1.16b, $C2.16b
191    ushr  $B2.4s, $B1.4s, #25
192    sli   $B2.4s, $B1.4s, #7
193    ushr  $B1.4s, $B0.4s, #25
194    sli   $B1.4s, $B0.4s, #7
195    ushr  $B0.4s, $T0.4s, #25
196    sli   $B0.4s, $T0.4s, #7
197
198    ext $B0.16b, $B0.16b, $B0.16b, $shift_b
199    ext $B1.16b, $B1.16b, $B1.16b, $shift_b
200    ext $B2.16b, $B2.16b, $B2.16b, $shift_b
201
202    ext $C0.16b, $C0.16b, $C0.16b, #8
203    ext $C1.16b, $C1.16b, $C1.16b, #8
204    ext $C2.16b, $C2.16b, $C2.16b, #8
205
206    ext $D0.16b, $D0.16b, $D0.16b, $shift_d
207    ext $D1.16b, $D1.16b, $D1.16b, $shift_d
208    ext $D2.16b, $D2.16b, $D2.16b, $shift_d
209___
210}
211
# When preparing 5 ChaCha20 blocks in parallel, we operate on 4 blocks vertically,
# in the interleaved form introduced by Andrew Moon; the fifth block is processed
# horizontally.
214sub chacha_qr_x5 {
215my ($dir)=@_;
216my ($a0,$a1,$a2,$a3) = $dir =~ /left/ ? ($A0,$A1,$A2,$A3) : ($A0,$A1,$A2,$A3);
217my ($b0,$b1,$b2,$b3) = $dir =~ /left/ ? ($B0,$B1,$B2,$B3) : ($B1,$B2,$B3,$B0);
218my ($c0,$c1,$c2,$c3) = $dir =~ /left/ ? ($C0,$C1,$C2,$C3) : ($C2,$C3,$C0,$C1);
219my ($d0,$d1,$d2,$d3) = $dir =~ /left/ ? ($D0,$D1,$D2,$D3) : ($D3,$D0,$D1,$D2);
220my ($shift_b,$shift_d) = $dir =~ /left/ ? ("#4","#12") : ("#12","#4");
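# For the four interleaved blocks the diagonal round needs no data movement:
# the "right" direction simply renames which B/C/D registers pair with each A
# register (the A assignments are identical in both directions).  Only the
# fifth, horizontally-held block is rotated with ext, exactly as in chacha_qr.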
221$code.=<<___;
222    add   $a0.4s, $a0.4s, $b0.4s
223    add   $a1.4s, $a1.4s, $b1.4s
224    add   $a2.4s, $a2.4s, $b2.4s
225    add   $a3.4s, $a3.4s, $b3.4s
226    add   $A4.4s, $A4.4s, $B4.4s
227
228    eor   $d0.16b, $d0.16b, $a0.16b
229    eor   $d1.16b, $d1.16b, $a1.16b
230    eor   $d2.16b, $d2.16b, $a2.16b
231    eor   $d3.16b, $d3.16b, $a3.16b
232    eor   $D4.16b, $D4.16b, $A4.16b
233
234    rev32 $d0.8h, $d0.8h
235    rev32 $d1.8h, $d1.8h
236    rev32 $d2.8h, $d2.8h
237    rev32 $d3.8h, $d3.8h
238    rev32 $D4.8h, $D4.8h
239
240    add   $c0.4s, $c0.4s, $d0.4s
241    add   $c1.4s, $c1.4s, $d1.4s
242    add   $c2.4s, $c2.4s, $d2.4s
243    add   $c3.4s, $c3.4s, $d3.4s
244    add   $C4.4s, $C4.4s, $D4.4s
245
246    eor   $b0.16b, $b0.16b, $c0.16b
247    eor   $b1.16b, $b1.16b, $c1.16b
248    eor   $b2.16b, $b2.16b, $c2.16b
249    eor   $b3.16b, $b3.16b, $c3.16b
250    eor   $B4.16b, $B4.16b, $C4.16b
251
252    ushr  $T0.4s, $b0.4s, #20
253    sli   $T0.4s, $b0.4s, #12
254    ushr  $b0.4s, $b1.4s, #20
255    sli   $b0.4s, $b1.4s, #12
256    ushr  $b1.4s, $b2.4s, #20
257    sli   $b1.4s, $b2.4s, #12
258    ushr  $b2.4s, $b3.4s, #20
259    sli   $b2.4s, $b3.4s, #12
260    ushr  $b3.4s, $B4.4s, #20
261    sli   $b3.4s, $B4.4s, #12
262
263    add   $a0.4s, $a0.4s, $T0.4s
264    add   $a1.4s, $a1.4s, $b0.4s
265    add   $a2.4s, $a2.4s, $b1.4s
266    add   $a3.4s, $a3.4s, $b2.4s
267    add   $A4.4s, $A4.4s, $b3.4s
268
269    eor   $d0.16b, $d0.16b, $a0.16b
270    eor   $d1.16b, $d1.16b, $a1.16b
271    eor   $d2.16b, $d2.16b, $a2.16b
272    eor   $d3.16b, $d3.16b, $a3.16b
273    eor   $D4.16b, $D4.16b, $A4.16b
274
275    tbl   $d0.16b, {$d0.16b}, $ROL8.16b
276    tbl   $d1.16b, {$d1.16b}, $ROL8.16b
277    tbl   $d2.16b, {$d2.16b}, $ROL8.16b
278    tbl   $d3.16b, {$d3.16b}, $ROL8.16b
279    tbl   $D4.16b, {$D4.16b}, $ROL8.16b
280
281    add   $c0.4s, $c0.4s, $d0.4s
282    add   $c1.4s, $c1.4s, $d1.4s
283    add   $c2.4s, $c2.4s, $d2.4s
284    add   $c3.4s, $c3.4s, $d3.4s
285    add   $C4.4s, $C4.4s, $D4.4s
286
287    eor   $T0.16b, $T0.16b, $c0.16b
288    eor   $b0.16b, $b0.16b, $c1.16b
289    eor   $b1.16b, $b1.16b, $c2.16b
290    eor   $b2.16b, $b2.16b, $c3.16b
291    eor   $b3.16b, $b3.16b, $C4.16b
292
293    ushr  $B4.4s, $b3.4s, #25
294    sli   $B4.4s, $b3.4s, #7
295    ushr  $b3.4s, $b2.4s, #25
296    sli   $b3.4s, $b2.4s, #7
297    ushr  $b2.4s, $b1.4s, #25
298    sli   $b2.4s, $b1.4s, #7
299    ushr  $b1.4s, $b0.4s, #25
300    sli   $b1.4s, $b0.4s, #7
301    ushr  $b0.4s, $T0.4s, #25
302    sli   $b0.4s, $T0.4s, #7
303
304    ext $B4.16b, $B4.16b, $B4.16b, $shift_b
305    ext $C4.16b, $C4.16b, $C4.16b, #8
306    ext $D4.16b, $D4.16b, $D4.16b, $shift_d
307___
308}
309
310{
311$code.=<<___;
312#include <openssl/arm_arch.h>
313.section .rodata
314
315.align 7
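// Constant pool: the ChaCha20 "expand 32-byte k" words, the per-lane counter
// increments used when four blocks are processed at once, a tbl index vector
// implementing a left-rotate by 8 of every 32-bit lane, and the standard
// Poly1305 clamp mask for r.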
316.Lchacha20_consts:
317.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
318.Linc:
319.long 1,2,3,4
320.Lrol8:
321.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
322.Lclamp:
323.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
324
325.text
326
327.type   .Lpoly_hash_ad_internal,%function
328.align  6
329.Lpoly_hash_ad_internal:
330    .cfi_startproc
331    cbnz $adl, .Lpoly_hash_intro
332    ret
333
334.Lpoly_hash_intro:
335    cmp $adl, #16
336    b.lt .Lpoly_hash_ad_tail
337___
338        &poly_add($adp);
339        &poly_mul();
340$code.=<<___;
341        sub $adl, $adl, #16
342        b .Lpoly_hash_ad_internal
343
344.Lpoly_hash_ad_tail:
345    cbz $adl, .Lpoly_hash_ad_ret
346
347    eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the AAD
348    sub $adl, $adl, #1
349
350.Lpoly_hash_tail_16_compose:
351        ext  $T0.16b, $T0.16b, $T0.16b, #15
352        ldrb $t0w, [$adp, $adl]
353        mov  $T0.b[0], $t0w
354        subs $adl, $adl, #1
355        b.ge .Lpoly_hash_tail_16_compose
356___
357    &poly_add_vec($T0);
358    &poly_mul();
359$code.=<<___;
360
361.Lpoly_hash_ad_ret:
362    ret
363    .cfi_endproc
364.size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal
365
366/////////////////////////////////
367//
368// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
369//
370.globl  chacha20_poly1305_seal
371.type   chacha20_poly1305_seal,%function
372.align  6
373chacha20_poly1305_seal:
374    AARCH64_SIGN_LINK_REGISTER
375.cfi_startproc
376    stp x29, x30, [sp, #-80]!
377.cfi_def_cfa_offset 80
378.cfi_offset w30, -72
379.cfi_offset w29, -80
380    mov x29, sp
381    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
382    // we don't actually use the frame pointer like that, it's probably not
383    // worth bothering.
384    stp d8, d9, [sp, #16]
385    stp d10, d11, [sp, #32]
386    stp d12, d13, [sp, #48]
387    stp d14, d15, [sp, #64]
388.cfi_offset b15, -8
389.cfi_offset b14, -16
390.cfi_offset b13, -24
391.cfi_offset b12, -32
392.cfi_offset b11, -40
393.cfi_offset b10, -48
394.cfi_offset b9, -56
395.cfi_offset b8, -64
396
397    adrp $t0, :pg_hi21:.Lchacha20_consts
398    add  $t0, $t0, :lo12:.Lchacha20_consts
399
400    ld1 {$CONSTS.16b - $CLAMP.16b}, [$t0] // Load the CONSTS, INC, ROL8 and CLAMP values
401    ld1 {$B_STORE.16b - $D_STORE.16b}, [$keyp]
402
403    mov $one, #1 // Prepare the Poly1305 state
404    mov $acc0, #0
405    mov $acc1, #0
406    mov $acc2, #0
407
408    ldr $t1, [$keyp, #56]   // The total cipher text length includes extra_in_len
409    add $t1, $t1, $inl
410    mov $LEN_STORE.d[0], $adl  // Store the input and aad lengths
411    mov $LEN_STORE.d[1], $t1
412
413    cmp $inl, #128
414    b.le .Lseal_128 // Optimization for smaller buffers
415
416    // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
417    // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
418    // the fifth block (A4-D4) horizontally.
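    // ld4r loads four consecutive 32-bit words and broadcasts one into each of
    // four registers, which directly yields the interleaved layout: lane i of
    // A0..D3 belongs to block i.  The zip1/zip2 passes after the rounds
    // transpose the words back into whole blocks.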
419    ld4r {$A0.4s-$A3.4s}, [$t0]
420    mov $A4.16b, $CONSTS.16b
421
422    ld4r {$B0.4s-$B3.4s}, [$keyp], #16
423    mov $B4.16b, $B_STORE.16b
424
425    ld4r {$C0.4s-$C3.4s}, [$keyp], #16
426    mov $C4.16b, $C_STORE.16b
427
428    ld4r {$D0.4s-$D3.4s}, [$keyp]
429    add $D0.4s, $D0.4s, $INC.4s
430    mov $D4.16b, $D_STORE.16b
431
432    sub $keyp, $keyp, #32
433
434    mov  $itr1, #10
435
436.align 5
437.Lseal_init_rounds:
438___
439        &chacha_qr_x5("left");
440        &chacha_qr_x5("right");
441$code.=<<___;
442        subs $itr1, $itr1, #1
443    b.hi .Lseal_init_rounds
444
445    add $D0.4s, $D0.4s, $INC.4s
446    mov $t0, #4
447    dup $T0.4s, $t0w
448    add $INC.4s, $INC.4s, $T0.4s
449
450    zip1 $T0.4s, $A0.4s, $A1.4s
451    zip2 $T1.4s, $A0.4s, $A1.4s
452    zip1 $T2.4s, $A2.4s, $A3.4s
453    zip2 $T3.4s, $A2.4s, $A3.4s
454
455    zip1 $A0.2d, $T0.2d, $T2.2d
456    zip2 $A1.2d, $T0.2d, $T2.2d
457    zip1 $A2.2d, $T1.2d, $T3.2d
458    zip2 $A3.2d, $T1.2d, $T3.2d
459
460    zip1 $T0.4s, $B0.4s, $B1.4s
461    zip2 $T1.4s, $B0.4s, $B1.4s
462    zip1 $T2.4s, $B2.4s, $B3.4s
463    zip2 $T3.4s, $B2.4s, $B3.4s
464
465    zip1 $B0.2d, $T0.2d, $T2.2d
466    zip2 $B1.2d, $T0.2d, $T2.2d
467    zip1 $B2.2d, $T1.2d, $T3.2d
468    zip2 $B3.2d, $T1.2d, $T3.2d
469
470    zip1 $T0.4s, $C0.4s, $C1.4s
471    zip2 $T1.4s, $C0.4s, $C1.4s
472    zip1 $T2.4s, $C2.4s, $C3.4s
473    zip2 $T3.4s, $C2.4s, $C3.4s
474
475    zip1 $C0.2d, $T0.2d, $T2.2d
476    zip2 $C1.2d, $T0.2d, $T2.2d
477    zip1 $C2.2d, $T1.2d, $T3.2d
478    zip2 $C3.2d, $T1.2d, $T3.2d
479
480    zip1 $T0.4s, $D0.4s, $D1.4s
481    zip2 $T1.4s, $D0.4s, $D1.4s
482    zip1 $T2.4s, $D2.4s, $D3.4s
483    zip2 $T3.4s, $D2.4s, $D3.4s
484
485    zip1 $D0.2d, $T0.2d, $T2.2d
486    zip2 $D1.2d, $T0.2d, $T2.2d
487    zip1 $D2.2d, $T1.2d, $T3.2d
488    zip2 $D3.2d, $T1.2d, $T3.2d
489
490    add $A4.4s, $A4.4s, $CONSTS.4s
491    add $B4.4s, $B4.4s, $B_STORE.4s
492    and $A4.16b, $A4.16b, $CLAMP.16b
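    // A4 (clamped) and B4 now hold the first 32 bytes of the keystream block
    // generated with the unincremented counter; they become the one-time
    // Poly1305 key: r = A4, s = B4.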
493
494    add $A0.4s, $A0.4s, $CONSTS.4s
495    add $B0.4s, $B0.4s, $B_STORE.4s
496    add $C0.4s, $C0.4s, $C_STORE.4s
497    add $D0.4s, $D0.4s, $D_STORE.4s
498
499    add $A1.4s, $A1.4s, $CONSTS.4s
500    add $B1.4s, $B1.4s, $B_STORE.4s
501    add $C1.4s, $C1.4s, $C_STORE.4s
502    add $D1.4s, $D1.4s, $D_STORE.4s
503
504    add $A2.4s, $A2.4s, $CONSTS.4s
505    add $B2.4s, $B2.4s, $B_STORE.4s
506    add $C2.4s, $C2.4s, $C_STORE.4s
507    add $D2.4s, $D2.4s, $D_STORE.4s
508
509    add $A3.4s, $A3.4s, $CONSTS.4s
510    add $B3.4s, $B3.4s, $B_STORE.4s
511    add $C3.4s, $C3.4s, $C_STORE.4s
512    add $D3.4s, $D3.4s, $D_STORE.4s
513
514    mov $r0, $A4.d[0] // Move the R key to GPRs
515    mov $r1, $A4.d[1]
516    mov $S_STORE.16b, $B4.16b // Store the S key
517
518    bl  .Lpoly_hash_ad_internal
519
520    mov $adp, $oup
521    cmp $inl, #256
522    b.le .Lseal_tail
523
524    ld1 {$T0.16b - $T3.16b}, [$inp], #64
525    eor $T0.16b, $T0.16b, $A0.16b
526    eor $T1.16b, $T1.16b, $B0.16b
527    eor $T2.16b, $T2.16b, $C0.16b
528    eor $T3.16b, $T3.16b, $D0.16b
529    st1 {$T0.16b - $T3.16b}, [$oup], #64
530
531    ld1 {$T0.16b - $T3.16b}, [$inp], #64
532    eor $T0.16b, $T0.16b, $A1.16b
533    eor $T1.16b, $T1.16b, $B1.16b
534    eor $T2.16b, $T2.16b, $C1.16b
535    eor $T3.16b, $T3.16b, $D1.16b
536    st1 {$T0.16b - $T3.16b}, [$oup], #64
537
538    ld1 {$T0.16b - $T3.16b}, [$inp], #64
539    eor $T0.16b, $T0.16b, $A2.16b
540    eor $T1.16b, $T1.16b, $B2.16b
541    eor $T2.16b, $T2.16b, $C2.16b
542    eor $T3.16b, $T3.16b, $D2.16b
543    st1 {$T0.16b - $T3.16b}, [$oup], #64
544
545    ld1 {$T0.16b - $T3.16b}, [$inp], #64
546    eor $T0.16b, $T0.16b, $A3.16b
547    eor $T1.16b, $T1.16b, $B3.16b
548    eor $T2.16b, $T2.16b, $C3.16b
549    eor $T3.16b, $T3.16b, $D3.16b
550    st1 {$T0.16b - $T3.16b}, [$oup], #64
551
552    sub $inl, $inl, #256
553
554    mov $itr1, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
555    mov $itr2, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
556
557.Lseal_main_loop:
558    adrp $t0, :pg_hi21:.Lchacha20_consts
559    add  $t0, $t0, :lo12:.Lchacha20_consts
560
561    ld4r {$A0.4s-$A3.4s}, [$t0]
562    mov $A4.16b, $CONSTS.16b
563
564    ld4r {$B0.4s-$B3.4s}, [$keyp], #16
565    mov $B4.16b, $B_STORE.16b
566
567    ld4r {$C0.4s-$C3.4s}, [$keyp], #16
568    mov $C4.16b, $C_STORE.16b
569
570    ld4r {$D0.4s-$D3.4s}, [$keyp]
571    add $D0.4s, $D0.4s, $INC.4s
572    mov $D4.16b, $D_STORE.16b
573
574    eor $T0.16b, $T0.16b, $T0.16b //zero
575    not $T1.16b, $T0.16b // -1
576    sub $T1.4s, $INC.4s, $T1.4s // Add +1
577    ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter)
578    add $D4.4s, $D4.4s, $T0.4s
579
580    sub $keyp, $keyp, #32
581.align 5
582.Lseal_main_loop_rounds:
583___
584        &chacha_qr_x5("left");
585        &poly_add($adp);
586        &poly_mul();
587        &chacha_qr_x5("right");
588$code.=<<___;
589        subs $itr1, $itr1, #1
590        b.ge .Lseal_main_loop_rounds
591___
592        &poly_add($adp);
593        &poly_mul();
594$code.=<<___;
595        subs $itr2, $itr2, #1
596        b.gt .Lseal_main_loop_rounds
597
598    eor $T0.16b, $T0.16b, $T0.16b //zero
599    not $T1.16b, $T0.16b // -1
600    sub $T1.4s, $INC.4s, $T1.4s // Add +1
601    ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter)
602    add $D4.4s, $D4.4s, $T0.4s
603
604    add $D0.4s, $D0.4s, $INC.4s
605    mov $t0, #5
606    dup $T0.4s, $t0w
607    add $INC.4s, $INC.4s, $T0.4s
608
609    zip1 $T0.4s, $A0.4s, $A1.4s
610    zip2 $T1.4s, $A0.4s, $A1.4s
611    zip1 $T2.4s, $A2.4s, $A3.4s
612    zip2 $T3.4s, $A2.4s, $A3.4s
613
614    zip1 $A0.2d, $T0.2d, $T2.2d
615    zip2 $A1.2d, $T0.2d, $T2.2d
616    zip1 $A2.2d, $T1.2d, $T3.2d
617    zip2 $A3.2d, $T1.2d, $T3.2d
618
619    zip1 $T0.4s, $B0.4s, $B1.4s
620    zip2 $T1.4s, $B0.4s, $B1.4s
621    zip1 $T2.4s, $B2.4s, $B3.4s
622    zip2 $T3.4s, $B2.4s, $B3.4s
623
624    zip1 $B0.2d, $T0.2d, $T2.2d
625    zip2 $B1.2d, $T0.2d, $T2.2d
626    zip1 $B2.2d, $T1.2d, $T3.2d
627    zip2 $B3.2d, $T1.2d, $T3.2d
628
629    zip1 $T0.4s, $C0.4s, $C1.4s
630    zip2 $T1.4s, $C0.4s, $C1.4s
631    zip1 $T2.4s, $C2.4s, $C3.4s
632    zip2 $T3.4s, $C2.4s, $C3.4s
633
634    zip1 $C0.2d, $T0.2d, $T2.2d
635    zip2 $C1.2d, $T0.2d, $T2.2d
636    zip1 $C2.2d, $T1.2d, $T3.2d
637    zip2 $C3.2d, $T1.2d, $T3.2d
638
639    zip1 $T0.4s, $D0.4s, $D1.4s
640    zip2 $T1.4s, $D0.4s, $D1.4s
641    zip1 $T2.4s, $D2.4s, $D3.4s
642    zip2 $T3.4s, $D2.4s, $D3.4s
643
644    zip1 $D0.2d, $T0.2d, $T2.2d
645    zip2 $D1.2d, $T0.2d, $T2.2d
646    zip1 $D2.2d, $T1.2d, $T3.2d
647    zip2 $D3.2d, $T1.2d, $T3.2d
648
649    add $A0.4s, $A0.4s, $CONSTS.4s
650    add $B0.4s, $B0.4s, $B_STORE.4s
651    add $C0.4s, $C0.4s, $C_STORE.4s
652    add $D0.4s, $D0.4s, $D_STORE.4s
653
654    add $A1.4s, $A1.4s, $CONSTS.4s
655    add $B1.4s, $B1.4s, $B_STORE.4s
656    add $C1.4s, $C1.4s, $C_STORE.4s
657    add $D1.4s, $D1.4s, $D_STORE.4s
658
659    add $A2.4s, $A2.4s, $CONSTS.4s
660    add $B2.4s, $B2.4s, $B_STORE.4s
661    add $C2.4s, $C2.4s, $C_STORE.4s
662    add $D2.4s, $D2.4s, $D_STORE.4s
663
664    add $A3.4s, $A3.4s, $CONSTS.4s
665    add $B3.4s, $B3.4s, $B_STORE.4s
666    add $C3.4s, $C3.4s, $C_STORE.4s
667    add $D3.4s, $D3.4s, $D_STORE.4s
668
669    add $A4.4s, $A4.4s, $CONSTS.4s
670    add $B4.4s, $B4.4s, $B_STORE.4s
671    add $C4.4s, $C4.4s, $C_STORE.4s
672    add $D4.4s, $D4.4s, $D_STORE.4s
673
674    cmp $inl, #320
675    b.le .Lseal_tail
676
677    ld1 {$T0.16b - $T3.16b}, [$inp], #64
678    eor $T0.16b, $T0.16b, $A0.16b
679    eor $T1.16b, $T1.16b, $B0.16b
680    eor $T2.16b, $T2.16b, $C0.16b
681    eor $T3.16b, $T3.16b, $D0.16b
682    st1 {$T0.16b - $T3.16b}, [$oup], #64
683
684    ld1 {$T0.16b - $T3.16b}, [$inp], #64
685    eor $T0.16b, $T0.16b, $A1.16b
686    eor $T1.16b, $T1.16b, $B1.16b
687    eor $T2.16b, $T2.16b, $C1.16b
688    eor $T3.16b, $T3.16b, $D1.16b
689    st1 {$T0.16b - $T3.16b}, [$oup], #64
690
691    ld1 {$T0.16b - $T3.16b}, [$inp], #64
692    eor $T0.16b, $T0.16b, $A2.16b
693    eor $T1.16b, $T1.16b, $B2.16b
694    eor $T2.16b, $T2.16b, $C2.16b
695    eor $T3.16b, $T3.16b, $D2.16b
696    st1 {$T0.16b - $T3.16b}, [$oup], #64
697
698    ld1 {$T0.16b - $T3.16b}, [$inp], #64
699    eor $T0.16b, $T0.16b, $A3.16b
700    eor $T1.16b, $T1.16b, $B3.16b
701    eor $T2.16b, $T2.16b, $C3.16b
702    eor $T3.16b, $T3.16b, $D3.16b
703    st1 {$T0.16b - $T3.16b}, [$oup], #64
704
705    ld1 {$T0.16b - $T3.16b}, [$inp], #64
706    eor $T0.16b, $T0.16b, $A4.16b
707    eor $T1.16b, $T1.16b, $B4.16b
708    eor $T2.16b, $T2.16b, $C4.16b
709    eor $T3.16b, $T3.16b, $D4.16b
710    st1 {$T0.16b - $T3.16b}, [$oup], #64
711
712    sub $inl, $inl, #320
713
714    mov $itr1, #0
715    mov $itr2, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
716
717    b .Lseal_main_loop
718
719.Lseal_tail:
720    // This part of the function handles the storage and authentication of the last [0,320) bytes
721    // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
722    cmp $inl, #64
723    b.lt .Lseal_tail_64
724
725    // Store and authenticate 64B blocks per iteration
726    ld1 {$T0.16b - $T3.16b}, [$inp], #64
727
728    eor $T0.16b, $T0.16b, $A0.16b
729    eor $T1.16b, $T1.16b, $B0.16b
730    eor $T2.16b, $T2.16b, $C0.16b
731    eor $T3.16b, $T3.16b, $D0.16b
732___
733    &poly_add_vec($T0);
734    &poly_mul();
735    &poly_add_vec($T1);
736    &poly_mul();
737    &poly_add_vec($T2);
738    &poly_mul();
739    &poly_add_vec($T3);
740    &poly_mul();
741$code.=<<___;
742    st1 {$T0.16b - $T3.16b}, [$oup], #64
743    sub $inl, $inl, #64
744
745    // Shift the state left by 64 bytes for the next iteration of the loop
746    mov $A0.16b, $A1.16b
747    mov $B0.16b, $B1.16b
748    mov $C0.16b, $C1.16b
749    mov $D0.16b, $D1.16b
750
751    mov $A1.16b, $A2.16b
752    mov $B1.16b, $B2.16b
753    mov $C1.16b, $C2.16b
754    mov $D1.16b, $D2.16b
755
756    mov $A2.16b, $A3.16b
757    mov $B2.16b, $B3.16b
758    mov $C2.16b, $C3.16b
759    mov $D2.16b, $D3.16b
760
761    mov $A3.16b, $A4.16b
762    mov $B3.16b, $B4.16b
763    mov $C3.16b, $C4.16b
764    mov $D3.16b, $D4.16b
765
766    b .Lseal_tail
767
768.Lseal_tail_64:
769    ldp $adp, $adl, [$keyp, #48] // extra_in_len and extra_in_ptr
770
771    // Here we handle the last [0,64) bytes of plaintext
772    cmp $inl, #16
773    b.lt .Lseal_tail_16
774    // Each iteration encrypt and authenticate a 16B block
775    ld1 {$T0.16b}, [$inp], #16
776    eor $T0.16b, $T0.16b, $A0.16b
777___
778    &poly_add_vec($T0);
779    &poly_mul();
780$code.=<<___;
781    st1 {$T0.16b}, [$oup], #16
782
783    sub $inl, $inl, #16
784
785    // Shift the state left by 16 bytes for the next iteration of the loop
786    mov $A0.16b, $B0.16b
787    mov $B0.16b, $C0.16b
788    mov $C0.16b, $D0.16b
789
790    b .Lseal_tail_64
791
792.Lseal_tail_16:
793    // Here we handle the last [0,16) bytes of ciphertext that require a padded block
794    cbz $inl, .Lseal_hash_extra
795
796    eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the plaintext/extra in
797    eor $T1.16b, $T1.16b, $T1.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
798    not $T2.16b, $T0.16b
799
800    mov $itr1, $inl
801    add $inp, $inp, $inl
802
803    cbz $adl, .Lseal_tail_16_compose // No extra data to pad with, zero padding
804
805    mov $itr2, #16          // We need to load some extra_in first for padding
806    sub $itr2, $itr2, $inl
807    cmp $adl, $itr2
808    csel $itr2, $adl, $itr2, lt // Load the minimum of extra_in_len and the amount needed to fill the register
809    mov $t1, $itr2
810    add $adp, $adp, $itr2
811    sub $adl, $adl, $itr2
812
813.Lseal_tail16_compose_extra_in:
814        ext  $T0.16b, $T0.16b, $T0.16b, #15
815        ldrb $t0w, [$adp, #-1]!
816        mov  $T0.b[0], $t0w
817        subs $itr2, $itr2, #1
818        b.gt .Lseal_tail16_compose_extra_in
819
820    add $adp, $adp, $t1
821
822.Lseal_tail_16_compose:
823        ext  $T0.16b, $T0.16b, $T0.16b, #15
824        ldrb $t0w, [$inp, #-1]!
825        mov  $T0.b[0], $t0w
826        ext  $T1.16b, $T2.16b, $T1.16b, #15
827        subs $inl, $inl, #1
828        b.gt .Lseal_tail_16_compose
829
830    and $A0.16b, $A0.16b, $T1.16b
831    eor $T0.16b, $T0.16b, $A0.16b
832    mov $T1.16b, $T0.16b
833
834.Lseal_tail_16_store:
835        umov $t0w, $T0.b[0]
836        strb $t0w, [$oup], #1
837        ext  $T0.16b, $T0.16b, $T0.16b, #1
838        subs $itr1, $itr1, #1
839        b.gt .Lseal_tail_16_store
840
841    // Hash in the final ct block concatenated with extra_in
842___
843    &poly_add_vec($T1);
844    &poly_mul();
845$code.=<<___;
846
847.Lseal_hash_extra:
848    cbz $adl, .Lseal_finalize
849
850.Lseal_hash_extra_loop:
851    cmp $adl, #16
852    b.lt .Lseal_hash_extra_tail
853    ld1 {$T0.16b}, [$adp], #16
854___
855    &poly_add_vec($T0);
856    &poly_mul();
857$code.=<<___;
858    sub $adl, $adl, #16
859    b .Lseal_hash_extra_loop
860
861.Lseal_hash_extra_tail:
862    cbz $adl, .Lseal_finalize
863    eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the remaining extra ciphertext
864    add $adp, $adp, $adl
865
866.Lseal_hash_extra_load:
867        ext  $T0.16b, $T0.16b, $T0.16b, #15
868        ldrb $t0w, [$adp, #-1]!
869        mov  $T0.b[0], $t0w
870        subs $adl, $adl, #1
871        b.gt .Lseal_hash_extra_load
872
    // Hash in the final padded extra_in block
874___
875    &poly_add_vec($T0);
876    &poly_mul();
877$code.=<<___;
878
879.Lseal_finalize:
880___
881    &poly_add_vec($LEN_STORE);
882    &poly_mul();
883$code.=<<___;
884    // Final reduction step
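    // acc is below 2*p at this point, so a single conditional subtraction of
    // p = 2^130 - 5 completes the reduction.  The limbs of p are
    // [3 : 2^64-1 : 2^64-5] (the low limb written as #-5); the subs/sbcs
    // chain leaves the carry set exactly when acc >= p, and csel then keeps
    // the reduced limbs.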
885    sub  $t1, xzr, $one
886    orr  $t2, xzr, #3
887    subs $t0, $acc0, #-5
888    sbcs $t1, $acc1, $t1
889    sbcs $t2, $acc2, $t2
890    csel $acc0, $t0, $acc0, cs
891    csel $acc1, $t1, $acc1, cs
892    csel $acc2, $t2, $acc2, cs
893___
894    &poly_add_vec($S_STORE);
895$code.=<<___;
896
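    // The tag is (acc + s) mod 2^128: the carry out of the low 128 bits is
    // deliberately dropped and the 16-byte result is written back through
    // $keyp for the caller.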
897    stp  $acc0, $acc1, [$keyp]
898
899    ldp d8, d9, [sp, #16]
900    ldp d10, d11, [sp, #32]
901    ldp d12, d13, [sp, #48]
902    ldp d14, d15, [sp, #64]
903.cfi_restore b15
904.cfi_restore b14
905.cfi_restore b13
906.cfi_restore b12
907.cfi_restore b11
908.cfi_restore b10
909.cfi_restore b9
910.cfi_restore b8
911    ldp x29, x30, [sp], 80
912.cfi_restore w29
913.cfi_restore w30
914.cfi_def_cfa_offset 0
915    AARCH64_VALIDATE_LINK_REGISTER
916    ret
917
918.Lseal_128:
919    // On some architectures preparing 5 blocks for small buffers is wasteful
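    // Three blocks are enough here: counters +1 and +2 (D0, D1) cover up to
    // 128 bytes of keystream, and the block with the unincremented counter
    // (A2..D2) only supplies the Poly1305 key, as in the main path.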
920    eor $INC.16b, $INC.16b, $INC.16b
921    mov $t0, #1
922    mov $INC.s[0], $t0w
923    mov $A0.16b, $CONSTS.16b
924    mov $A1.16b, $CONSTS.16b
925    mov $A2.16b, $CONSTS.16b
926    mov $B0.16b, $B_STORE.16b
927    mov $B1.16b, $B_STORE.16b
928    mov $B2.16b, $B_STORE.16b
929    mov $C0.16b, $C_STORE.16b
930    mov $C1.16b, $C_STORE.16b
931    mov $C2.16b, $C_STORE.16b
932    mov $D2.16b, $D_STORE.16b
933    add $D0.4s, $D2.4s, $INC.4s
934    add $D1.4s, $D0.4s, $INC.4s
935
936    mov  $itr1, #10
937
938.Lseal_128_rounds:
939___
940        &chacha_qr_x3("left");
941        &chacha_qr_x3("right");
942$code.=<<___;
943        subs $itr1, $itr1, #1
944    b.hi .Lseal_128_rounds
945
946    add $A0.4s, $A0.4s, $CONSTS.4s
947    add $A1.4s, $A1.4s, $CONSTS.4s
948    add $A2.4s, $A2.4s, $CONSTS.4s
949
950    add $B0.4s, $B0.4s, $B_STORE.4s
951    add $B1.4s, $B1.4s, $B_STORE.4s
952    add $B2.4s, $B2.4s, $B_STORE.4s
953
954    // Only the first 32 bytes of the third block (counter = 0) are needed,
955    // so skip updating $C2 and $D2.
956    add $C0.4s, $C0.4s, $C_STORE.4s
957    add $C1.4s, $C1.4s, $C_STORE.4s
958
959    add $D_STORE.4s, $D_STORE.4s, $INC.4s
960    add $D0.4s, $D0.4s, $D_STORE.4s
961    add $D_STORE.4s, $D_STORE.4s, $INC.4s
962    add $D1.4s, $D1.4s, $D_STORE.4s
963
964    and $A2.16b, $A2.16b, $CLAMP.16b
965    mov $r0, $A2.d[0] // Move the R key to GPRs
966    mov $r1, $A2.d[1]
967    mov $S_STORE.16b, $B2.16b // Store the S key
968
969    bl  .Lpoly_hash_ad_internal
970    b   .Lseal_tail
971.cfi_endproc
972.size chacha20_poly1305_seal,.-chacha20_poly1305_seal
973
974/////////////////////////////////
975//
976// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
977//
978.globl	chacha20_poly1305_open
979.type	chacha20_poly1305_open,%function
980.align	6
981chacha20_poly1305_open:
982    AARCH64_SIGN_LINK_REGISTER
983.cfi_startproc
984    stp x29, x30, [sp, #-80]!
985.cfi_def_cfa_offset 80
986.cfi_offset w30, -72
987.cfi_offset w29, -80
988    mov x29, sp
989    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
990    // we don't actually use the frame pointer like that, it's probably not
991    // worth bothering.
992    stp	d8, d9, [sp, #16]
993    stp	d10, d11, [sp, #32]
994    stp	d12, d13, [sp, #48]
995    stp	d14, d15, [sp, #64]
996.cfi_offset b15, -8
997.cfi_offset b14, -16
998.cfi_offset b13, -24
999.cfi_offset b12, -32
1000.cfi_offset b11, -40
1001.cfi_offset b10, -48
1002.cfi_offset b9, -56
1003.cfi_offset b8, -64
1004
1005    adrp $t0, :pg_hi21:.Lchacha20_consts
1006    add  $t0, $t0, :lo12:.Lchacha20_consts
1007
1008    ld1 {$CONSTS.16b - $CLAMP.16b}, [$t0] // Load the CONSTS, INC, ROL8 and CLAMP values
1009    ld1 {$B_STORE.16b - $D_STORE.16b}, [$keyp]
1010
1011    mov $one, #1 // Prepare the Poly1305 state
1012    mov $acc0, #0
1013    mov $acc1, #0
1014    mov $acc2, #0
1015
1016    mov $LEN_STORE.d[0], $adl  // Store the input and aad lengths
1017    mov $LEN_STORE.d[1], $inl
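    // The final Poly1305 block is len(AAD) || len(ciphertext), both as 64-bit
    // little-endian values; it is kept in $LEN_STORE and hashed in .Lopen_finalize.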
1018
1019    cmp $inl, #128
1020    b.le .Lopen_128 // Optimization for smaller buffers
1021
1022    // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
1023    mov $A0.16b, $CONSTS.16b
1024    mov $B0.16b, $B_STORE.16b
1025    mov $C0.16b, $C_STORE.16b
1026    mov $D0.16b, $D_STORE.16b
1027
1028    mov  $itr1, #10
1029
1030.align 5
1031.Lopen_init_rounds:
1032___
1033        &chacha_qr($A0, $B0, $C0, $D0, $T0, "left");
1034        &chacha_qr($A0, $B0, $C0, $D0, $T0, "right");
1035$code.=<<___;
1036        subs $itr1, $itr1, #1
1037    b.hi .Lopen_init_rounds
1038
1039    add $A0.4s, $A0.4s, $CONSTS.4s
1040    add $B0.4s, $B0.4s, $B_STORE.4s
1041
1042    and $A0.16b, $A0.16b, $CLAMP.16b
1043    mov $r0, $A0.d[0] // Move the R key to GPRs
1044    mov $r1, $A0.d[1]
1045    mov $S_STORE.16b, $B0.16b // Store the S key
1046
1047    bl  .Lpoly_hash_ad_internal
1048
1049.Lopen_ad_done:
1050    mov $adp, $inp
1051
// Each iteration of the loop hashes 320 bytes and prepares the keystream for 320 bytes
1053.Lopen_main_loop:
1054
1055    cmp $inl, #192
1056    b.lt .Lopen_tail
1057
1058    adrp $t0, :pg_hi21:.Lchacha20_consts
1059    add  $t0, $t0, :lo12:.Lchacha20_consts
1060
1061    ld4r {$A0.4s-$A3.4s}, [$t0]
1062    mov $A4.16b, $CONSTS.16b
1063
1064    ld4r {$B0.4s-$B3.4s}, [$keyp], #16
1065    mov $B4.16b, $B_STORE.16b
1066
1067    ld4r {$C0.4s-$C3.4s}, [$keyp], #16
1068    mov $C4.16b, $C_STORE.16b
1069
1070    ld4r {$D0.4s-$D3.4s}, [$keyp]
1071    sub $keyp, $keyp, #32
1072    add $D0.4s, $D0.4s, $INC.4s
1073    mov $D4.16b, $D_STORE.16b
1074
1075    eor $T0.16b, $T0.16b, $T0.16b //zero
1076    not $T1.16b, $T0.16b // -1
1077    sub $T1.4s, $INC.4s, $T1.4s // Add +1
1078    ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter)
1079    add $D4.4s, $D4.4s, $T0.4s
1080
1081    lsr $adl, $inl, #4 // How many whole blocks we have to hash, will always be at least 12
1082    sub $adl, $adl, #10
1083
1084    mov $itr2, #10
    subs $itr1, $itr2, $adl // itr1 can be negative if we have more than 320 bytes to hash
1087    csel $itr2, $itr2, $adl, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
1088
1089    cbz $itr2, .Lopen_main_loop_rounds_short
1090
1091.align 5
1092.Lopen_main_loop_rounds:
1093___
1094        &poly_add($adp);
1095        &poly_mul();
1096$code.=<<___;
1097.Lopen_main_loop_rounds_short:
1098___
1099        &chacha_qr_x5("left");
1100        &poly_add($adp);
1101        &poly_mul();
1102        &chacha_qr_x5("right");
1103$code.=<<___;
1104        subs $itr2, $itr2, #1
1105        b.gt .Lopen_main_loop_rounds
1106        subs $itr1, $itr1, #1
1107        b.ge .Lopen_main_loop_rounds_short
1108___
1109$code.=<<___;
1110
1111    eor $T0.16b, $T0.16b, $T0.16b //zero
1112    not $T1.16b, $T0.16b // -1
1113    sub $T1.4s, $INC.4s, $T1.4s // Add +1
1114    ext $T0.16b, $T1.16b, $T0.16b, #12 // Get the last element (counter)
1115    add $D4.4s, $D4.4s, $T0.4s
1116
1117    add $D0.4s, $D0.4s, $INC.4s
1118    mov $t0, #5
1119    dup $T0.4s, $t0w
1120    add $INC.4s, $INC.4s, $T0.4s
1121
1122    zip1 $T0.4s, $A0.4s, $A1.4s
1123    zip2 $T1.4s, $A0.4s, $A1.4s
1124    zip1 $T2.4s, $A2.4s, $A3.4s
1125    zip2 $T3.4s, $A2.4s, $A3.4s
1126
1127    zip1 $A0.2d, $T0.2d, $T2.2d
1128    zip2 $A1.2d, $T0.2d, $T2.2d
1129    zip1 $A2.2d, $T1.2d, $T3.2d
1130    zip2 $A3.2d, $T1.2d, $T3.2d
1131
1132    zip1 $T0.4s, $B0.4s, $B1.4s
1133    zip2 $T1.4s, $B0.4s, $B1.4s
1134    zip1 $T2.4s, $B2.4s, $B3.4s
1135    zip2 $T3.4s, $B2.4s, $B3.4s
1136
1137    zip1 $B0.2d, $T0.2d, $T2.2d
1138    zip2 $B1.2d, $T0.2d, $T2.2d
1139    zip1 $B2.2d, $T1.2d, $T3.2d
1140    zip2 $B3.2d, $T1.2d, $T3.2d
1141
1142    zip1 $T0.4s, $C0.4s, $C1.4s
1143    zip2 $T1.4s, $C0.4s, $C1.4s
1144    zip1 $T2.4s, $C2.4s, $C3.4s
1145    zip2 $T3.4s, $C2.4s, $C3.4s
1146
1147    zip1 $C0.2d, $T0.2d, $T2.2d
1148    zip2 $C1.2d, $T0.2d, $T2.2d
1149    zip1 $C2.2d, $T1.2d, $T3.2d
1150    zip2 $C3.2d, $T1.2d, $T3.2d
1151
1152    zip1 $T0.4s, $D0.4s, $D1.4s
1153    zip2 $T1.4s, $D0.4s, $D1.4s
1154    zip1 $T2.4s, $D2.4s, $D3.4s
1155    zip2 $T3.4s, $D2.4s, $D3.4s
1156
1157    zip1 $D0.2d, $T0.2d, $T2.2d
1158    zip2 $D1.2d, $T0.2d, $T2.2d
1159    zip1 $D2.2d, $T1.2d, $T3.2d
1160    zip2 $D3.2d, $T1.2d, $T3.2d
1161
1162    add $A0.4s, $A0.4s, $CONSTS.4s
1163    add $B0.4s, $B0.4s, $B_STORE.4s
1164    add $C0.4s, $C0.4s, $C_STORE.4s
1165    add $D0.4s, $D0.4s, $D_STORE.4s
1166
1167    add $A1.4s, $A1.4s, $CONSTS.4s
1168    add $B1.4s, $B1.4s, $B_STORE.4s
1169    add $C1.4s, $C1.4s, $C_STORE.4s
1170    add $D1.4s, $D1.4s, $D_STORE.4s
1171
1172    add $A2.4s, $A2.4s, $CONSTS.4s
1173    add $B2.4s, $B2.4s, $B_STORE.4s
1174    add $C2.4s, $C2.4s, $C_STORE.4s
1175    add $D2.4s, $D2.4s, $D_STORE.4s
1176
1177    add $A3.4s, $A3.4s, $CONSTS.4s
1178    add $B3.4s, $B3.4s, $B_STORE.4s
1179    add $C3.4s, $C3.4s, $C_STORE.4s
1180    add $D3.4s, $D3.4s, $D_STORE.4s
1181
1182    add $A4.4s, $A4.4s, $CONSTS.4s
1183    add $B4.4s, $B4.4s, $B_STORE.4s
1184    add $C4.4s, $C4.4s, $C_STORE.4s
1185    add $D4.4s, $D4.4s, $D_STORE.4s
1186
1187    // We can always safely store 192 bytes
1188    ld1 {$T0.16b - $T3.16b}, [$inp], #64
1189    eor $T0.16b, $T0.16b, $A0.16b
1190    eor $T1.16b, $T1.16b, $B0.16b
1191    eor $T2.16b, $T2.16b, $C0.16b
1192    eor $T3.16b, $T3.16b, $D0.16b
1193    st1 {$T0.16b - $T3.16b}, [$oup], #64
1194
1195    ld1 {$T0.16b - $T3.16b}, [$inp], #64
1196    eor $T0.16b, $T0.16b, $A1.16b
1197    eor $T1.16b, $T1.16b, $B1.16b
1198    eor $T2.16b, $T2.16b, $C1.16b
1199    eor $T3.16b, $T3.16b, $D1.16b
1200    st1 {$T0.16b - $T3.16b}, [$oup], #64
1201
1202    ld1 {$T0.16b - $T3.16b}, [$inp], #64
1203    eor $T0.16b, $T0.16b, $A2.16b
1204    eor $T1.16b, $T1.16b, $B2.16b
1205    eor $T2.16b, $T2.16b, $C2.16b
1206    eor $T3.16b, $T3.16b, $D2.16b
1207    st1 {$T0.16b - $T3.16b}, [$oup], #64
1208
1209    sub $inl, $inl, #192
1210
1211    mov $A0.16b, $A3.16b
1212    mov $B0.16b, $B3.16b
1213    mov $C0.16b, $C3.16b
1214    mov $D0.16b, $D3.16b
1215
1216    cmp $inl, #64
1217    b.lt .Lopen_tail_64_store
1218
1219    ld1 {$T0.16b - $T3.16b}, [$inp], #64
1220    eor $T0.16b, $T0.16b, $A3.16b
1221    eor $T1.16b, $T1.16b, $B3.16b
1222    eor $T2.16b, $T2.16b, $C3.16b
1223    eor $T3.16b, $T3.16b, $D3.16b
1224    st1 {$T0.16b - $T3.16b}, [$oup], #64
1225
1226    sub $inl, $inl, #64
1227
1228    mov $A0.16b, $A4.16b
1229    mov $B0.16b, $B4.16b
1230    mov $C0.16b, $C4.16b
1231    mov $D0.16b, $D4.16b
1232
1233    cmp $inl, #64
1234    b.lt .Lopen_tail_64_store
1235
1236    ld1 {$T0.16b - $T3.16b}, [$inp], #64
1237    eor $T0.16b, $T0.16b, $A4.16b
1238    eor $T1.16b, $T1.16b, $B4.16b
1239    eor $T2.16b, $T2.16b, $C4.16b
1240    eor $T3.16b, $T3.16b, $D4.16b
1241    st1 {$T0.16b - $T3.16b}, [$oup], #64
1242
1243    sub $inl, $inl, #64
1244    b .Lopen_main_loop
1245
1246.Lopen_tail:
1247
1248    cbz $inl, .Lopen_finalize
1249
1250    lsr $adl, $inl, #4 // How many whole blocks we have to hash
1251
1252    cmp $inl, #64
1253    b.le .Lopen_tail_64
1254    cmp $inl, #128
1255    b.le .Lopen_tail_128
1256
1257.Lopen_tail_192:
1258     // We need three more blocks
1259    mov $A0.16b, $CONSTS.16b
1260    mov $A1.16b, $CONSTS.16b
1261    mov $A2.16b, $CONSTS.16b
1262    mov $B0.16b, $B_STORE.16b
1263    mov $B1.16b, $B_STORE.16b
1264    mov $B2.16b, $B_STORE.16b
1265    mov $C0.16b, $C_STORE.16b
1266    mov $C1.16b, $C_STORE.16b
1267    mov $C2.16b, $C_STORE.16b
1268    mov $D0.16b, $D_STORE.16b
1269    mov $D1.16b, $D_STORE.16b
1270    mov $D2.16b, $D_STORE.16b
1271    eor $T3.16b, $T3.16b, $T3.16b
1272    eor $T1.16b, $T1.16b, $T1.16b
1273    ins $T3.s[0], $INC.s[0]
1274    ins $T1.d[0], $one
1275
1276    add $T2.4s, $T3.4s, $T1.4s
1277    add $T1.4s, $T2.4s, $T1.4s
1278
1279    add $D0.4s, $D0.4s, $T1.4s
1280    add $D1.4s, $D1.4s, $T3.4s
1281    add $D2.4s, $D2.4s, $T2.4s
1282
1283    mov $itr2, #10
1284    subs $itr1, $itr2, $adl // itr1 can be negative if we have more than 160 bytes to hash
1285    csel $itr2, $itr2, $adl, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
1286    sub $adl, $adl, $itr2
1287
1288    cbz $itr2, .Lopen_tail_192_rounds_no_hash
1289
1290.Lopen_tail_192_rounds:
1291___
1292        &poly_add($adp);
1293        &poly_mul();
1294$code.=<<___;
1295.Lopen_tail_192_rounds_no_hash:
1296___
1297        &chacha_qr_x3("left");
1298        &chacha_qr_x3("right");
1299$code.=<<___;
1300        subs $itr2, $itr2, #1
1301        b.gt .Lopen_tail_192_rounds
1302        subs $itr1, $itr1, #1
1303        b.ge .Lopen_tail_192_rounds_no_hash
1304
    // We hashed at most 160 bytes so far; up to 32 bytes may still be left
1306.Lopen_tail_192_hash:
1307    cbz $adl, .Lopen_tail_192_hash_done
1308___
1309        &poly_add($adp);
1310        &poly_mul();
1311$code.=<<___;
1312    sub $adl, $adl, #1
1313    b .Lopen_tail_192_hash
1314
1315.Lopen_tail_192_hash_done:
1316
1317    add $A0.4s, $A0.4s, $CONSTS.4s
1318    add $A1.4s, $A1.4s, $CONSTS.4s
1319    add $A2.4s, $A2.4s, $CONSTS.4s
1320    add $B0.4s, $B0.4s, $B_STORE.4s
1321    add $B1.4s, $B1.4s, $B_STORE.4s
1322    add $B2.4s, $B2.4s, $B_STORE.4s
1323    add $C0.4s, $C0.4s, $C_STORE.4s
1324    add $C1.4s, $C1.4s, $C_STORE.4s
1325    add $C2.4s, $C2.4s, $C_STORE.4s
1326    add $D0.4s, $D0.4s, $D_STORE.4s
1327    add $D1.4s, $D1.4s, $D_STORE.4s
1328    add $D2.4s, $D2.4s, $D_STORE.4s
1329
1330    add $D0.4s, $D0.4s, $T1.4s
1331    add $D1.4s, $D1.4s, $T3.4s
1332    add $D2.4s, $D2.4s, $T2.4s
1333
1334    ld1 {$T0.16b - $T3.16b}, [$inp], #64
1335
1336    eor $T0.16b, $T0.16b, $A1.16b
1337    eor $T1.16b, $T1.16b, $B1.16b
1338    eor $T2.16b, $T2.16b, $C1.16b
1339    eor $T3.16b, $T3.16b, $D1.16b
1340
1341    st1 {$T0.16b - $T3.16b}, [$oup], #64
1342
1343    ld1 {$T0.16b - $T3.16b}, [$inp], #64
1344
1345    eor $T0.16b, $T0.16b, $A2.16b
1346    eor $T1.16b, $T1.16b, $B2.16b
1347    eor $T2.16b, $T2.16b, $C2.16b
1348    eor $T3.16b, $T3.16b, $D2.16b
1349
1350    st1 {$T0.16b - $T3.16b}, [$oup], #64
1351
1352    sub $inl, $inl, #128
1353    b .Lopen_tail_64_store
1354
1355.Lopen_tail_128:
1356     // We need two more blocks
1357    mov $A0.16b, $CONSTS.16b
1358    mov $A1.16b, $CONSTS.16b
1359    mov $B0.16b, $B_STORE.16b
1360    mov $B1.16b, $B_STORE.16b
1361    mov $C0.16b, $C_STORE.16b
1362    mov $C1.16b, $C_STORE.16b
1363    mov $D0.16b, $D_STORE.16b
1364    mov $D1.16b, $D_STORE.16b
1365    eor $T3.16b, $T3.16b, $T3.16b
1366    eor $T2.16b, $T2.16b, $T2.16b
1367    ins $T3.s[0], $INC.s[0]
1368    ins $T2.d[0], $one
1369    add $T2.4s, $T2.4s, $T3.4s
1370
1371    add $D0.4s, $D0.4s, $T2.4s
1372    add $D1.4s, $D1.4s, $T3.4s
1373
1374    mov $itr1, #10
1375    sub $itr1, $itr1, $adl
1376
1377.Lopen_tail_128_rounds:
1378___
1379        &chacha_qr($A0, $B0, $C0, $D0, $T0, "left");
1380        &chacha_qr($A1, $B1, $C1, $D1, $T0, "left");
1381        &chacha_qr($A0, $B0, $C0, $D0, $T0, "right");
1382        &chacha_qr($A1, $B1, $C1, $D1, $T0, "right");
1383$code.=<<___;
1384        subs $itr1, $itr1, #1
1385        b.gt .Lopen_tail_128_rounds
1386        cbz $adl, .Lopen_tail_128_rounds_done
1387        subs $adl, $adl, #1
1388___
1389        &poly_add($adp);
1390        &poly_mul();
1391$code.=<<___;
1392    b .Lopen_tail_128_rounds
1393
1394.Lopen_tail_128_rounds_done:
1395    add $A0.4s, $A0.4s, $CONSTS.4s
1396    add $A1.4s, $A1.4s, $CONSTS.4s
1397    add $B0.4s, $B0.4s, $B_STORE.4s
1398    add $B1.4s, $B1.4s, $B_STORE.4s
1399    add $C0.4s, $C0.4s, $C_STORE.4s
1400    add $C1.4s, $C1.4s, $C_STORE.4s
1401    add $D0.4s, $D0.4s, $D_STORE.4s
1402    add $D1.4s, $D1.4s, $D_STORE.4s
1403    add $D0.4s, $D0.4s, $T2.4s
1404    add $D1.4s, $D1.4s, $T3.4s
1405
1406    ld1 {$T0.16b - $T3.16b}, [$inp], #64
1407
1408    eor $T0.16b, $T0.16b, $A1.16b
1409    eor $T1.16b, $T1.16b, $B1.16b
1410    eor $T2.16b, $T2.16b, $C1.16b
1411    eor $T3.16b, $T3.16b, $D1.16b
1412
1413    st1 {$T0.16b - $T3.16b}, [$oup], #64
1414    sub $inl, $inl, #64
1415
1416    b .Lopen_tail_64_store
1417
1418.Lopen_tail_64:
1419    // We just need a single block
1420    mov $A0.16b, $CONSTS.16b
1421    mov $B0.16b, $B_STORE.16b
1422    mov $C0.16b, $C_STORE.16b
1423    mov $D0.16b, $D_STORE.16b
1424    eor $T3.16b, $T3.16b, $T3.16b
1425    ins $T3.s[0], $INC.s[0]
1426    add $D0.4s, $D0.4s, $T3.4s
1427
1428    mov $itr1, #10
1429    sub $itr1, $itr1, $adl
1430
1431.Lopen_tail_64_rounds:
1432___
1433        &chacha_qr($A0, $B0, $C0, $D0, $T0, "left");
1434        &chacha_qr($A0, $B0, $C0, $D0, $T0, "right");
1435$code.=<<___;
1436        subs $itr1, $itr1, #1
1437        b.gt .Lopen_tail_64_rounds
1438        cbz $adl, .Lopen_tail_64_rounds_done
1439        subs $adl, $adl, #1
1440___
1441        &poly_add($adp);
1442        &poly_mul();
1443$code.=<<___;
1444    b .Lopen_tail_64_rounds
1445
1446.Lopen_tail_64_rounds_done:
1447    add $A0.4s, $A0.4s, $CONSTS.4s
1448    add $B0.4s, $B0.4s, $B_STORE.4s
1449    add $C0.4s, $C0.4s, $C_STORE.4s
1450    add $D0.4s, $D0.4s, $D_STORE.4s
1451    add $D0.4s, $D0.4s, $T3.4s
1452
1453.Lopen_tail_64_store:
1454    cmp $inl, #16
1455    b.lt .Lopen_tail_16
1456
1457    ld1 {$T0.16b}, [$inp], #16
1458    eor $T0.16b, $T0.16b, $A0.16b
1459    st1 {$T0.16b}, [$oup], #16
1460    mov $A0.16b, $B0.16b
1461    mov $B0.16b, $C0.16b
1462    mov $C0.16b, $D0.16b
1463    sub $inl, $inl, #16
1464    b .Lopen_tail_64_store
1465
1466.Lopen_tail_16:
1467    // Here we handle the last [0,16) bytes that require a padded block
1468    cbz $inl, .Lopen_finalize
1469
1470    eor $T0.16b, $T0.16b, $T0.16b // Use T0 to load the ciphertext
1471    eor $T1.16b, $T1.16b, $T1.16b // Use T1 to generate an AND mask
1472    not $T2.16b, $T0.16b
1473
1474    add $itr2, $inp, $inl
1475    mov $itr1, $inl
1476
1477.Lopen_tail_16_compose:
1478    ext  $T0.16b, $T0.16b, $T0.16b, #15
1479    ldrb $t0w, [$itr2, #-1]!
1480    mov  $T0.b[0], $t0w
1481    ext  $T1.16b, $T2.16b, $T1.16b, #15
1482    subs $inl, $inl, #1
1483    b.gt .Lopen_tail_16_compose
1484
1485    and $T0.16b, $T0.16b, $T1.16b
1486    // Hash in the final padded block
1487___
1488    &poly_add_vec($T0);
1489    &poly_mul();
1490$code.=<<___;
1491    eor $T0.16b, $T0.16b, $A0.16b
1492
1493.Lopen_tail_16_store:
1494    umov $t0w, $T0.b[0]
1495    strb $t0w, [$oup], #1
1496    ext  $T0.16b, $T0.16b, $T0.16b, #1
1497    subs $itr1, $itr1, #1
1498    b.gt .Lopen_tail_16_store
1499
1500.Lopen_finalize:
1501___
1502    &poly_add_vec($LEN_STORE);
1503    &poly_mul();
1504$code.=<<___;
1505    // Final reduction step
1506    sub  $t1, xzr, $one
1507    orr  $t2, xzr, #3
1508    subs $t0, $acc0, #-5
1509    sbcs $t1, $acc1, $t1
1510    sbcs $t2, $acc2, $t2
1511    csel $acc0, $t0, $acc0, cs
1512    csel $acc1, $t1, $acc1, cs
1513    csel $acc2, $t2, $acc2, cs
1514___
1515    &poly_add_vec($S_STORE);
1516$code.=<<___;
1517
1518    stp  $acc0, $acc1, [$keyp]
1519
1520    ldp	d8, d9, [sp, #16]
1521    ldp	d10, d11, [sp, #32]
1522    ldp	d12, d13, [sp, #48]
1523    ldp	d14, d15, [sp, #64]
1524.cfi_restore b15
1525.cfi_restore b14
1526.cfi_restore b13
1527.cfi_restore b12
1528.cfi_restore b11
1529.cfi_restore b10
1530.cfi_restore b9
1531.cfi_restore b8
1532    ldp x29, x30, [sp], 80
1533.cfi_restore w29
1534.cfi_restore w30
1535.cfi_def_cfa_offset 0
1536    AARCH64_VALIDATE_LINK_REGISTER
1537    ret
1538
1539.Lopen_128:
1540    // On some architectures preparing 5 blocks for small buffers is wasteful
1541    eor $INC.16b, $INC.16b, $INC.16b
1542    mov $t0, #1
1543    mov $INC.s[0], $t0w
1544    mov $A0.16b, $CONSTS.16b
1545    mov $A1.16b, $CONSTS.16b
1546    mov $A2.16b, $CONSTS.16b
1547    mov $B0.16b, $B_STORE.16b
1548    mov $B1.16b, $B_STORE.16b
1549    mov $B2.16b, $B_STORE.16b
1550    mov $C0.16b, $C_STORE.16b
1551    mov $C1.16b, $C_STORE.16b
1552    mov $C2.16b, $C_STORE.16b
1553    mov $D2.16b, $D_STORE.16b
1554    add $D0.4s, $D2.4s, $INC.4s
1555    add $D1.4s, $D0.4s, $INC.4s
1556
1557    mov  $itr1, #10
1558
1559.Lopen_128_rounds:
1560___
1561        &chacha_qr_x3("left");
1562        &chacha_qr_x3("right");
1563$code.=<<___;
1564        subs $itr1, $itr1, #1
1565    b.hi .Lopen_128_rounds
1566
1567    add $A0.4s, $A0.4s, $CONSTS.4s
1568    add $A1.4s, $A1.4s, $CONSTS.4s
1569    add $A2.4s, $A2.4s, $CONSTS.4s
1570
1571    add $B0.4s, $B0.4s, $B_STORE.4s
1572    add $B1.4s, $B1.4s, $B_STORE.4s
1573    add $B2.4s, $B2.4s, $B_STORE.4s
1574
1575    add $C0.4s, $C0.4s, $C_STORE.4s
1576    add $C1.4s, $C1.4s, $C_STORE.4s
1577
1578    add $D_STORE.4s, $D_STORE.4s, $INC.4s
1579    add $D0.4s, $D0.4s, $D_STORE.4s
1580    add $D_STORE.4s, $D_STORE.4s, $INC.4s
1581    add $D1.4s, $D1.4s, $D_STORE.4s
1582
1583    and $A2.16b, $A2.16b, $CLAMP.16b
1584    mov $r0, $A2.d[0] // Move the R key to GPRs
1585    mov $r1, $A2.d[1]
1586    mov $S_STORE.16b, $B2.16b // Store the S key
1587
1588    bl  .Lpoly_hash_ad_internal
1589
1590.Lopen_128_store:
1591    cmp $inl, #64
1592    b.lt .Lopen_128_store_64
1593
1594    ld1 {$T0.16b - $T3.16b}, [$inp], #64
1595
1596___
1597    &poly_add_vec($T0);
1598    &poly_mul();
1599    &poly_add_vec($T1);
1600    &poly_mul();
1601    &poly_add_vec($T2);
1602    &poly_mul();
1603    &poly_add_vec($T3);
1604    &poly_mul();
1605$code.=<<___;
1606
1607    eor $T0.16b, $T0.16b, $A0.16b
1608    eor $T1.16b, $T1.16b, $B0.16b
1609    eor $T2.16b, $T2.16b, $C0.16b
1610    eor $T3.16b, $T3.16b, $D0.16b
1611
1612    st1 {$T0.16b - $T3.16b}, [$oup], #64
1613
1614    sub $inl, $inl, #64
1615
1616    mov $A0.16b, $A1.16b
1617    mov $B0.16b, $B1.16b
1618    mov $C0.16b, $C1.16b
1619    mov $D0.16b, $D1.16b
1620
1621.Lopen_128_store_64:
1622
1623    lsr $adl, $inl, #4
1624    mov $adp, $inp
1625
1626.Lopen_128_hash_64:
1627    cbz $adl, .Lopen_tail_64_store
1628___
1629    &poly_add($adp);
1630    &poly_mul();
1631$code.=<<___;
1632    sub $adl, $adl, #1
1633    b .Lopen_128_hash_64
1634.cfi_endproc
1635.size chacha20_poly1305_open,.-chacha20_poly1305_open
1636___
1637}
1638
1639foreach (split("\n",$code)) {
1640	s/\`([^\`]*)\`/eval $1/ge;
1641
1642	print $_,"\n";
1643}
1644close STDOUT or die "error closing STDOUT";
1645